Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
from itertools import combinations


def split_name_parts(author):
    """Split an author string into its name parts.

    The string is split on spaces and periods and empty pieces are dropped.
    The leading piece is additionally split on hyphens so that a two-part
    first name such as "Jean-Pierre" becomes "Jean", "Pierre". Later pieces
    (including the surname, which is the last part) are kept intact.

    Returns a list of parts; the last element is treated as the surname by
    the caller. The list is empty when the string holds no name at all.
    """
    name_parts = []
    for word in author.split(" "):
        for piece in word.split("."):
            if not piece:
                continue
            if name_parts:
                name_parts.append(piece)
            else:
                # Only the leading piece is hyphen-split; surnames are never
                # shortened, so they must stay whole.
                # NOTE(review): an abbreviated "J.-P." still yields "J", "-P"
                # because the second piece is not the leading one - confirm
                # whether that is intended.
                name_parts.extend(sub for sub in piece.split("-") if sub)
    return name_parts


def first_names_compatible(parts_a, parts_b):
    """Return True when two first-name part lists may denote the same person.

    Parts are compared position by position over the shorter list; a pair is
    compatible when one part is a prefix of the other (e.g. "J" and "John").
    An empty list is compatible with anything.
    """
    return all(a.startswith(b) or b.startswith(a) for a, b in zip(parts_a, parts_b))


def canonical_author_name(author, author_names):
    """Return the canonical spelling of ``author``, registering new names.

    ``author_names`` maps a surname to a list of ``(firstname_parts,
    spelling)`` tuples and is updated in place. The first registered namesake
    whose first-name parts are compatible with ``author`` provides the
    canonical spelling; if there is none, ``author`` itself is registered and
    returned. Returns ``None`` when ``author`` contains no name parts.
    """
    name_parts = split_name_parts(author)
    if not name_parts:
        return None
    *firstname_parts, surname = name_parts
    namesakes = author_names.setdefault(surname, [])
    for known_parts, known_spelling in namesakes:
        if first_names_compatible(firstname_parts, known_parts):
            return known_spelling
    namesakes.append((firstname_parts, author))
    return author


def split_authors(authors):
    """Split the author field of one CSV line into individual author strings.

    A trailing newline and a dangling trailing " and" are removed first.
    Authors are separated by " & " when that occurs in the field, otherwise
    by " and ".
    """
    # rstrip instead of [:-1]: the last line of a file may lack a newline.
    authors = authors.rstrip("\n")
    if authors.endswith(" and"):
        authors = authors[:-4]
    separator = " & " if " & " in authors else " and "
    return authors.split(separator)


def author_pairs(authors):
    """Return every unordered pair of ``authors`` as a sorted tuple.

    Sorting gives each pair one canonical orientation, so the same two
    authors always map to the same key regardless of the iteration order of
    the collection they came from.
    """
    return list(combinations(sorted(authors), 2))


if __name__ == "__main__":
    from create_enron_network import tokenize_text, get_labels
    from edgelabelgraph import EdgeLabelGraph
    from collections import Counter
    import os
    import pickle

    data_dir = "../Physics Theory Citation Network/"
    G = EdgeLabelGraph()

    # Pass 1: map every paper id to the set of canonical author names.
    id_to_authors = dict()
    author_names = dict()
    with open(os.path.join(data_dir, "AuthorNodes.csv"), encoding="utf8", errors='ignore') as f:
        for line in f:
            # Raises ValueError unless the line has exactly one comma.
            paper_id, authors = line.split(",")
            for author in split_authors(authors):
                author_name = canonical_author_name(author, author_names)
                if author_name is None:
                    # Empty author entry: nothing to record.
                    continue
                id_to_authors.setdefault(paper_id, set()).add(author_name)

    # Pass 2: collect one candidate edge per co-author pair per paper.
    edges_to_add = []
    papers_with_label = Counter()
    author_edges = Counter()
    n_papers = 0
    with open(os.path.join(data_dir, "ArticleNodes.csv"), encoding="utf8", errors='ignore') as f:
        for line in f:
            # Raises ValueError unless the line has exactly five fields;
            # only the id and the title are used below.
            paper_id, title, year, journal, abstract = line.split(",")
            if paper_id not in id_to_authors:
                print(f"No authors for {paper_id}")
                continue
            authors = id_to_authors[paper_id]
            if len(authors) <= 1:  # Ignore papers with only one author
                continue
            n_papers += 1
            labels = get_labels(title.strip())
            for label in labels:
                papers_with_label[label] += 1
            for pair in author_pairs(authors):
                # Each edge gets its own copy because labels are pruned
                # per edge further down.
                edges_to_add.append((pair, labels.copy()))
                author_edges[pair] += 1

    # Drop labels seen on fewer than 0.5% of the counted papers, then keep
    # edges that still carry a label and whose authors share enough papers.
    min_papers_with_label = int(0.005*n_papers)
    min_shared_papers_for_edge = 2
    for edge, labels in edges_to_add:
        for label in tuple(labels):
            if papers_with_label[label] < min_papers_with_label:
                labels.remove(label)
        if len(labels) > 0 and author_edges[edge] >= min_shared_papers_for_edge:
            G.add_edge_with_labels(edge, labels)

    with open(f"phys_graph_title_min_{min_papers_with_label}_shared_{min_shared_papers_for_edge}.pkl", "wb") as file:
        pickle.dump(G, file)