Skip to content
Snippets Groups Projects
Commit 5821a06c authored by Iiro Kumpulainen
Browse files

Upload New File

parent 146e71c1
No related branches found
No related tags found
No related merge requests found
def split_name_parts(author):
    """Split one author name into its name parts.

    The name is split on spaces and periods, and empty fragments are dropped.
    The leading fragment is additionally split on hyphens so that two-part
    first names (e.g. "Jean-Pierre") become separate parts; later fragments
    are kept intact because surnames are never shortened.

    Returns a list of parts whose last element is treated as the surname by
    the caller; the list is empty when the name has no usable characters.
    """
    name_parts = []
    for space_piece in author.split(" "):
        for piece in space_piece.split("."):
            if not piece:
                continue
            if not name_parts:
                # Still at the start of the name: split a hyphenated first name.
                name_parts.extend(sub for sub in piece.split("-") if sub)
            else:
                name_parts.append(piece)
    return name_parts


def first_names_compatible(first_names, other_first_names):
    """Return True when two first-name lists may denote the same person.

    Parts are compared position by position over the shorter list; every pair
    must have one part being a prefix of the other (so "J" matches "John").
    Two lists with no overlapping positions are considered compatible.
    """
    return all(
        mine.startswith(theirs) or theirs.startswith(mine)
        for mine, theirs in zip(first_names, other_first_names)
    )


def canonical_author_name(author, author_names):
    """Map an author string to the first compatible spelling seen so far.

    ``author_names`` maps a surname to a list of
    ``(first_name_parts, original_spelling)`` tuples and is updated in place
    when ``author`` matches none of the recorded namesakes.

    Returns the canonical spelling, or None when ``author`` contains no name
    parts at all (such a token cannot be assigned a surname).
    """
    name_parts = split_name_parts(author)
    if not name_parts:
        return None
    surname = name_parts[-1]
    firstname_parts = name_parts[:-1]
    namesakes = author_names.setdefault(surname, [])
    for known_firstname_parts, known_spelling in namesakes:
        if first_names_compatible(firstname_parts, known_firstname_parts):
            return known_spelling
    namesakes.append((firstname_parts, author))
    return author


def split_author_field(authors):
    """Split the author column of one row into individual author strings.

    A dangling trailing " and" is removed first. The separator is " & " when
    it occurs anywhere in the field, otherwise " and ".
    """
    if authors.endswith(" and"):
        authors = authors[:-4]
    if " & " in authors:
        return authors.split(" & ")
    return authors.split(" and ")


if __name__ == "__main__":
    from create_enron_network import tokenize_text, get_labels
    from edgelabelgraph import EdgeLabelGraph
    from collections import Counter
    from itertools import combinations
    import os
    import pickle

    data_dir = "../Physics Theory Citation Network/"
    G = EdgeLabelGraph()
    id_to_authors = dict()
    author_names = dict()

    # Pass 1: collect the set of canonical author names for every paper id.
    # NOTE(review): rows are split on every comma and must have exactly two
    # fields; a quoted field containing a comma raises ValueError here.
    with open(os.path.join(data_dir, "AuthorNodes.csv"), encoding="utf8", errors='ignore') as f:
        for line in f:
            paper_id, authors = line.split(",")
            # Drop only the line terminator, so a final line without a
            # trailing newline does not lose its last character.
            authors = authors.rstrip("\n")
            for author in split_author_field(authors):
                author_name = canonical_author_name(author, author_names)
                if author_name is None:
                    # Empty / punctuation-only token: nothing to record.
                    continue
                id_to_authors.setdefault(paper_id, set()).add(author_name)

    edges_to_add = []
    papers_with_label = Counter()
    author_edges = Counter()
    n_papers = 0

    # Pass 2: for every multi-author paper, derive labels from its title and
    # record one candidate edge per pair of co-authors.
    with open(os.path.join(data_dir, "ArticleNodes.csv"), encoding="utf8", errors='ignore') as f:
        for line in f:
            # Rows must have exactly five comma-separated fields.
            paper_id, title, year, journal, abstract = line.split(",")
            if paper_id not in id_to_authors:
                print(f"No authors for {paper_id}")
                continue
            # Sort so a given pair of authors always yields the same edge key;
            # set iteration order would otherwise count (A, B) and (B, A)
            # separately and undercount shared papers.
            authors = tuple(sorted(id_to_authors[paper_id]))
            if len(authors) <= 1:  # Ignore papers with only one author
                continue
            n_papers += 1
            labels = get_labels(title.strip())
            for label in labels:
                papers_with_label[label] += 1
            for edge in combinations(authors, 2):
                # Each edge gets its own copy because labels are pruned per
                # edge below.
                edges_to_add.append((edge, labels.copy()))
                author_edges[edge] += 1

    # Keep labels that occur on at least 0.5% of the counted papers and edges
    # between authors who share at least two counted papers.
    min_papers_with_label = int(0.005*n_papers)
    min_shared_papers_for_edge = 2
    for edge, labels in edges_to_add:
        for label in tuple(labels):
            if papers_with_label[label] < min_papers_with_label:
                labels.remove(label)
        if len(labels) > 0 and author_edges[edge] >= min_shared_papers_for_edge:
            G.add_edge_with_labels(edge, labels)

    with open(f"phys_graph_title_min_{min_papers_with_label}_shared_{min_shared_papers_for_edge}.pkl", "wb") as file:
        pickle.dump(G, file)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment