if __name__ == "__main__": from create_enron_network import tokenize_text, get_labels from edgelabelgraph import EdgeLabelGraph from collections import Counter import os data_dir = "../Physics Theory Citation Network/" G = EdgeLabelGraph() id_to_authors = dict() author_names = dict() with open(os.path.join(data_dir, "AuthorNodes.csv"), encoding="utf8", errors='ignore') as f: for line in f.readlines(): id, authors = line.split(",") authors = authors[:-1] if authors.endswith(" and"): authors = authors[:-4] if " & " in authors: authors = authors.split(" & ") else: authors = authors.split(" and ") for author in authors: author_name = author name_parts = [] for part in author.split(" "): for part2 in part.split("."): if part2: # Split two-part first names that cointain a hyphen but not surnames since surnames are never shortened if not name_parts: for part3 in part2.split("-"): if part3: name_parts.append(part2) else: name_parts.append(part2) surname = name_parts[-1] firstname_parts = name_parts[:-1] if not surname in author_names: author_names[surname] = [] author_names[surname].append((firstname_parts, author)) else: match_index = 0 for namesake_firstname_parts, _ in author_names[surname]: for i_part in range(min(len(firstname_parts), len(namesake_firstname_parts))): if not namesake_firstname_parts[i_part].startswith(firstname_parts[i_part]) \ and not firstname_parts[i_part].startswith(namesake_firstname_parts[i_part]): break else: break match_index += 1 if match_index >= len(author_names[surname]): author_names[surname].append((firstname_parts, author)) else: author_name = author_names[surname][match_index][1] if not id in id_to_authors: id_to_authors[id] = set() id_to_authors[id].add(author_name) edges_to_add = [] papers_with_label = Counter() author_edges = Counter() n_papers = 0 with open(os.path.join(data_dir, "ArticleNodes.csv"), encoding="utf8", errors='ignore') as f: for line in f.readlines(): id, title, year, journal, abstract = line.split(",") if not id in id_to_authors: print(f"No authors for {id}") continue authors = tuple(id_to_authors[id]) if len(authors) <= 1: # Ignore papers with only one author continue n_papers += 1 title = title.strip() labels = get_labels(title) for label in labels: papers_with_label[label] += 1 for i_author in range(len(authors)-1): for j_author in range(i_author+1, len(authors)): edges_to_add.append(((authors[i_author],authors[j_author]), labels.copy())) author_edges[(authors[i_author],authors[j_author])] += 1 min_papers_with_label = int(0.005*n_papers) min_shared_papers_for_edge = 2 for edge, labels in edges_to_add: for label in tuple(labels): if papers_with_label[label] < min_papers_with_label: labels.remove(label) if len(labels) > 0 and author_edges[edge] >= min_shared_papers_for_edge: G.add_edge_with_labels(edge, labels) import pickle with open(f"phys_graph_title_min_{min_papers_with_label}_shared_{min_shared_papers_for_edge}.pkl", "wb") as file: pickle.dump(G, file)