"""Build a co-authorship graph from the Physics Theory Citation Network data.

Reads ``AuthorNodes.csv`` and ``ArticleNodes.csv`` from ``data_dir``, merges
different spellings of the same author, connects every pair of co-authors
that share enough papers, labels each edge with the labels derived from the
paper titles, and pickles the resulting graph.
"""
import os
import pickle
from collections import Counter
from itertools import combinations


def split_author_list(authors):
    """Split the raw author field of one CSV row into individual authors.

    A trailing newline and a dangling trailing " and" are removed first.
    The field is split on " & " when that separator occurs, otherwise on
    " and ".
    """
    # rstrip instead of slicing off the last character: the final line of
    # the file may not end with a newline.
    authors = authors.rstrip("\n")
    if authors.endswith(" and"):
        authors = authors[:-4]
    separator = " & " if " & " in authors else " and "
    return authors.split(separator)


def split_name_parts(author):
    """Return the components of an author name; the last one is the surname.

    The name is tokenised on spaces and periods. Every token except the
    last is additionally split on hyphens so that "Jean-Pierre" and "J.-P."
    both yield two first-name parts. The last token (the surname) is kept
    intact because surnames are never abbreviated. An empty or
    separator-only name yields an empty list.
    """
    tokens = [
        piece
        for word in author.split(" ")
        for piece in word.split(".")
        if piece
    ]
    if not tokens:
        return []
    *given_tokens, surname = tokens
    name_parts = [
        piece
        for token in given_tokens
        for piece in token.split("-")
        if piece
    ]
    name_parts.append(surname)
    return name_parts


def resolve_author_name(author, author_names):
    """Return the canonical spelling of ``author``, registering it if new.

    ``author_names`` maps a surname to a list of
    ``(firstname_parts, canonical_name)`` tuples and is updated in place.
    Two names with the same surname are considered the same person when,
    position by position, one first-name part is a prefix of the other
    (so "J. Smith" matches "John Smith"). Only the positions present in
    both names are compared, so a name without first-name parts matches
    the first registered namesake. Returns ``None`` for a name that has
    no parts at all.
    """
    author = author.strip()
    name_parts = split_name_parts(author)
    if not name_parts:
        return None
    *firstname_parts, surname = name_parts
    namesakes = author_names.setdefault(surname, [])
    for known_parts, known_name in namesakes:
        if all(
            known.startswith(given) or given.startswith(known)
            for given, known in zip(firstname_parts, known_parts)
        ):
            return known_name
    namesakes.append((firstname_parts, author))
    return author


def main():
    """Read the CSV files, build the edge-labelled graph and pickle it."""
    # Project-local imports stay inside the entry point, as in the original
    # script, so importing this module does not require them.
    from create_enron_network import get_labels
    from edgelabelgraph import EdgeLabelGraph

    data_dir = "../Physics Theory Citation Network/"

    graph = EdgeLabelGraph()

    # paper id -> set of canonical author names
    id_to_authors = {}
    # surname -> [(firstname_parts, canonical_name), ...]
    author_names = {}
    author_path = os.path.join(data_dir, "AuthorNodes.csv")
    with open(author_path, encoding="utf8", errors="ignore") as f:
        for line in f:
            if not line.strip():
                continue
            paper_id, authors = line.split(",")
            for author in split_author_list(authors):
                author_name = resolve_author_name(author, author_names)
                if author_name is None:
                    # Empty fragment, e.g. produced by a doubled separator.
                    continue
                id_to_authors.setdefault(paper_id, set()).add(author_name)

    edges_to_add = []
    papers_with_label = Counter()
    author_edges = Counter()
    n_papers = 0
    article_path = os.path.join(data_dir, "ArticleNodes.csv")
    with open(article_path, encoding="utf8", errors="ignore") as f:
        for line in f:
            if not line.strip():
                continue
            paper_id, title, _year, _journal, _abstract = line.split(",")
            if paper_id not in id_to_authors:
                print(f"No authors for {paper_id}")
                continue
            # Sort so that a given pair of co-authors always produces the
            # same (a, b) key; set iteration order is not stable enough to
            # count shared papers reliably.
            authors = sorted(id_to_authors[paper_id])
            if len(authors) <= 1:  # Ignore papers with only one author
                continue
            n_papers += 1
            # NOTE(review): get_labels is assumed to return a mutable
            # collection supporting .copy() and .remove() - confirm.
            labels = get_labels(title.strip())

            for label in labels:
                papers_with_label[label] += 1

            for pair in combinations(authors, 2):
                edges_to_add.append((pair, labels.copy()))
                author_edges[pair] += 1

    # A label must occur on at least 0.5% of the multi-author papers.
    min_papers_with_label = int(0.005 * n_papers)
    min_shared_papers_for_edge = 2

    for edge, labels in edges_to_add:
        if author_edges[edge] < min_shared_papers_for_edge:
            continue
        # Iterate over a snapshot because labels is modified in the loop.
        for label in tuple(labels):
            if papers_with_label[label] < min_papers_with_label:
                labels.remove(label)
        if len(labels) > 0:
            graph.add_edge_with_labels(edge, labels)

    with open(f"phys_graph_title_min_{min_papers_with_label}_shared_{min_shared_papers_for_edge}.pkl", "wb") as file:
        pickle.dump(graph, file)


if __name__ == "__main__":
    main()