# Recovered from a git patch: commit 146e71c1a8b8140c07f18a27d67e2fc33138f146
# by Iiro Kumpulainen, Mon, 17 Oct 2022 18:22:55 +0300, subject "Upload New File",
# which adds create_enron_network.py as a new file.
"""Build an edge-labelled sender/receiver graph from Enron mail headers.

For every mail in each user's ``sent`` folder under ``../maildir/`` the script
reads the ``X-From``, ``X-To`` and ``Subject`` headers, turns the subject into
a set of stemmed word labels, and adds one labelled edge per
(sender, receiver) pair to an ``EdgeLabelGraph``.  The finished graph is
pickled to ``enron_graph_subject_only.pkl``.
"""

import os
import pickle

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

stop_words = set(stopwords.words('english'))
numbers = set('0123456789')

# Built once at import time instead of on every tokenize_text() call.
_STEMMER = PorterStemmer()
_TOKENIZER = nltk.RegexpTokenizer(r"\w+")


def tokenize_text(t):
    """Return the Porter stems of the informative words in *t*.

    The text is split into ``\\w+`` runs.  A token is dropped when its
    lower-cased form is an English stop word, when it is shorter than three
    characters, when ``str.isdigit`` is true for it, or when it contains any
    ASCII digit.  The surviving tokens are stemmed and returned in their
    original order (duplicates included).
    """
    return [
        _STEMMER.stem(w)
        for w in _TOKENIZER.tokenize(t)
        if w.lower() not in stop_words
        and len(w) >= 3
        and not w.isdigit()
        and numbers.isdisjoint(w)
    ]


def get_labels(s):
    """Return the set of distinct stems produced by ``tokenize_text(s)``."""
    return set(tokenize_text(s))


def _read_mail_header(lines):
    """Extract sender, receivers and subject labels from a mail's header.

    *lines* is any iterable of text lines (e.g. an open file).  Reading stops
    at the first ``X-FileName: `` line, which this script treats as the end
    of the header.

    Returns a tuple ``(sender, receivers, labels, complete)``:

    * ``sender`` -- value of the last ``X-From: `` line, or ``None`` if absent;
    * ``receivers`` -- value of the last ``X-To: `` line split on ``", "``,
      or an empty list if absent;
    * ``labels`` -- ``get_labels`` of the last ``Subject: `` line, or an empty
      set if absent;
    * ``complete`` -- ``True`` only if an ``X-FileName: `` line was seen.
    """
    sender = None
    receivers = []
    labels = set()
    complete = False
    for raw in lines:
        # Strip only the line terminator, so an unterminated final line does
        # not lose its last real character.
        line = raw.rstrip("\n")
        if line.startswith("X-From: "):
            sender = line[len("X-From: "):]
        elif line.startswith("X-To: "):
            # NOTE(review): an empty X-To yields [""] and therefore an edge to
            # an empty-named receiver, as in the original script.
            receivers = line[len("X-To: "):].split(", ")
        elif line.startswith("Subject: "):
            labels = get_labels(line[len("Subject: "):])
        elif line.startswith("X-FileName: "):
            complete = True
            break
    return sender, receivers, labels, complete


def _add_mail_edges(graph, mail_file_path):
    """Add the labelled (sender, receiver) edges of one mail file to *graph*.

    Mails whose header never reaches ``X-FileName: `` or that carry no
    ``X-From: `` line are skipped, so state from a previously processed mail
    can never leak into this one.
    """
    with open(mail_file_path, encoding="utf8", errors='ignore') as f:
        sender, receivers, labels, complete = _read_mail_header(f)
    if not complete or sender is None:
        return
    # The original re-issued the same add_edge_with_labels call once per body
    # line; the labels never change after the header, so each edge is added
    # exactly once per mail here.
    # NOTE(review): if EdgeLabelGraph counts repeated insertions, confirm that
    # per-mail (rather than per-body-line) counting is what is wanted.
    for receiver in receivers:
        if sender != receiver:
            graph.add_edge_with_labels((sender, receiver), labels)


def main():
    """Scan every user's sent folder, build the graph and pickle it."""
    # Project-local module; imported here so that importing this file for its
    # tokenizer helpers does not require it.
    from edgelabelgraph import EdgeLabelGraph

    data_dir = "../maildir/"
    mail_folder = "sent"

    G = EdgeLabelGraph()

    for folder in os.listdir(data_dir):
        mail_path = os.path.join(data_dir, folder, mail_folder)
        if not os.path.exists(mail_path):
            print(f"{mail_path} not found")
            continue
        print(mail_path)
        for mail in os.listdir(mail_path):
            _add_mail_edges(G, os.path.join(mail_path, mail))

    with open("enron_graph_subject_only.pkl", "wb") as file:
        pickle.dump(G, file)


if __name__ == "__main__":
    main()