Commit f4ed460e authored 2 years ago by Iiro Kumpulainen
Upload New File
parent ae4563c6
Showing 1 changed file with 169 additions and 0 deletions

Tweet_Collection.ipynb 0 → 100644 (+169 −0)
{
"cells": [
{
"cell_type": "code",
"execution_count": 115,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 10000 / 10000"
]
}
],
"source": [
"import tweepy\n",
"from datetime import datetime\n",
"\n",
"print_tweets = False\n",
"n_tweets = 10000\n",
"max_results = 100\n",
" \n",
"\n",
"auth = tweepy.OAuth1UserHandler(\n",
" consumer_key, consumer_secret, access_token, access_token_secret\n",
")\n",
"\n",
"api = tweepy.API(auth)\n",
"client = tweepy.Client(bearer_token)\n",
" \n",
"query = \"#metoo\"\n",
"\n",
"tweet_fields = ['attachments', 'author_id', 'text', 'in_reply_to_user_id', 'referenced_tweets', 'entities']\n",
"next_token = None\n",
"end_time = datetime(year=2022,month=5,day=28)\n",
"\n",
"all_tweets = []\n",
"\n",
"\n",
"# This endpoint/method returns Tweets from the last seven days\n",
"while len(all_tweets) < n_tweets:\n",
" response = client.search_recent_tweets(query, tweet_fields=tweet_fields,\n",
" max_results=max_results, next_token=next_token, end_time=end_time)\n",
" tweets = response.data\n",
" if tweets == None:\n",
" break\n",
"\n",
" for tweet in tweets:\n",
" all_tweets.append(tweet)\n",
" \n",
" if print_tweets:\n",
" print(tweet.text)\n",
" print(\"-------------------\")\n",
" \n",
" if len(all_tweets) == n_tweets:\n",
" break\n",
"\n",
" if print_tweets:\n",
" print(len(all_tweets), \"/\", n_tweets)\n",
" else:\n",
" print(\"\\r\",len(all_tweets), \"/\", n_tweets, end=\"\")\n",
"\n",
" if 'next_token' in response.meta:\n",
" next_token = response.meta['next_token']\n",
" else:\n",
" break"
]
},
{
"cell_type": "code",
"execution_count": 116,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"with open(\"tweets.txt\", \"w+\") as file:\n",
" tweet_dicts = []\n",
" for tweet in all_tweets:\n",
" dict_tweet = dict(tweet)\n",
" if tweet.referenced_tweets != None: \n",
" dict_tweet['referenced_tweets'] = [dict(r_tweet) for r_tweet in tweet.referenced_tweets]\n",
" tweet_dicts.append(dict_tweet)\n",
" file.write(json.dumps(tweet_dicts))"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"with open(\"tweets.txt\", \"r\") as file:\n",
" all_tweets_json = json.load(file)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"8077\n",
"9429\n",
"1.1673888820106475\n"
]
}
],
"source": [
"import pickle\n",
"from create_enron_network import tokenize_text, get_labels\n",
"from edgelabelgraph import EdgeLabelGraph\n",
"\n",
"G = EdgeLabelGraph()\n",
"\n",
"for tweet in all_tweets_json:\n",
" \n",
" author = tweet['author_id']\n",
" \n",
" # Use hashtags as labels\n",
" if not ('entities' in tweet and 'hashtags' in tweet['entities']):\n",
" continue\n",
" labels = set([hashtag['tag'].lower() for hashtag in tweet['entities']['hashtags']])\n",
" \n",
" if 'mentions' in tweet['entities']:\n",
" referenced_users = set([int(user['id']) for user in tweet['entities']['mentions']])\n",
" \n",
" if 'in_reply_to_user_id' in tweet:\n",
" referenced_users.add(tweet['in_reply_to_user_id'])\n",
" \n",
" for user in referenced_users:\n",
" if user != author:\n",
" G.add_edge_with_labels((author, user), labels)\n",
" \n",
"print(G.number_of_nodes())\n",
"print(G.number_of_edges())\n",
"print(G.density())\n",
"\n",
"with open(\"tweets_graph.pkl\", \"wb\") as file:\n",
" pickle.dump(G, file)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
%% Cell type:code id: tags:

```python
import tweepy
from datetime import datetime

print_tweets = False
n_tweets = 10000
max_results = 100


auth = tweepy.OAuth1UserHandler(
    consumer_key, consumer_secret, access_token, access_token_secret
)

api = tweepy.API(auth)
client = tweepy.Client(bearer_token)

query = "#metoo"

tweet_fields = ['attachments', 'author_id', 'text', 'in_reply_to_user_id', 'referenced_tweets', 'entities']
next_token = None
end_time = datetime(year=2022, month=5, day=28)

all_tweets = []


# This endpoint/method returns Tweets from the last seven days
while len(all_tweets) < n_tweets:
    response = client.search_recent_tweets(query, tweet_fields=tweet_fields,
                                           max_results=max_results, next_token=next_token, end_time=end_time)
    tweets = response.data
    if tweets == None:
        break

    for tweet in tweets:
        all_tweets.append(tweet)

        if print_tweets:
            print(tweet.text)
            print("-------------------")

        if len(all_tweets) == n_tweets:
            break

    if print_tweets:
        print(len(all_tweets), "/", n_tweets)
    else:
        print("\r", len(all_tweets), "/", n_tweets, end="")

    if 'next_token' in response.meta:
        next_token = response.meta['next_token']
    else:
        break
```
%% Output
10000 / 10000
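
The cell above pages through the recent-search endpoint by hand via `response.meta['next_token']`; the credential variables (`consumer_key`, `consumer_secret`, `access_token`, `access_token_secret`, `bearer_token`) are assumed to be defined elsewhere, and the `api` object built from the OAuth1 handler is never actually used. For comparison, tweepy v4 also ships a `tweepy.Paginator` helper that does the `next_token` bookkeeping itself; a minimal sketch of the same collection, assuming the same `client`, `query`, `tweet_fields`, and `end_time` as above:

```python
# Sketch: the same collection loop using tweepy.Paginator, which follows
# next_token internally. Assumes client, query, tweet_fields, and end_time
# are defined as in the cell above.
import tweepy

all_tweets = []
for tweet in tweepy.Paginator(
    client.search_recent_tweets, query,
    tweet_fields=tweet_fields, max_results=100, end_time=end_time,
).flatten(limit=10000):
    all_tweets.append(tweet)
```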
%% Cell type:code id: tags:

```python
import json
with open("tweets.txt", "w+") as file:
    tweet_dicts = []
    for tweet in all_tweets:
        dict_tweet = dict(tweet)
        if tweet.referenced_tweets != None:
            dict_tweet['referenced_tweets'] = [dict(r_tweet) for r_tweet in tweet.referenced_tweets]
        tweet_dicts.append(dict_tweet)
    file.write(json.dumps(tweet_dicts))
```
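
As an aside, in tweepy v4 each `Tweet` keeps its original JSON payload in `tweet.data`, where `referenced_tweets` is already a plain list of dicts, so the manual conversion above could plausibly be skipped; a sketch:

```python
# Sketch: dump the raw API payloads directly. tweet.data is the original
# JSON dict that tweepy parsed, so no per-field conversion is needed.
# Assumes all_tweets from the collection cell above.
import json

with open("tweets.txt", "w") as file:
    json.dump([tweet.data for tweet in all_tweets], file)
```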
%% Cell type:code id: tags:

```python
import json
with open("tweets.txt", "r") as file:
    all_tweets_json = json.load(file)
```
%% Cell type:code id: tags:

```python
import pickle
from create_enron_network import tokenize_text, get_labels
from edgelabelgraph import EdgeLabelGraph

G = EdgeLabelGraph()

for tweet in all_tweets_json:

    author = tweet['author_id']

    # Use hashtags as labels
    if not ('entities' in tweet and 'hashtags' in tweet['entities']):
        continue
    labels = set([hashtag['tag'].lower() for hashtag in tweet['entities']['hashtags']])

    if 'mentions' in tweet['entities']:
        referenced_users = set([int(user['id']) for user in tweet['entities']['mentions']])

    if 'in_reply_to_user_id' in tweet:
        referenced_users.add(tweet['in_reply_to_user_id'])

    for user in referenced_users:
        if user != author:
            G.add_edge_with_labels((author, user), labels)

print(G.number_of_nodes())
print(G.number_of_edges())
print(G.density())

with open("tweets_graph.pkl", "wb") as file:
    pickle.dump(G, file)
```
%% Output
8077
9429
1.1673888820106475
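
Two remarks on the graph cell. First, the printed density of about 1.167 equals 9429 / 8077, so `EdgeLabelGraph.density()` appears to report edges per node rather than the usual |E| / (|V| choose 2). Second, `referenced_users` is only assigned when a tweet has mentions, so a reply with no mentions either raises a `NameError` (if no earlier tweet had mentions) or silently reuses the previous tweet's mention set. A hardened sketch of the per-tweet edge extraction, using the same `EdgeLabelGraph` API as above:

```python
# Sketch: start from a fresh referenced_users set for every tweet so a
# reply without mentions cannot inherit a stale set from an earlier
# iteration. G, all_tweets_json, and the labels logic are as above.
for tweet in all_tweets_json:
    author = tweet['author_id']

    # Use hashtags as labels; skip tweets that have none
    if not ('entities' in tweet and 'hashtags' in tweet['entities']):
        continue
    labels = {hashtag['tag'].lower() for hashtag in tweet['entities']['hashtags']}

    referenced_users = set()  # fresh set per tweet
    if 'mentions' in tweet['entities']:
        referenced_users.update(int(user['id']) for user in tweet['entities']['mentions'])
    if 'in_reply_to_user_id' in tweet:
        referenced_users.add(tweet['in_reply_to_user_id'])

    for user in referenced_users:
        if user != author:
            G.add_edge_with_labels((author, user), labels)
```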