From f4ed460ede1ca0916f84cb6536da02d2401bee6c Mon Sep 17 00:00:00 2001
From: Iiro Kumpulainen <iiro.kumpulainen@helsinki.fi>
Date: Mon, 17 Oct 2022 18:23:31 +0300
Subject: [PATCH] Upload New File

---
 Tweet_Collection.ipynb | 192 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 192 insertions(+)
 create mode 100644 Tweet_Collection.ipynb

diff --git a/Tweet_Collection.ipynb b/Tweet_Collection.ipynb
new file mode 100644
index 0000000..7764de4
--- /dev/null
+++ b/Tweet_Collection.ipynb
@@ -0,0 +1,192 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 115,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      " 10000 / 10000"
+     ]
+    }
+   ],
+   "source": [
+    "import tweepy\n",
+    "from datetime import datetime\n",
+    "\n",
+    "print_tweets = False\n",
+    "n_tweets = 10000\n",
+    "max_results = 100\n",
+    "\n",
+    "# consumer_key, consumer_secret, access_token, access_token_secret and\n",
+    "# bearer_token are assumed to be defined elsewhere; the credentials are\n",
+    "# kept out of the notebook.\n",
+    "auth = tweepy.OAuth1UserHandler(\n",
+    "    consumer_key, consumer_secret, access_token, access_token_secret\n",
+    ")\n",
+    "\n",
+    "api = tweepy.API(auth)\n",
+    "client = tweepy.Client(bearer_token)\n",
+    "\n",
+    "query = \"#metoo\"\n",
+    "\n",
+    "tweet_fields = ['attachments', 'author_id', 'text', 'in_reply_to_user_id', 'referenced_tweets', 'entities']\n",
+    "next_token = None\n",
+    "end_time = datetime(year=2022, month=5, day=28)\n",
+    "\n",
+    "all_tweets = []\n",
+    "\n",
+    "# The recent-search endpoint only returns Tweets from the last seven days.\n",
+    "while len(all_tweets) < n_tweets:\n",
+    "    response = client.search_recent_tweets(query, tweet_fields=tweet_fields,\n",
+    "                                           max_results=max_results, next_token=next_token, end_time=end_time)\n",
+    "    tweets = response.data\n",
+    "    if tweets is None:\n",
+    "        break\n",
+    "\n",
+    "    for tweet in tweets:\n",
+    "        all_tweets.append(tweet)\n",
+    "\n",
+    "        if print_tweets:\n",
+    "            print(tweet.text)\n",
+    "            print(\"-------------------\")\n",
+    "\n",
+    "        if len(all_tweets) == n_tweets:\n",
+    "            break\n",
+    "\n",
+    "    if print_tweets:\n",
+    "        print(len(all_tweets), \"/\", n_tweets)\n",
+    "    else:\n",
+    "        print(\"\\r\", len(all_tweets), \"/\", n_tweets, end=\"\")\n",
+    "\n",
+    "    if 'next_token' in response.meta:\n",
+    "        next_token = response.meta['next_token']\n",
+    "    else:\n",
+    "        break"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 116,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "\n",
+    "# Serialize the collected tweets to JSON; nested referenced_tweets are\n",
+    "# converted to plain dicts first.\n",
+    "with open(\"tweets.txt\", \"w+\") as file:\n",
+    "    tweet_dicts = []\n",
+    "    for tweet in all_tweets:\n",
+    "        dict_tweet = dict(tweet)\n",
+    "        if tweet.referenced_tweets is not None:\n",
+    "            dict_tweet['referenced_tweets'] = [dict(r_tweet) for r_tweet in tweet.referenced_tweets]\n",
+    "        tweet_dicts.append(dict_tweet)\n",
+    "    file.write(json.dumps(tweet_dicts))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "\n",
+    "with open(\"tweets.txt\", \"r\") as file:\n",
+    "    all_tweets_json = json.load(file)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "8077\n",
+      "9429\n",
+      "1.1673888820106475\n"
+     ]
+    }
+   ],
+   "source": [
+    "import pickle\n",
+    "from create_enron_network import tokenize_text, get_labels\n",
+    "from edgelabelgraph import EdgeLabelGraph\n",
+    "\n",
+    "G = EdgeLabelGraph()\n",
+    "\n",
+    "for tweet in all_tweets_json:\n",
+    "\n",
+    "    author = tweet['author_id']\n",
+    "\n",
+    "    # Use hashtags as labels; skip tweets without hashtags.\n",
+    "    if not ('entities' in tweet and 'hashtags' in tweet['entities']):\n",
+    "        continue\n",
+    "    labels = set([hashtag['tag'].lower() for hashtag in tweet['entities']['hashtags']])\n",
+    "\n",
+    "    if 'mentions' in tweet['entities']:\n",
+    "        referenced_users = set([int(user['id']) for user in tweet['entities']['mentions']])\n",
+    "\n",
+    "        if 'in_reply_to_user_id' in tweet:\n",
+    "            referenced_users.add(tweet['in_reply_to_user_id'])\n",
+    "\n",
+    "        for user in referenced_users:\n",
+    "            if user != author:\n",
+    "                G.add_edge_with_labels((author, user), labels)\n",
+    "\n",
+    "print(G.number_of_nodes())\n",
+    "print(G.number_of_edges())\n",
+    "print(G.density())\n",
+    "\n",
+    "with open(\"tweets_graph.pkl\", \"wb\") as file:\n",
+    "    pickle.dump(G, file)"
+   ]
+  },
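+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Optional sketch: reload the pickled graph in a later session. This\n",
+    "# assumes the edgelabelgraph module is importable from the working\n",
+    "# directory, just as it was when the graph was pickled above.\n",
+    "import pickle\n",
+    "from edgelabelgraph import EdgeLabelGraph\n",
+    "\n",
+    "with open(\"tweets_graph.pkl\", \"rb\") as file:\n",
+    "    G = pickle.load(file)\n",
+    "\n",
+    "print(G.number_of_nodes(), \"nodes,\", G.number_of_edges(), \"edges\")"
+   ]
+  }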
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
-- 
GitLab