    {
     "cells": [
      {
       "cell_type": "code",
       "execution_count": 115,
       "metadata": {},
       "outputs": [
        {
         "name": "stdout",
         "output_type": "stream",
         "text": [
          " 10000 / 10000"
         ]
        }
       ],
       "source": [
        "import tweepy\n",
        "from datetime import datetime\n",
        "\n",
        "print_tweets = False\n",
        "n_tweets = 10000\n",
        "max_results = 100\n",
        "    \n",
        "\n",
        "auth = tweepy.OAuth1UserHandler(\n",
        "    consumer_key, consumer_secret, access_token, access_token_secret\n",
        ")\n",
        "\n",
        "api = tweepy.API(auth)\n",
        "client = tweepy.Client(bearer_token)\n",
        "    \n",
        "query = \"#metoo\"\n",
        "\n",
        "tweet_fields = ['attachments', 'author_id', 'text', 'in_reply_to_user_id', 'referenced_tweets', 'entities']\n",
        "next_token = None\n",
        "end_time = datetime(year=2022,month=5,day=28)\n",
        "\n",
        "all_tweets = []\n",
        "\n",
        "\n",
        "# This endpoint/method returns Tweets from the last seven days\n",
        "while len(all_tweets) < n_tweets:\n",
        "    response = client.search_recent_tweets(query, tweet_fields=tweet_fields,\n",
        "                                           max_results=max_results, next_token=next_token, end_time=end_time)\n",
        "    tweets = response.data\n",
        "    if tweets == None:\n",
        "        break\n",
        "\n",
        "    for tweet in tweets:\n",
        "        all_tweets.append(tweet)\n",
        "        \n",
        "        if print_tweets:\n",
        "            print(tweet.text)\n",
        "            print(\"-------------------\")\n",
        "        \n",
        "        if len(all_tweets) == n_tweets:\n",
        "            break\n",
        "\n",
        "    if print_tweets:\n",
        "        print(len(all_tweets), \"/\", n_tweets)\n",
        "    else:\n",
        "        print(\"\\r\",len(all_tweets), \"/\", n_tweets, end=\"\")\n",
        "\n",
        "    if 'next_token' in response.meta:\n",
        "        next_token = response.meta['next_token']\n",
        "    else:\n",
        "        break"
       ]
      },
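  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The loop above pages through results by hand via `next_token`. As a minimal alternative sketch (reusing `client`, `query`, `tweet_fields`, `max_results`, `end_time` and `n_tweets` from the cell above; `paginated_tweets` is just an illustrative name), `tweepy.Paginator` can drive the same endpoint and flatten the pages into a single iterator:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Alternative sketch: let tweepy.Paginator handle the next_token bookkeeping.\n",
    "# flatten() yields individual Tweet objects across pages, up to the given limit.\n",
    "paginated_tweets = list(\n",
    "    tweepy.Paginator(client.search_recent_tweets, query,\n",
    "                     tweet_fields=tweet_fields, max_results=max_results,\n",
    "                     end_time=end_time).flatten(limit=n_tweets)\n",
    ")\n",
    "print(len(paginated_tweets))"
   ]
  },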
      {
       "cell_type": "code",
       "execution_count": 116,
       "metadata": {},
       "outputs": [],
       "source": [
        "import json\n",
        "with open(\"tweets.txt\", \"w+\") as file:\n",
        "    tweet_dicts = []\n",
        "    for tweet in all_tweets:\n",
        "        dict_tweet = dict(tweet)\n",
        "        if tweet.referenced_tweets != None: \n",
        "            dict_tweet['referenced_tweets'] = [dict(r_tweet) for r_tweet in tweet.referenced_tweets]\n",
        "        tweet_dicts.append(dict_tweet)\n",
        "    file.write(json.dumps(tweet_dicts))"
       ]
      },
      {
       "cell_type": "code",
       "execution_count": 2,
       "metadata": {},
       "outputs": [],
       "source": [
        "import json\n",
        "with open(\"tweets.txt\", \"r\") as file:\n",
        "    all_tweets_json = json.load(file)"
       ]
      },
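  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A quick sanity check (a sketch; it assumes at least one tweet was collected) shows how many records survived the JSON round trip and which fields each record carries before the graph is built:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# How many tweets were reloaded, and which fields does one record carry?\n",
    "print(len(all_tweets_json))\n",
    "print(sorted(all_tweets_json[0].keys()))"
   ]
  },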
      {
       "cell_type": "code",
       "execution_count": 3,
       "metadata": {},
       "outputs": [
        {
         "name": "stdout",
         "output_type": "stream",
         "text": [
          "8077\n",
          "9429\n",
          "1.1673888820106475\n"
         ]
        }
       ],
       "source": [
        "import pickle\n",
        "from create_enron_network import tokenize_text, get_labels\n",
        "from edgelabelgraph import EdgeLabelGraph\n",
        "\n",
        "G = EdgeLabelGraph()\n",
        "\n",
        "for tweet in all_tweets_json:\n",
        "        \n",
        "    author = tweet['author_id']\n",
        "    \n",
        "    # Use hashtags as labels\n",
        "    if not ('entities' in tweet and 'hashtags' in tweet['entities']):\n",
        "        continue\n",
        "    labels = set([hashtag['tag'].lower() for hashtag in tweet['entities']['hashtags']])\n",
        "    \n",
        "    if 'mentions' in tweet['entities']:\n",
        "        referenced_users = set([int(user['id']) for user in tweet['entities']['mentions']])\n",
        "    \n",
        "    if 'in_reply_to_user_id' in tweet:\n",
        "        referenced_users.add(tweet['in_reply_to_user_id'])\n",
        "    \n",
        "    for user in referenced_users:\n",
        "        if user != author:\n",
        "            G.add_edge_with_labels((author, user), labels)\n",
        "        \n",
        "print(G.number_of_nodes())\n",
        "print(G.number_of_edges())\n",
        "print(G.density())\n",
        "\n",
        "with open(\"tweets_graph.pkl\", \"wb\") as file:\n",
        "    pickle.dump(G, file)"
       ]
  },
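  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As a minimal sketch of reusing the saved artifact (`G_loaded` is just an illustrative name), the pickled graph can be loaded back and checked against the statistics printed above with the same `EdgeLabelGraph` accessors:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle\n",
    "\n",
    "# Reload the pickled graph and confirm it matches the statistics printed above\n",
    "with open(\"tweets_graph.pkl\", \"rb\") as file:\n",
    "    G_loaded = pickle.load(file)\n",
    "\n",
    "print(G_loaded.number_of_nodes())\n",
    "print(G_loaded.number_of_edges())"
   ]
  }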
     ],
     "metadata": {
      "kernelspec": {
       "display_name": "Python 3",
       "language": "python",
       "name": "python3"
      },
      "language_info": {
       "codemirror_mode": {
        "name": "ipython",
        "version": 3
       },
       "file_extension": ".py",
       "mimetype": "text/x-python",
       "name": "python",
       "nbconvert_exporter": "python",
       "pygments_lexer": "ipython3",
       "version": "3.8.10"
      }
     },
     "nbformat": 4,
     "nbformat_minor": 4
    }