Commit f4ed460e authored 2 years ago by Iiro Kumpulainen
Upload New File
parent ae4563c6
Showing 1 changed file with 169 additions and 0 deletions

Tweet_Collection.ipynb 0 → 100644 (+169 −0)
{
"cells": [
{
"cell_type": "code",
"execution_count": 115,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 10000 / 10000"
]
}
],
"source": [
"import tweepy\n",
"from datetime import datetime\n",
"\n",
"print_tweets = False\n",
"n_tweets = 10000\n",
"max_results = 100\n",
" \n",
"\n",
"auth = tweepy.OAuth1UserHandler(\n",
" consumer_key, consumer_secret, access_token, access_token_secret\n",
")\n",
"\n",
"api = tweepy.API(auth)\n",
"client = tweepy.Client(bearer_token)\n",
" \n",
"query = \"#metoo\"\n",
"\n",
"tweet_fields = ['attachments', 'author_id', 'text', 'in_reply_to_user_id', 'referenced_tweets', 'entities']\n",
"next_token = None\n",
"end_time = datetime(year=2022,month=5,day=28)\n",
"\n",
"all_tweets = []\n",
"\n",
"\n",
"# This endpoint/method returns Tweets from the last seven days\n",
"while len(all_tweets) < n_tweets:\n",
" response = client.search_recent_tweets(query, tweet_fields=tweet_fields,\n",
" max_results=max_results, next_token=next_token, end_time=end_time)\n",
" tweets = response.data\n",
" if tweets == None:\n",
" break\n",
"\n",
" for tweet in tweets:\n",
" all_tweets.append(tweet)\n",
" \n",
" if print_tweets:\n",
" print(tweet.text)\n",
" print(\"-------------------\")\n",
" \n",
" if len(all_tweets) == n_tweets:\n",
" break\n",
"\n",
" if print_tweets:\n",
" print(len(all_tweets), \"/\", n_tweets)\n",
" else:\n",
" print(\"\\r\",len(all_tweets), \"/\", n_tweets, end=\"\")\n",
"\n",
" if 'next_token' in response.meta:\n",
" next_token = response.meta['next_token']\n",
" else:\n",
" break"
]
},
{
"cell_type": "code",
"execution_count": 116,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"with open(\"tweets.txt\", \"w+\") as file:\n",
" tweet_dicts = []\n",
" for tweet in all_tweets:\n",
" dict_tweet = dict(tweet)\n",
" if tweet.referenced_tweets != None: \n",
" dict_tweet['referenced_tweets'] = [dict(r_tweet) for r_tweet in tweet.referenced_tweets]\n",
" tweet_dicts.append(dict_tweet)\n",
" file.write(json.dumps(tweet_dicts))"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"with open(\"tweets.txt\", \"r\") as file:\n",
" all_tweets_json = json.load(file)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"8077\n",
"9429\n",
"1.1673888820106475\n"
]
}
],
"source": [
"import pickle\n",
"from create_enron_network import tokenize_text, get_labels\n",
"from edgelabelgraph import EdgeLabelGraph\n",
"\n",
"G = EdgeLabelGraph()\n",
"\n",
"for tweet in all_tweets_json:\n",
" \n",
" author = tweet['author_id']\n",
" \n",
" # Use hashtags as labels\n",
" if not ('entities' in tweet and 'hashtags' in tweet['entities']):\n",
" continue\n",
" labels = set([hashtag['tag'].lower() for hashtag in tweet['entities']['hashtags']])\n",
" \n",
" if 'mentions' in tweet['entities']:\n",
" referenced_users = set([int(user['id']) for user in tweet['entities']['mentions']])\n",
" \n",
" if 'in_reply_to_user_id' in tweet:\n",
" referenced_users.add(tweet['in_reply_to_user_id'])\n",
" \n",
" for user in referenced_users:\n",
" if user != author:\n",
" G.add_edge_with_labels((author, user), labels)\n",
" \n",
"print(G.number_of_nodes())\n",
"print(G.number_of_edges())\n",
"print(G.density())\n",
"\n",
"with open(\"tweets_graph.pkl\", \"wb\") as file:\n",
" pickle.dump(G, file)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
%% Cell type:code id: tags:

```python
import tweepy
from datetime import datetime

print_tweets = False
n_tweets = 10000
max_results = 100


auth = tweepy.OAuth1UserHandler(
    consumer_key, consumer_secret, access_token, access_token_secret
)

api = tweepy.API(auth)
client = tweepy.Client(bearer_token)

query = "#metoo"

tweet_fields = ['attachments', 'author_id', 'text', 'in_reply_to_user_id', 'referenced_tweets', 'entities']
next_token = None
end_time = datetime(year=2022, month=5, day=28)

all_tweets = []


# This endpoint/method returns Tweets from the last seven days
while len(all_tweets) < n_tweets:
    response = client.search_recent_tweets(query, tweet_fields=tweet_fields,
                                           max_results=max_results, next_token=next_token, end_time=end_time)
    tweets = response.data
    if tweets == None:
        break

    for tweet in tweets:
        all_tweets.append(tweet)

        if print_tweets:
            print(tweet.text)
            print("-------------------")

        if len(all_tweets) == n_tweets:
            break

    if print_tweets:
        print(len(all_tweets), "/", n_tweets)
    else:
        print("\r", len(all_tweets), "/", n_tweets, end="")

    if 'next_token' in response.meta:
        next_token = response.meta['next_token']
    else:
        break
```
%% Output
10000 / 10000
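
The cell above pages through the recent-search endpoint by hand via `response.meta['next_token']`; the credential variables (`consumer_key`, `consumer_secret`, `access_token`, `access_token_secret`, `bearer_token`) are assumed to be defined elsewhere, and the `api` object built from the OAuth1 handler is never actually used. For comparison, tweepy v4 also ships a `tweepy.Paginator` helper that does the `next_token` bookkeeping itself; a minimal sketch of the same collection, assuming the same `client`, `query`, `tweet_fields`, and `end_time` as above:

```python
# Sketch: the same collection loop using tweepy.Paginator, which follows
# next_token internally. Assumes client, query, tweet_fields, and end_time
# are defined as in the cell above.
import tweepy

all_tweets = []
for tweet in tweepy.Paginator(
    client.search_recent_tweets, query,
    tweet_fields=tweet_fields, max_results=100, end_time=end_time,
).flatten(limit=10000):
    all_tweets.append(tweet)
```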
%% Cell type:code id: tags:

```python
import json
with open("tweets.txt", "w+") as file:
    tweet_dicts = []
    for tweet in all_tweets:
        dict_tweet = dict(tweet)
        if tweet.referenced_tweets != None:
            dict_tweet['referenced_tweets'] = [dict(r_tweet) for r_tweet in tweet.referenced_tweets]
        tweet_dicts.append(dict_tweet)
    file.write(json.dumps(tweet_dicts))
```
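
As an aside, in tweepy v4 each `Tweet` keeps its original JSON payload in `tweet.data`, where `referenced_tweets` is already a plain list of dicts, so the manual conversion above could plausibly be skipped; a sketch:

```python
# Sketch: dump the raw API payloads directly. tweet.data is the original
# JSON dict that tweepy parsed, so no per-field conversion is needed.
# Assumes all_tweets from the collection cell above.
import json

with open("tweets.txt", "w") as file:
    json.dump([tweet.data for tweet in all_tweets], file)
```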
%% Cell type:code id: tags:

```python
import json
with open("tweets.txt", "r") as file:
    all_tweets_json = json.load(file)
```
%% Cell type:code id: tags:

```python
import pickle
from create_enron_network import tokenize_text, get_labels
from edgelabelgraph import EdgeLabelGraph

G = EdgeLabelGraph()

for tweet in all_tweets_json:

    author = tweet['author_id']

    # Use hashtags as labels
    if not ('entities' in tweet and 'hashtags' in tweet['entities']):
        continue
    labels = set([hashtag['tag'].lower() for hashtag in tweet['entities']['hashtags']])

    if 'mentions' in tweet['entities']:
        referenced_users = set([int(user['id']) for user in tweet['entities']['mentions']])

    if 'in_reply_to_user_id' in tweet:
        referenced_users.add(tweet['in_reply_to_user_id'])

    for user in referenced_users:
        if user != author:
            G.add_edge_with_labels((author, user), labels)

print(G.number_of_nodes())
print(G.number_of_edges())
print(G.density())

with open("tweets_graph.pkl", "wb") as file:
    pickle.dump(G, file)
```
%% Output
8077
9429
1.1673888820106475
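
Two remarks on the graph cell. First, the printed density of about 1.167 equals 9429 / 8077, so `EdgeLabelGraph.density()` appears to report edges per node rather than the usual |E| / (|V| choose 2). Second, `referenced_users` is only assigned when a tweet has mentions, so a reply with no mentions either raises a `NameError` (if no earlier tweet had mentions) or silently reuses the previous tweet's mention set. A hardened sketch of the per-tweet edge extraction, using the same `EdgeLabelGraph` API as above:

```python
# Sketch: start from a fresh referenced_users set for every tweet so a
# reply without mentions cannot inherit a stale set from an earlier
# iteration. G, all_tweets_json, and the labels logic are as above.
for tweet in all_tweets_json:
    author = tweet['author_id']

    # Use hashtags as labels; skip tweets that have none
    if not ('entities' in tweet and 'hashtags' in tweet['entities']):
        continue
    labels = {hashtag['tag'].lower() for hashtag in tweet['entities']['hashtags']}

    referenced_users = set()  # fresh set per tweet
    if 'mentions' in tweet['entities']:
        referenced_users.update(int(user['id']) for user in tweet['entities']['mentions'])
    if 'in_reply_to_user_id' in tweet:
        referenced_users.add(tweet['in_reply_to_user_id'])

    for user in referenced_users:
        if user != author:
            G.add_edge_with_labels((author, user), labels)
```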