Skip to content
GitLab
Explore
Sign in
Register
Primary navigation
Search or go to…
Project
L
labeled-densest-subgraph
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package registry
Container Registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
dacs
labeled-densest-subgraph
Commits
5821a06c
Commit
5821a06c
authored
2 years ago
by
Iiro Kumpulainen
Browse files
Options
Downloads
Patches
Plain Diff
Upload New File
parent
146e71c1
No related branches found
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
create_phys_network.py
+97
-0
97 additions, 0 deletions
create_phys_network.py
with
97 additions
and
0 deletions
create_phys_network.py
0 → 100644
+
97
−
0
View file @
5821a06c
# Build a labeled co-authorship graph from the physics theory citation data.
#
# Nodes are authors, an edge joins two authors who wrote enough papers
# together, and the edge labels are taken from the titles of those papers.
# The resulting EdgeLabelGraph is pickled into the working directory.

from itertools import combinations


def split_authors(authors_field):
    """Split the raw author column of one CSV row into individual names.

    Authors are separated either by "&" or by the word "and"; a dangling
    trailing "and" is dropped.  Every returned name is stripped of
    surrounding whitespace and empty fragments are discarded.

    NOTE(review): the separators are assumed to be "&" and " and " --
    confirm against AuthorNodes.csv.
    """
    authors_field = authors_field.strip()
    if authors_field.endswith(" and"):
        authors_field = authors_field[:-4]
    separator = "&" if "&" in authors_field else " and "
    fragments = (fragment.strip() for fragment in authors_field.split(separator))
    return [fragment for fragment in fragments if fragment]


def split_name_parts(author):
    """Break an author name into its parts; the surname is the last part.

    Words are split on whitespace and on "." (so "J.R." yields "J", "R").
    Hyphenated first names are split as well ("J.-P." and "Jean-Pierre"
    both yield two parts), but the surname word is never split on hyphens
    because surnames are never shortened.

    Returns an empty list when the name contains no usable characters.
    """
    words = author.split()
    name_parts = []
    for index, word in enumerate(words):
        is_surname_word = index == len(words) - 1
        for piece in word.split("."):
            sub_pieces = [piece] if is_surname_word else piece.split("-")
            name_parts.extend(sub for sub in sub_pieces if sub)
    return name_parts


def first_names_compatible(firstname_parts, other_firstname_parts):
    """Return True if two first-name lists may denote the same person.

    Parts are compared position by position up to the shorter list; each
    pair must agree in the sense that one is a prefix of the other
    (e.g. "J" vs "John").  An empty list is compatible with anything.
    """
    return all(
        mine.startswith(theirs) or theirs.startswith(mine)
        for mine, theirs in zip(firstname_parts, other_firstname_parts)
    )


def canonical_author_name(author, author_names):
    """Map a spelling of an author's name to one canonical spelling.

    ``author_names`` maps a surname to a list of
    ``(firstname_parts, canonical_name)`` tuples and is updated in place:
    the first spelling seen for a person becomes the canonical one, and
    later spellings with the same surname and compatible first names are
    resolved to it.

    Returns the canonical name, or None if ``author`` has no name parts.
    """
    name_parts = split_name_parts(author)
    if not name_parts:
        return None
    *firstname_parts, surname = name_parts
    namesakes = author_names.setdefault(surname, [])
    for known_firstname_parts, known_name in namesakes:
        if first_names_compatible(firstname_parts, known_firstname_parts):
            return known_name
    namesakes.append((firstname_parts, author))
    return author


def coauthor_pairs(authors):
    """Return every unordered author pair once, in a canonical orientation.

    Sorting first guarantees that the same two authors always produce the
    same tuple, regardless of the (arbitrary) iteration order of the input
    set, so per-pair counts are not split between (a, b) and (b, a).
    """
    return list(combinations(sorted(authors), 2))


def main():
    """Read the author/article CSV files, build the graph and pickle it."""
    # Project-local imports stay inside main() so the helpers above can be
    # imported without these modules being available.
    from create_enron_network import tokenize_text, get_labels  # tokenize_text is unused here
    from edgelabelgraph import EdgeLabelGraph
    from collections import Counter
    import os
    import pickle

    data_dir = "../Physics Theory Citation Network/"
    G = EdgeLabelGraph()

    # paper id -> set of canonical author names
    id_to_authors = dict()
    # surname -> [(firstname_parts, canonical_name), ...]
    author_names = dict()
    with open(os.path.join(data_dir, "AuthorNodes.csv"), encoding="utf8", errors='ignore') as f:
        for line in f:
            line = line.rstrip("\n")
            if not line.strip():
                continue
            # Exactly two columns are expected; anything else raises ValueError.
            paper_id, authors_field = line.split(",")
            for author in split_authors(authors_field):
                author_name = canonical_author_name(author, author_names)
                if author_name is None:
                    continue
                id_to_authors.setdefault(paper_id, set()).add(author_name)

    papers = []  # (sorted author pairs, labels) for every multi-author paper
    papers_with_label = Counter()
    author_edges = Counter()
    n_papers = 0
    with open(os.path.join(data_dir, "ArticleNodes.csv"), encoding="utf8", errors='ignore') as f:
        for line in f:
            if not line.strip():
                continue
            # Exactly five columns are expected; anything else raises ValueError.
            paper_id, title, year, journal, abstract = line.split(",")
            if paper_id not in id_to_authors:
                print(f"No authors for {paper_id}")
                continue
            authors = id_to_authors[paper_id]
            if len(authors) <= 1:
                # Ignore papers with only one author
                continue
            n_papers += 1
            labels = get_labels(title.strip())
            for label in labels:
                papers_with_label[label] += 1
            pairs = coauthor_pairs(authors)
            for pair in pairs:
                author_edges[pair] += 1
            papers.append((pairs, labels))

    # A label must occur in at least 0.5% of the kept papers to be used.
    min_papers_with_label = int(0.005 * n_papers)
    min_shared_papers_for_edge = 2
    for pairs, labels in papers:
        kept_labels = labels.copy()
        for label in tuple(kept_labels):
            if papers_with_label[label] < min_papers_with_label:
                kept_labels.remove(label)
        if len(kept_labels) == 0:
            continue
        for edge in pairs:
            if author_edges[edge] >= min_shared_papers_for_edge:
                # Each edge gets its own copy so the graph never shares label containers.
                G.add_edge_with_labels(edge, kept_labels.copy())

    with open(f"phys_graph_title_min_{min_papers_with_label}_shared_{min_shared_papers_for_edge}.pkl", "wb") as file:
        pickle.dump(G, file)


if __name__ == "__main__":
    main()
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment