From 4ec25091334a9ba0752360a677071b4cf7c8c74b Mon Sep 17 00:00:00 2001 From: chamalee <chamalee.wickramaarachch@helsinki.fi> Date: Mon, 24 Jan 2022 21:53:10 +0200 Subject: [PATCH] new experiments --- .DS_Store | Bin 0 -> 6148 bytes README.md | 52 +++-- bike_times_edges-large.py | 108 +++++++++ bike_times_edges.py | 143 ++++++++++++ bikes_santander.py | 4 +- experiment.py | 12 +- likelihood_vs_H_bikes.py | 61 +++--- likelihood_vs_H_bitcoin.py | 64 +++--- likelihood_vs_H_eu_dep2.py | 64 +++--- likelihood_vs_H_synthetic.py | 64 +++--- optimize.py | 58 ++--- real_dataset_time_vs_edges.py | 13 +- sbm_core.py | 72 +++++- synthetic_experiment_1.py | 2 +- ...ic_experiment_likelihood_vs_H_synthetic.py | 205 ------------------ synthetic_experiment_time_vs_edges.py | 19 +- 16 files changed, 527 insertions(+), 414 deletions(-) create mode 100644 .DS_Store mode change 100644 => 100755 README.md create mode 100644 bike_times_edges-large.py create mode 100644 bike_times_edges.py delete mode 100755 synthetic_experiment_likelihood_vs_H_synthetic.py diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..5008ddfcf53c02e82d7eee2e57c38e5672ef89f6 GIT binary patch literal 6148 zcmeH~Jr2S!425mzP>H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**<q8>++&mCkOWA81W14cNZ<zv;LbK1Poaz?KmsK2CSc!( z0ynLxE!0092;Krf2c+FF_Fe*7ECH>lEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T0<F0fCPF1$Cyrb|F7^5{eNG?83~ZUUlGt@xh*qZDeu<Z%US-OSsOPv j)R!Z4KLME7ReXlK;d!wEw5GODWMKRea10D2@KpjYNUI8I literal 0 HcmV?d00001 diff --git a/README.md b/README.md old mode 100644 new mode 100755 index d0666b7..94fab83 --- a/README.md +++ b/README.md @@ -11,20 +11,7 @@ The estimated intensity functions, groups/clusters and change-points can be obta The file `utils.py` contains the utility code to read data, SMAWK utilities, creating networkx graph object etc. The file `experiments.py` contains the function to run the simulations. -# Experimental files - -From the file `synthetic_experiment_1.py` to `synthetic_experiment_5.py` contain code to simulate synthetic datasets using our algorithms. -These files contain the code to generate synthetic data as well. - -The file `synthetic_experiment_likelihood_vs_H_synthetic.py` returns likelihood value for given `H` level. -To reproduce the results of the paper, set `current_h` = 1,2,3,4,5,6,7,8 ; one at a time. - -The file `synthetic_experiment_time_vs_edges.py` returns the running time and edges for given `NO_SAMPLES`. -To reproduce the results of the paper, eg. set `NO_SAMPLES` = 50 from the list_samples = [50,100,150,200] ; one at a time. - -Note that to find the `Normlaized likelihood`, in the bottom of each files we have a code snippet as follows. -(Normalized log-likelihood is the ratio between a particular likelihood and -the likelihoodvalue which corresponds to a single group and a single segment.) +# Experimental files and Running the code ``` num_roles=1 @@ -34,16 +21,10 @@ algo_ver= 3 ``` The files `bikes.py` , `collegeMsg.py` , `bitcoin.py` , `eu_email_dep1.py`,`eu_email_dep2.py`, `mathoverflow.py`, and `mooc.py` - provide examples on real world dynamic networks. - -The files `likelihood_vs_H_bikes.py`,`likelihood_vs_H_bitcoin.py`,`likelihood_vs_H_dep2.py` returns likelihood value for given `H` level. -To reproduce the results of the paper, set `current_h` = 1,2,3,4,5,6,7,8 ; one at a time. +provide examples on real world dynamic networks. 
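All of these scripts expect a temporal edge list with `source`, `target`, and `timestamp` columns. Their shared preprocessing looks roughly like the sketch below (a simplified sketch only: the file name `Data/my_edges.csv` is a placeholder, and the bike-sharing scripts additionally swap endpoints so the graph is treated as undirected):

```python
import pandas as pd

# Placeholder path; any CSV with one timestamped interaction per row will do.
df = pd.read_csv("Data/my_edges.csv", parse_dates=["timestamp"])
df = df[["source", "target", "timestamp"]]

df = df[df["target"].notnull()]        # drop events with a missing endpoint
df = df[df["source"] != df["target"]]  # remove self-loops
df = df.sort_values("timestamp")

# Convert datetimes to epoch seconds and shift so the first event is at t = 0,
# mirroring the preprocessing in the real-world scripts.
df["timestamp"] = df["timestamp"].astype("int64") // 10**9
refValue = df["timestamp"].min()
df["timestamp"] -= refValue
```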
-The file `real_dataset_time_vs_edges.py` returns the running time and edges for given `NO_SAMPLES`.
-To reproduce the results of the paper, eg. set `_frac` = 1 from the _frac_list = .2 , .4 , .6, .8 and 1 ; one at a time.
-
-# Running the code
-All the experimental files can be run directly to do the simulations.
+This zip folder contains experiments for both synthetic and real-world datasets.
+The data we have used is publicly available.
 
 There are four main user parameters. Example is as follows.
 ```
@@ -54,16 +35,33 @@ algo_ver= 3
 dest_folder='./Results/bikes/'
 ```
 
-algo_ver `3` is dedicated to (K-H)-segmentation algorithm.
+algo_ver `3` is dedicated to the (K,H)-segmentation algorithm using SMAWK.
+algo_ver `2` is dedicated to the (K,H)-segmentation algorithm using naive segmentation.
 
 There are two tuning parameters. Example is as follows.
 ```
 theta = 1e-5
 eta = 1
 ```
-This zip folder consists experiments for both for Synthetic and real-world datasets.
-Real-world data is excluded from Data folder due to the large size.
-The data we have used is publicly available.
+All the experimental files can be run directly to run the simulations.
+
+The files `synthetic_experiment_1.py` to `synthetic_experiment_5.py` contain code to simulate synthetic datasets using our algorithms.
+These files also contain the code to generate the synthetic data.
+
+The files `likelihood_vs_H_bikes.py`, `likelihood_vs_H_bitcoin.py`, and `likelihood_vs_H_eu_dep2.py` return normalized likelihood values for
+a set of given `H` levels, e.g. `current_h` = 1, 2, 3, ..., 20.
+To reproduce the results of the paper, just run the `likelihood_vs_H_?.py` files.
+
+Note that to find the `Normalized likelihood`, at the bottom of each file we have a code snippet as follows.
+(The normalized log-likelihood is the ratio between a particular likelihood and
+the likelihood value which corresponds to a single group and a single segment; a condensed sketch of this computation is given below.)
+
+The file `bike_times_edges-large.py` returns the running time and edges for a given fraction of edges.
+To reproduce the results of the paper, e.g. set `_frac` = .4, .5, .8 or 1; one at a time. Note that the running time can depend on the machine you run on.
+To switch between the two algorithms, change the `algo_ver` parameter to either 2 or 3.
+
+The file `synthetic_experiment_time_vs_edges.py` returns the running time and edges for a given `NO_SAMPLES`.
+To reproduce the results of the paper, e.g. set `NO_SAMPLES` = 50 from list_samples = [50,100,150,200]; one at a time.
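For reference, the normalized likelihood mentioned above is computed in the `likelihood_vs_H_*.py` scripts roughly as follows: one baseline run with a single group, a single segment and a single level, then a sweep over `H`. The snippet below is a condensed sketch of that loop (after this patch `Experiment.execute()` returns the log-likelihood; `df` and `refValue` are the preprocessed edge list and time offset, and the streamlit cache clearing done in the actual scripts is omitted):

```python
import experiment

tuning_params = {'theta': 1e-20, 'eta': 1}
dest_folder = './Results/synthetic/'

# Baseline log-likelihood: one group, one segment, one level.
nl = experiment.Experiment(df, 1, 1, 3, dest_folder,
                           tuning_params, 1, refValue).execute()

res = []
for current_h in range(1, 9):
    # Full model: num_roles=2, num_segments=20, num_levels=current_h,
    # algo_ver=3 (SMAWK variant), as in the likelihood_vs_H_* scripts.
    l = experiment.Experiment(df, 2, 20, 3, dest_folder,
                              tuning_params, current_h, refValue).execute()
    res.append(l / nl)  # normalized log-likelihood for this H

print(res)
```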
## References diff --git a/bike_times_edges-large.py b/bike_times_edges-large.py new file mode 100644 index 0000000..080198d --- /dev/null +++ b/bike_times_edges-large.py @@ -0,0 +1,108 @@ + + +""" + +SANTANDER-LARGE dataset + +REAL-TIME EDGES PAPER: +Generate piecewise non-homogeneous poisson point process (NHPPP) + +To check the running time vs number of edges +real-world Dataset + +To reproduce the results of the paper: +set + _frac = .4 , .6, .8 and 1 ; +one a time +""" +import experiment +import os +import pandas as pd +import pandas as pd +import time +import experiment +import numpy as np +import os +from streamlit import caching + +# read data + +filepath = os.path.join("Data","9b-Journey-Data-Extract-06Sep15-19Sep15.csv") +# pick 9th of September-2015 +start_date = "2015-9-9 0:00:00" +# end_date = "2015-9-9 23:59:59" +end_date = "2015-9-13 13:59:59" + +# Read data +df = pd.read_csv(filepath, dtype={'StartStation Id': 'Int64', 'EndStation Id': 'Int64'}, usecols=lambda x: x in ['Start Date', 'StartStation Id', 'EndStation Id'], parse_dates=['Start Date']) +df=df.set_axis(['source', 'timestamp', 'target'], axis=1) + + +# Remove null value +df = df[df['target'].isnull() != True] +#sort +df=df.sort_values('timestamp') + +# Filter dates +if start_date and end_date: + after_start_date = df["timestamp"] >= start_date + before_end_date = df["timestamp"] <= end_date + between_two_dates = after_start_date & before_end_date + df = df.loc[between_two_dates] + +# Remove self-loops +df = df[((df['source'] ) != (df['target']))] + +# convert datetime to epoch +df['timestamp'] = df['timestamp'].astype('int64')//1e9 + +def _swap (row): + if row['source'] > row['target']: + row['source'] , row['target'] =row['target'] , row['source'] + return row + +# Undirected graph +df=df.apply(lambda row: _swap(row), axis=1) +#scale timestamps for zeroth reference point +refValue = df['timestamp'].min() +df['timestamp'] -= refValue + +# df = df[:1000] + + # Remove self-loops +df = df[((df['source'] ) != (df['target']))] + + + + +########FRACTION###### .4,.6,.8,1 +_frac =1 + +caching.clear_cache() + +df1 = df.sample(frac=_frac).reset_index(drop=True) +#sort +df1=df1.sort_values('timestamp') + +#scale timestamps for zeroth reference point +refValue = df1['timestamp'].min() +df1['timestamp'] -= refValue + + +# User parameters +num_roles=3 +num_segments=5 +num_levels=3# Optional arg +algo_ver=3 +dest_folder='./Results/synthetic/' + +# tuning parameters +theta = 1e-20 +eta = 1 +tuning_params= {'theta':theta,'eta':eta} + +start_time = time.time() +exp_obj = experiment.Experiment(df1,num_roles,num_segments,algo_ver,dest_folder,tuning_params,num_levels,refValue) +exp_obj.execute() +print("--- %s seconds ---" % (time.time() - start_time)) + diff --git a/bike_times_edges.py b/bike_times_edges.py new file mode 100644 index 0000000..3a60a94 --- /dev/null +++ b/bike_times_edges.py @@ -0,0 +1,143 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Sat Jan 22 18:09:35 2022 + +@author: chamwick +""" + +""" + +REAL-TIME EDGES PAPER +Generate piecewise non-homogeneous poisson point process (NHPPP) + +To check the running time vs number of edges +real-world Dataset + +To reproduce the results of the paper: +set + _frac = .2 , .4 , .6, .8 and 1 ; +one a time +""" +import experiment +import os +import pandas as pd +import pandas as pd +import time +import experiment +import numpy as np +import os +from streamlit import caching + +# read data + +filepath = 
os.path.join("Data","9b-Journey-Data-Extract-06Sep15-19Sep15.csv") +# pick 9th of September-2015 +start_date = "2015-9-9 0:00:00" +end_date = "2015-9-9 23:59:59" +# end_date = "2015-9-10 00:06:59" + +# Read data +df = pd.read_csv(filepath, dtype={'StartStation Id': 'Int64', 'EndStation Id': 'Int64'}, usecols=lambda x: x in ['Start Date', 'StartStation Id', 'EndStation Id'], parse_dates=['Start Date']) +df=df.set_axis(['source', 'timestamp', 'target'], axis=1) + + +# Remove null value +df = df[df['target'].isnull() != True] +#sort +df=df.sort_values('timestamp') + +# Filter dates +if start_date and end_date: + after_start_date = df["timestamp"] >= start_date + before_end_date = df["timestamp"] <= end_date + between_two_dates = after_start_date & before_end_date + df = df.loc[between_two_dates] + +# Remove self-loops +df = df[((df['source'] ) != (df['target']))] + +# convert datetime to epoch +df['timestamp'] = df['timestamp'].astype('int64')//1e9 + +def _swap (row): + if row['source'] > row['target']: + row['source'] , row['target'] =row['target'] , row['source'] + return row + +# Undirected graph +df=df.apply(lambda row: _swap(row), axis=1) +#scale timestamps for zeroth reference point +refValue = df['timestamp'].min() +df['timestamp'] -= refValue + +# df = df[:1000] + + # Remove self-loops +df = df[((df['source'] ) != (df['target']))] + + + + +########FRACTION###### .2,.4,.6,.8,1 +_frac = .4 + +caching.clear_cache() + +df1 = df.sample(frac=_frac).reset_index(drop=True) +#sort +df1=df1.sort_values('timestamp') + +#scale timestamps for zeroth reference point +refValue = df1['timestamp'].min() +df1['timestamp'] -= refValue + + +# User parameters +num_roles=3 +num_segments=5 +num_levels=3# Optional arg +algo_ver=2 +dest_folder='./Results/synthetic/' + +# tuning parameters +theta = 1e-20 +eta = 1 +tuning_params= {'theta':theta,'eta':eta} + +start_time = time.time() +exp_obj = experiment.Experiment(df1,num_roles,num_segments,algo_ver,dest_folder,tuning_params,num_levels,refValue) +exp_obj.execute() +print("--- %s seconds ---" % (time.time() - start_time)) + + + +# .2 --- 10.875536918640137 seconds --- +# .4 --- 20.28241729736328 seconds --- +# .6--- 28.719234943389893 seconds --- +# .8 --- 32.54414510726929 seconds --- +# 1 --- 38.315009117126465 seconds --- + +# frac edges time +# 0.2 9354 10.95566916 +# 0.4 18709 22.68538189 +# 0.6 28063 28.25948501 +# 0.8 37418 33.04973316 +# 1 46772 39.48938107 + +# new algo +# frac edges time +# 0.2 6452 3690.8258459568024 + +# 0.5 16129 7304.616402864456 + +# 1 32258 8373.52380952 + + +# new algo +# frac edges time +# 0.2 6452 3690.8258459568024 + +# 0.5 16129 7304.616402864456 + +# 1 32258 5042.336783885956 \ No newline at end of file diff --git a/bikes_santander.py b/bikes_santander.py index fc9a986..003aadb 100755 --- a/bikes_santander.py +++ b/bikes_santander.py @@ -53,7 +53,7 @@ df=df.apply(lambda row: _swap(row), axis=1) refValue = df['timestamp'].min() df['timestamp'] -= refValue - +df = df[:1000] # # Experiments @@ -62,7 +62,7 @@ df['timestamp'] -= refValue num_roles=3 num_segments=7 num_levels=5 -algo_ver= 3 +algo_ver= 2 dest_folder='./Results/bikes/' # tuning parameters diff --git a/experiment.py b/experiment.py index d9011ed..2dd87b4 100755 --- a/experiment.py +++ b/experiment.py @@ -50,17 +50,17 @@ class Experiment: ### K-segmentation ### if self.algo_ver == 1: opt = optimize.Optimize( group_dic,lambda_estimates,change_points_arr,nodes,num_roles,num_segments,dic,None,self.tuning_params) - [group_dic,lambda_estimates,change_points_arr] = 
opt.k_seg() + [group_dic,lambda_estimates,change_points_arr,likelihood] = opt.k_seg() ### (K,H)-segmentation variant-1 ### elif self.algo_ver == 2: opt = optimize.Optimize( group_dic,lambda_estimates,change_points_arr,nodes,num_roles,num_segments,dic, self.num_levels,self.tuning_params) - [group_dic,lambda_estimates,change_points_arr] = opt.k_h_seg_var_1() + [group_dic,lambda_estimates,change_points_arr,likelihood] = opt.k_h_seg_var_1() ### (K,H)-segmentation variant-2 ### elif self.algo_ver == 3: opt = optimize.Optimize( group_dic,lambda_estimates,change_points_arr,nodes,num_roles,num_segments,dic,self.num_levels,self.tuning_params) - [group_dic,lambda_estimates,change_points_arr] = opt.k_h_seg_var_2() + [group_dic,lambda_estimates,change_points_arr,likelihood] = opt.k_h_seg_var_2() # Plotting # dest_folder= self.dest + str(self.algo_ver)+'/' @@ -69,5 +69,7 @@ class Experiment: list_of_groups= [[] for _ in range(num_roles)] for idx, val in group_dic.items(): list_of_groups[val].append(idx) - print('group assignments: {}'.format(list_of_groups)) - print('lambdas: {}'.format(lambda_estimates)) \ No newline at end of file + # print('group assignments: {}'.format(list_of_groups)) + # print('lambdas: {}'.format(lambda_estimates)) + + return likelihood \ No newline at end of file diff --git a/likelihood_vs_H_bikes.py b/likelihood_vs_H_bikes.py index 540c5d7..5d2d726 100755 --- a/likelihood_vs_H_bikes.py +++ b/likelihood_vs_H_bikes.py @@ -4,9 +4,7 @@ To check the likelihood vs H using bikes Dataset To reproduce the results of the paper: -set - current_h = 1,2,3,4,5,6,7,8 ; -one at a time +run this file """ @@ -63,18 +61,15 @@ df=df.apply(lambda row: _swap(row), axis=1) refValue = df['timestamp'].min() df['timestamp'] -= refValue - -# To reproduce the results of the paper: -# set current_h = i choosing from H_list ; one at a time -# H_list = [1,2,3,4,5,6,7,8] +# Likelihood when one segment, one group +# Normlaized likelihood +res = [] current_h= 1 - - caching.clear_cache() # User parameters -num_roles=2 -num_segments=20 +num_roles=1 +num_segments=1 num_levels=current_h algo_ver=3 dest_folder='./Results/synthetic/' @@ -85,27 +80,31 @@ eta = 1 tuning_params= {'theta':theta,'eta':eta} exp_obj = experiment.Experiment(df,num_roles,num_segments,algo_ver,dest_folder,tuning_params,num_levels,refValue) -exp_obj.execute() - - -# Normlaized likelihood -# num_roles=1 -# num_segments=1 -# num_levels=1 -# exp_obj = experiment.Experiment(df,num_roles,num_segments,algo_ver,dest_folder,tuning_params,num_levels,refValue) -# exp_obj.execute() +nl = exp_obj.execute() -# results +for i in range(1,21): + current_h= i + + + caching.clear_cache() + + # User parameters + num_roles=2 + num_segments=20 + num_levels=current_h + algo_ver=3 + dest_folder='./Results/synthetic/' + + # tuning parameters + theta = 1e-20 + eta = 1 + tuning_params= {'theta':theta,'eta':eta} + + exp_obj = experiment.Experiment(df,num_roles,num_segments,algo_ver,dest_folder,tuning_params,num_levels,refValue) + l = exp_obj.execute() + + res.append(l/nl) -# h=1 -457222.6285 -# h=2 -448905.9252 -# h=3 -445109.8155 -# h=4 -444475.4396 -# h=5 -444307.0014 -# h=6 -444226.6047 -# h=7 -444029.6038 -# h=8 -444012.6393 +print(res) -# Likelihood when one segment, one group -# Normlaized likelihood : -467411.027916393 diff --git a/likelihood_vs_H_bitcoin.py b/likelihood_vs_H_bitcoin.py index bbc7599..ce2839a 100755 --- a/likelihood_vs_H_bitcoin.py +++ b/likelihood_vs_H_bitcoin.py @@ -7,9 +7,7 @@ To check the likelihood vs H using Bitcoin Dataset To reproduce the 
results of the paper: -set - current_h = 1,2,3,4,5,6,7,8 ; -one at a time +run this file """ @@ -46,21 +44,15 @@ df['timestamp'] -= refValue # Remove self-loops df = df[((df['source'] ) != (df['target']))] - - - - -# To reproduce the results of the paper: -# set current_h = i choosing from H_list ; one at a time -# H_list = [1,2,3,4,5,6,7,8] +# Likelihood when one segment, one group +# Normlaized likelihood +res=[] current_h= 1 - - caching.clear_cache() # User parameters -num_roles=2 -num_segments=20 +num_roles=1 +num_segments=1 num_levels=current_h algo_ver=3 dest_folder='./Results/synthetic/' @@ -71,27 +63,31 @@ eta = 1 tuning_params= {'theta':theta,'eta':eta} exp_obj = experiment.Experiment(df,num_roles,num_segments,algo_ver,dest_folder,tuning_params,num_levels,refValue) -exp_obj.execute() +nl = exp_obj.execute() +caching.clear_cache() +for i in range(20,21): + current_h= i -# Normlaized likelihood -# num_roles=1 -# num_segments=1 -# num_levels=1 -# exp_obj = experiment.Experiment(df,num_roles,num_segments,algo_ver,dest_folder,tuning_params,num_levels,refValue) -# exp_obj.execute() - - -# results + + caching.clear_cache() + + # User parameters + num_roles=2 + num_segments=20 + num_levels=current_h + algo_ver=3 + dest_folder='./Results/synthetic/' + + # tuning parameters + theta = 1e-20 + eta = 1 + tuning_params= {'theta':theta,'eta':eta} + + exp_obj = experiment.Experiment(df,num_roles,num_segments,algo_ver,dest_folder,tuning_params,num_levels,refValue) + l = exp_obj.execute() + + res.append(l/nl) -# h=1 -581330.1065 -# h=2 -574937.7445 -# h=3 -573027.7905 -# h=4 -570359.3383 -# h=5 -570300.061 -# h=6 -570337.8106 -# h=7 -570142.8354 -# h=8 -570038.9478 +print(res) -# Likelihood when one segment, one group -# Normlaized likelihood : -619318.392137728 diff --git a/likelihood_vs_H_eu_dep2.py b/likelihood_vs_H_eu_dep2.py index bdbccc9..74d1b21 100755 --- a/likelihood_vs_H_eu_dep2.py +++ b/likelihood_vs_H_eu_dep2.py @@ -6,9 +6,7 @@ To check the likelihood vs H using Eu-email-dep-2 Dataset To reproduce the results of the paper: -set - current_h = 1,2,3,4,5,6,7,8 ; -one at a time +run this file """ @@ -47,20 +45,15 @@ df['timestamp'] -= refValue df = df[((df['source'] ) != (df['target']))] - - - -# To reproduce the results of the paper: -# set current_h = i choosing from H_list ; one at a time -# H_list = [1,2,3,4,5,6,7,8] +# Likelihood when one segment, one group +# Normlaized likelihood +res = [] current_h= 1 - - caching.clear_cache() # User parameters -num_roles=2 -num_segments=20 +num_roles=1 +num_segments=1 num_levels=current_h algo_ver=3 dest_folder='./Results/synthetic/' @@ -71,27 +64,32 @@ eta = 1 tuning_params= {'theta':theta,'eta':eta} exp_obj = experiment.Experiment(df,num_roles,num_segments,algo_ver,dest_folder,tuning_params,num_levels,refValue) -exp_obj.execute() - - -# Normlaized likelihood -# num_roles=1 -# num_segments=1 -# num_levels=1 -# exp_obj = experiment.Experiment(df,num_roles,num_segments,algo_ver,dest_folder,tuning_params,num_levels,refValue) -# exp_obj.execute() +nl = exp_obj.execute() -# results +for i in range(2,4): + current_h= i + + + caching.clear_cache() + + # User parameters + num_roles=2 + num_segments=20 + num_levels=current_h + algo_ver=3 + dest_folder='./Results/synthetic/' + + # tuning parameters + theta = 1e-20 + eta = 1 + tuning_params= {'theta':theta,'eta':eta} + + exp_obj = experiment.Experiment(df,num_roles,num_segments,algo_ver,dest_folder,tuning_params,num_levels,refValue) + l = exp_obj.execute() + + res.append(l/nl) -# h=1 -783492.0927 -# h=2 
-764671.531 -# h=3 -761175.1947 -# h=4 -759937.1638 -# h=5 -761905.0863 -# h=6 -761858.5747 -# h=7 -761499.0086 -# h=8 -760671.881 +print(res) -# Likelihood when one segment, one group -# Normlaized likelihood : -831549.922260613 + diff --git a/likelihood_vs_H_synthetic.py b/likelihood_vs_H_synthetic.py index 14ee47e..884eb87 100755 --- a/likelihood_vs_H_synthetic.py +++ b/likelihood_vs_H_synthetic.py @@ -4,9 +4,7 @@ To check the likelihood vs H using Synthetic Dataset To reproduce the results of the paper: -set - current_h = 1,2,3,4,5,6,7,8 ; -one at a time +run this file """ @@ -31,7 +29,7 @@ np.random.seed(111) num_roles=2 num_vertices=10 -num_segments = 10 +num_segments = 20 NO_SAMPLES= 250 group_assignment= np.random.randint(num_roles, size=(num_vertices)) @@ -157,19 +155,15 @@ df=df.apply(lambda row: _swap(row), axis=1) refValue = df['timestamp'].min() df['timestamp'] -= refValue - - -# To reproduce the results of the paper: -# set current_h = i choosing from H_list ; one at a time -# H_list = [1,2,3,4,5,6,7,8] +# Likelihood when one segment, one group +# Normlaized likelihood +res = [] current_h= 1 - - caching.clear_cache() # User parameters -num_roles=2 -num_segments=20 +num_roles=1 +num_segments=1 num_levels=current_h algo_ver=3 dest_folder='./Results/synthetic/' @@ -180,26 +174,30 @@ eta = 1 tuning_params= {'theta':theta,'eta':eta} exp_obj = experiment.Experiment(df,num_roles,num_segments,algo_ver,dest_folder,tuning_params,num_levels,refValue) -exp_obj.execute() +nl = exp_obj.execute() + +for i in range(1,9): + current_h= i -# Normlaized likelihood -# num_roles=1 -# num_segments=1 -# num_levels=1 -# exp_obj = experiment.Experiment(df,num_roles,num_segments,algo_ver,dest_folder,tuning_params,num_levels,refValue) -# exp_obj.execute() - - -# sample results -# h= 1 :-73402.83746 -# h= 2 :-68267.30975 -# h= 3 :-67919.07115 -# h= 4 :-67390.64768 -# h= 5 : -67390.39226 -# h= 6 :-67056.12304 -# h= 7 :-67054.4862 -# h= 8 :-67051.29741 + + caching.clear_cache() + + # User parameters + num_roles=2 + num_segments=20 + num_levels=current_h + algo_ver=3 + dest_folder='./Results/synthetic/' + + # tuning parameters + theta = 1e-20 + eta = 1 + tuning_params= {'theta':theta,'eta':eta} + + exp_obj = experiment.Experiment(df,num_roles,num_segments,algo_ver,dest_folder,tuning_params,num_levels,refValue) + l = exp_obj.execute() + + res.append(l/nl) -# Likelihood when one segment, one group -# Normlaized likelihood : -74053.078339717 +print(res) diff --git a/optimize.py b/optimize.py index bf77a7b..c77497d 100755 --- a/optimize.py +++ b/optimize.py @@ -24,7 +24,9 @@ class Optimize: """K-segmentation""" liklihood_sum = 0 - + # initialize lambdas randomly + # self.num_levels = self.num_segments + # self.com_h_lvl_lambda() self.lambda_estimates=sbm_core.estimate_lamda(self.group_assignment,self.lambda_estimates,self.change_points_arr,self.num_roles,self.num_segments,self.df,self.tuning_params) liklihood_sum = sbm_core.compute_cost(self.group_assignment,self.lambda_estimates,self.change_points_arr,self.num_roles,self.num_segments,self.df) @@ -54,7 +56,7 @@ class Optimize: _prev_val = _curr_val _curr_val = liklihood_sum _itr+=1 - return [self.group_assignment,self.lambda_estimates,self.change_points_arr] + return [self.group_assignment,self.lambda_estimates,self.change_points_arr,liklihood_sum] def com_h_lvl_lambda(self): H = self.num_levels @@ -79,39 +81,42 @@ class Optimize: def k_h_seg_var_1(self): """(K,H)-segmentation variant-1""" - # initialize lambdas randomly - self.com_h_lvl_lambda() - 
self.lambda_estimates=sbm_core.estimate_lamda_kh(self.num_roles,self.num_segments,self.lambda_estimates,self.group_assignment,self.change_points_arr,self.df,self.tuning_params) + # initilaization algorithm: initialize lambdas randomly and segments through linear seg. ver 2. + self.com_h_lvl_lambda() + + [self.lambda_estimatess,self.change_points_arr]=sbm_core.linear_seg_ver_2(self.num_roles,self.num_segments,self.group_assignment,self.lambda_estimates,self.change_points_arr,self.df) + self.lambda_estimates=sbm_core.estimate_lamda_kh(self.num_roles,self.num_segments,self.lambda_estimates,self.group_assignment,self.change_points_arr,self.df,self.tuning_params) liklihood_sum = sbm_core.compute_cost(self.group_assignment,self.lambda_estimates,self.change_points_arr,self.num_roles,self.num_segments,self.df) - _prev_val = math.inf _curr_val = liklihood_sum - _itr = 0 + _itr = 0 while round(_prev_val,2) != round(_curr_val,2): print("iteration no........... %d " %(_itr+1)) - - self.group_assignment=sbm_core.assign_groups(self.group_assignment,self.lambda_estimates,self.change_points_arr,self.nodes,self.num_roles,self.num_segments,self.df) - # self.group_assignment=sbm_core.group_assignment_ver2_2(self.nodes,self.num_roles,self.num_segments,self.lambda_estimates,self.group_assignment,self.change_points_arr,self.df) - print('after grouping') - liklihood_sum = sbm_core.compute_cost(self.group_assignment,self.lambda_estimates,self.change_points_arr,self.num_roles,self.num_segments,self.df) + # self.group_assignment=sbm_core.assign_groups(self.group_assignment,self.lambda_estimates,self.change_points_arr,self.nodes,self.num_roles,self.num_segments,self.df) + self.group_assignment=sbm_core.group_assignment_ver2_2(self.nodes,self.num_roles,self.num_segments,self.lambda_estimates,self.group_assignment,self.change_points_arr,self.df) + # print('after grouping') + # liklihood_sum = sbm_core.compute_cost(self.group_assignment,self.lambda_estimates,self.change_points_arr,self.num_roles,self.num_segments,self.df) self.lambda_estimates=sbm_core.estimate_lamda_kh(self.num_roles,self.num_segments,self.lambda_estimates,self.group_assignment,self.change_points_arr,self.df,self.tuning_params) - print('after lambda estimate') - liklihood_sum = sbm_core.compute_cost(self.group_assignment,self.lambda_estimates,self.change_points_arr,self.num_roles,self.num_segments,self.df) - - print('after seg') + # print('after lambda estimate') + # liklihood_sum = sbm_core.compute_cost(self.group_assignment,self.lambda_estimates,self.change_points_arr,self.num_roles,self.num_segments,self.df) + + # print('after seg') self.change_points_arr = sbm_core.dyn_prog_seg(self.group_assignment,self.lambda_estimates,self.change_points_arr,self.num_roles,self.num_segments,self.df) - liklihood_sum = sbm_core.compute_cost(self.group_assignment,self.lambda_estimates,self.change_points_arr,self.num_roles,self.num_segments,self.df) - self.lambda_estimates=sbm_core.estimate_lamda_kh(self.num_roles,self.num_segments,self.lambda_estimates,self.group_assignment,self.change_points_arr,self.df,self.tuning_params) - print('after lambda estimate') - liklihood_sum = sbm_core.compute_cost(self.group_assignment,self.lambda_estimates,self.change_points_arr,self.num_roles,self.num_segments,self.df) + # [self.lambda_estimates,self.change_points_arr]=sbm_core.linear_seg_ver_2(self.num_roles,self.num_segments,self.group_assignment,self.lambda_estimates,self.change_points_arr,self.df) + # liklihood_sum = 
sbm_core.compute_cost(self.group_assignment,self.lambda_estimates,self.change_points_arr,self.num_roles,self.num_segments,self.df) + self.lambda_estimates=sbm_core.estimate_lamda_kh(self.num_roles,self.num_segments,self.lambda_estimates,self.group_assignment,self.change_points_arr,self.df,self.tuning_params) + # print('after lambda estimate') + liklihood_sum = sbm_core.compute_cost(self.group_assignment,self.lambda_estimates,self.change_points_arr,self.num_roles,self.num_segments,self.df) + # liklihood_sum = sbm_core.com_cost(self.num_roles,self.num_segments,self.lambda_estimates,self.change_points_arr,self.group_assignment,self.df) + print(' %d %f'%(_itr+1,liklihood_sum)) _prev_val = _curr_val _curr_val = liklihood_sum _itr+=1 - return [self.group_assignment,self.lambda_estimates,self.change_points_arr] + return [self.group_assignment,self.lambda_estimates,self.change_points_arr,liklihood_sum] def k_h_seg_var_2(self): """(K,H)-segmentation variant-2""" @@ -130,22 +135,23 @@ class Optimize: print("iteration no........... %d " %(_itr+1)) self.group_assignment=sbm_core.group_assignment_ver2_2(self.nodes,self.num_roles,self.num_segments,self.lambda_estimates,self.group_assignment,self.change_points_arr,self.df) - print('after grouping') + # print('after grouping') liklihood_sum = sbm_core.compute_cost(self.group_assignment,self.lambda_estimates,self.change_points_arr,self.num_roles,self.num_segments,self.df) self.lambda_estimates=sbm_core.estimate_lamda_kh(self.num_roles,self.num_segments,self.lambda_estimates,self.group_assignment,self.change_points_arr,self.df,self.tuning_params) - print('after lambda estimate') + # print('after lambda estimate') liklihood_sum = sbm_core.compute_cost(self.group_assignment,self.lambda_estimates,self.change_points_arr,self.num_roles,self.num_segments,self.df) - print('after seg') + # print('after seg') + # self.change_points_arr = sbm_core.dyn_prog_seg(self.group_assignment,self.lambda_estimates,self.change_points_arr,self.num_roles,self.num_segments,self.df) [self.lambda_estimates,self.change_points_arr]=sbm_core.linear_seg_ver_2(self.num_roles,self.num_segments,self.group_assignment,self.lambda_estimates,self.change_points_arr,self.df) liklihood_sum = sbm_core.compute_cost(self.group_assignment,self.lambda_estimates,self.change_points_arr,self.num_roles,self.num_segments,self.df) self.lambda_estimates=sbm_core.estimate_lamda_kh(self.num_roles,self.num_segments,self.lambda_estimates,self.group_assignment,self.change_points_arr,self.df,self.tuning_params) - print('after lambda estimate') + # print('after lambda estimate') # liklihood_sum = sbm_core.compute_cost(self.group_assignment,self.lambda_estimates,self.change_points_arr,self.num_roles,self.num_segments,self.df) print(' %d %f'%(_itr+1,liklihood_sum)) _prev_val = _curr_val _curr_val = liklihood_sum _itr+=1 - return [self.group_assignment,self.lambda_estimates,self.change_points_arr] + return [self.group_assignment,self.lambda_estimates,self.change_points_arr,liklihood_sum] diff --git a/real_dataset_time_vs_edges.py b/real_dataset_time_vs_edges.py index 62d8c25..ec5298f 100755 --- a/real_dataset_time_vs_edges.py +++ b/real_dataset_time_vs_edges.py @@ -42,11 +42,11 @@ df['timestamp'] -= refValue # Remove self-loops df = df[((df['source'] ) != (df['target']))] - +# df=df[:1000] ########FRACTION###### .2,.4,.6,.8,1 -_frac = 1 +_frac = .5 caching.clear_cache() @@ -63,7 +63,7 @@ df1['timestamp'] -= refValue num_roles=3 num_segments=5 num_levels=3# Optional arg -algo_ver=3 +algo_ver=2 
dest_folder='./Results/synthetic/' # tuning parameters @@ -91,4 +91,11 @@ print("--- %s seconds ---" % (time.time() - start_time)) # 0.8 37418 33.04973316 # 1 46772 39.48938107 +# new algo +# frac edges time +# 0.2 9354 10.95566916 +# 0.4 18709 22.68538189 +# 0.6 28063 28.25948501 +# 0.8 37418 33.04973316 +# 1 46772 39.48938107 diff --git a/sbm_core.py b/sbm_core.py index 8f96883..5117cbc 100755 --- a/sbm_core.py +++ b/sbm_core.py @@ -468,6 +468,74 @@ def estimate_lamda_kh(num_roles,num_segments,lambda_estimates,group_assignment,c # print('{} {} {}'.format(k,g,lambda_estimates[k,g,:]) ) return lambda_estimates + +# Compute cost (edge by edge) +def com_cost(num_roles,num_segments,lamda_estimates,change_points_arr,group_assignment,dic): + + + list_of_groups= [[] for _ in range(num_roles)] + + for idx, val in group_assignment.items(): + list_of_groups[val].append(idx) + + i_j_d = {} + + for i in range(0, num_roles): + for j in range(0, num_roles): + for d in range(0, num_segments): + i_j_d[(i,j,d)] = 0 + + for key, val in dic.items(): + + i=group_assignment.get(key[0]) + j=group_assignment.get(key[1]) + + if i>j: + i,j=j,i + + a = change_points_arr[i,j,:] + + ranges_arr = [ [a[s]+1,a[s+1]] for s in range(0,len(a)-1)] + ranges_arr[0][0]=0 + + n = len(ranges_arr) + + for item in val: + + d = _findSegment(ranges_arr, n, int(item)) + i_j_d[(i,j,d)] += 1 + + + liklihood_sum = 0 + + for d in range(0, num_segments): + for k in range(0, num_roles): + for g in range(k, num_roles): + + U=list_of_groups[k] + W=list_of_groups[g] + + size_all_pairs = 0 + if k == g: + size_all_pairs = math.comb(len(U), 2) + if k != g: + size_all_pairs = len(U)*len(W) + + alpha = (size_all_pairs * lamda_estimates[k,g,d]) + + delta= change_points_arr[k,g,d+1]-change_points_arr[k,g,d] + + if d == 0: + delta += 1 + + if lamda_estimates[k,g,d] != 0: + liklihood_sum += (i_j_d[(k,g,d)]* math.log(lamda_estimates[k,g,d])- (alpha*delta)) + print('Likelihood sum: %f'%(liklihood_sum)) + + + return liklihood_sum + + # Compute cost def compute_cost(group_assignment,lambda_estimates,change_points_arr,num_roles,num_segments,dic): @@ -534,7 +602,7 @@ def compute_cost(group_assignment,lambda_estimates,change_points_arr,num_roles,n liklihood_sum += (inter_count*math.log(lambda_estimates[k,g,d]) - size_all_pairs*lambda_estimates[k,g,d]*delta_t) # print('Likelihood sum: %d %d %f'%(k,g,liklihood_sum)) # print('Likelihood sum: %d %d %f'%(k,g,temp)) - print('Likelihood sum: %f'%(liklihood_sum)) + # print('Likelihood sum: %f'%(liklihood_sum)) return liklihood_sum # Estimate change points ( Naive Dynamic programming) @@ -917,6 +985,8 @@ def linear_seg_ver_2(num_roles,num_segments,group_assignment,lambda_estimates,ch d = ele[0] val = ele[1] # print('{} ,{}'.format(d,val)) + # print(m_lambda_estimates) + # print(lambda_estimates_h) for i1 in range(0, num_roles): for i2 in range(i1, num_roles): m_lambda_estimates[i1,i2,d] = lambda_estimates_h[i1,i2,val] diff --git a/synthetic_experiment_1.py b/synthetic_experiment_1.py index a872673..5a3a649 100755 --- a/synthetic_experiment_1.py +++ b/synthetic_experiment_1.py @@ -165,7 +165,7 @@ import experiment num_roles=2 num_segments=2 num_levels=2# Optional arg -algo_ver=3 +algo_ver=2 dest_folder='./Results/synthetic/' # tuning parameters diff --git a/synthetic_experiment_likelihood_vs_H_synthetic.py b/synthetic_experiment_likelihood_vs_H_synthetic.py deleted file mode 100755 index 14ee47e..0000000 --- a/synthetic_experiment_likelihood_vs_H_synthetic.py +++ /dev/null @@ -1,205 +0,0 @@ -""" - -To check the 
likelihood vs H -using Synthetic Dataset - -To reproduce the results of the paper: -set - current_h = 1,2,3,4,5,6,7,8 ; -one at a time -""" - - -import numpy as np -import math -from itertools import combinations -import itertools -import experiment -from streamlit import caching -import pandas as pd - - - -""" -Generate piecewise non-homogeneous poisson point process (NHPPP) -To check the ground truth -Dataset-5 -""" - -# Initilaize -np.random.seed(111) - -num_roles=2 -num_vertices=10 -num_segments = 10 - -NO_SAMPLES= 250 -group_assignment= np.random.randint(num_roles, size=(num_vertices)) - -nodes = np.arange(num_vertices) - -list_of_groups= [[] for _ in range(num_roles)] - -for idx, val in enumerate(group_assignment): - list_of_groups[val].append(nodes[idx]) - -print(list_of_groups) - -size_all_pairs = {} -for k in range(0, num_roles): - for g in range(k, num_roles): - U=list_of_groups[k] - W=list_of_groups[g] - - if k == g: - size_all_pairs[k,g] = math.comb(len(U), 2) - if k != g: - size_all_pairs[k,g] = len(U)*len(W) - -lamda_arr = np.ones((num_roles, num_roles,num_segments) , dtype=float) -lamda_arr = 1e-1* np.random.randint(1,9, size=(num_roles, num_roles,num_segments)) -lamda_arr_act = np.zeros((num_roles, num_roles,num_segments) , dtype=float) - -num_levels = 5 -H =num_levels -print('k-h levels %d'%(num_levels)) - -# h-level lambda estimates -lambda_estimates_h = np.random.rand(num_roles, num_roles, H) - -l1 =list(range(0, H)) -l2 = [] -if num_segments > num_levels: - l2 = [np.random.randint(0,H) for i in range(num_segments-H)] - -# Mapping from segment to a level -g_mapping= np.array(l1 + l2) -# print('g mapping {}'.format(g_mapping)) -#Initialize lamda -lamda_arr = np.zeros((num_roles, num_roles,num_segments) , dtype=float) -for d in range(0, num_segments): - lamda_arr[:,:, d]= lambda_estimates_h[:,:,g_mapping[d]] - -change_points_arr = np.zeros((num_roles, num_roles, num_segments+1) , dtype=int) -df_all= None - -points= list(range(0, (num_segments+1)*NO_SAMPLES, NO_SAMPLES)) -list1 = [] - -# Generate piecewise non-homogeneous poisson process -for k in range(0, num_roles): - for g in range(k, num_roles): - comb = [] - if k == g: - comb = list(combinations(list_of_groups[k], 2)) - else: - key_data = [list_of_groups[k],list_of_groups[g],] - comb = list(itertools.product(*key_data)) - # print(comb) - if len(comb) != size_all_pairs[k,g]: - print('not equal..') - - change_points_arr[k,g,:] = points - lamda_arr[k,g,:] = lamda_arr[g,k,:] - - tot_count = np.zeros((num_segments) , dtype=float) - - for pair in comb: - - for d in range(0,num_segments): - - s = np.random.poisson(lamda_arr[k,g,d], NO_SAMPLES) - tot_count[d] += np.count_nonzero(s) - list1=[i for i, e in enumerate(s) if e != 0] - - if len(list1) == 0: - print('zero') - - list1 = [x+points[d] for x in list1] - - df = pd.DataFrame(data=list1) - df.columns =['timestamp'] - - - N= df.size - list_start_stations =[pair[0]] * N - list_end_stations =[pair[1]] * N - - df['source'] = list_start_stations - df['target'] = list_end_stations - - df_all=pd.concat([df_all, df], ignore_index=True) - - for d in range(0,num_segments): - lamda_arr_act[k,g,d] = tot_count[d]/(NO_SAMPLES*len(comb)) - # print(tot_count[d]) -## Other preparations - -# Remove self loops -df_all = df_all[((df_all['source'] ) != (df_all['target']))] -#sort -df_all=df_all.sort_values('timestamp') -df_all = df_all[['target', 'timestamp','source']] - -# Save as .csv file -# df_all.to_csv('./Data/synthetic_ground_truth_g1.csv') -df=df_all - - -def _swap (row): - if 
row['source'] > row['target']: - row['source'] , row['target'] =row['target'] , row['source'] - return row - -# Undirected graph -df=df.apply(lambda row: _swap(row), axis=1) -#scale timestamps for zeroth reference point -refValue = df['timestamp'].min() -df['timestamp'] -= refValue - - - -# To reproduce the results of the paper: -# set current_h = i choosing from H_list ; one at a time -# H_list = [1,2,3,4,5,6,7,8] -current_h= 1 - - -caching.clear_cache() - -# User parameters -num_roles=2 -num_segments=20 -num_levels=current_h -algo_ver=3 -dest_folder='./Results/synthetic/' - -# tuning parameters -theta = 1e-20 -eta = 1 -tuning_params= {'theta':theta,'eta':eta} - -exp_obj = experiment.Experiment(df,num_roles,num_segments,algo_ver,dest_folder,tuning_params,num_levels,refValue) -exp_obj.execute() - - -# Normlaized likelihood -# num_roles=1 -# num_segments=1 -# num_levels=1 -# exp_obj = experiment.Experiment(df,num_roles,num_segments,algo_ver,dest_folder,tuning_params,num_levels,refValue) -# exp_obj.execute() - - -# sample results -# h= 1 :-73402.83746 -# h= 2 :-68267.30975 -# h= 3 :-67919.07115 -# h= 4 :-67390.64768 -# h= 5 : -67390.39226 -# h= 6 :-67056.12304 -# h= 7 :-67054.4862 -# h= 8 :-67051.29741 - -# Likelihood when one segment, one group -# Normlaized likelihood : -74053.078339717 diff --git a/synthetic_experiment_time_vs_edges.py b/synthetic_experiment_time_vs_edges.py index 7594bf5..6b5420e 100755 --- a/synthetic_experiment_time_vs_edges.py +++ b/synthetic_experiment_time_vs_edges.py @@ -24,10 +24,10 @@ from streamlit import caching np.random.seed(113) # To reproduce the results of the paper: -# set NO_SAMPLES = 30 from list_samples = [50,100,150,200] ; +# set NO_SAMPLES = 50 from list_samples = [50,100,150,200] ; # one at a time -NO_SAMPLES= 150 +NO_SAMPLES= 50 num_roles=3 num_vertices=20 @@ -144,15 +144,13 @@ df['timestamp'] -= refValue - - # Experiments # User parameters num_roles=3 num_segments=5 num_levels=3# Optional arg -algo_ver=3 +algo_ver=2 dest_folder='./Results/synthetic/' # tuning parameters @@ -166,13 +164,8 @@ exp_obj.execute() print("--- %s seconds ---" % (time.time() - start_time)) print("no of edges: %d"%df.shape[0]) - - -# edges running_time -# 11404 1.406843901 -# 22600 2.683860064 -# 34008 4.017066002 -# 45186 4.687281132 + # running time can be dependent on the machine you run. -# However, it should have a linear trend w.r.t. edges. +# However, it should have a linear trend w.r.t. edges with algo_ver=3. +# However, it should have a quadratic trend w.r.t. edges with algo_ver=2. \ No newline at end of file -- GitLab
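As a closing usage note on the running-time experiments touched by this patch: the fraction sweep behind the timing tables in `bike_times_edges.py` and `real_dataset_time_vs_edges.py` boils down to the loop sketched below (a sketch assuming `df` has been preprocessed as in those scripts; absolute times depend on the machine, but algo_ver=3 should scale roughly linearly and algo_ver=2 roughly quadratically in the number of edges, as noted above):

```python
import time
import experiment

tuning_params = {'theta': 1e-20, 'eta': 1}

for _frac in (0.2, 0.4, 0.6, 0.8, 1.0):
    # Subsample the edge list, re-sort, and re-anchor timestamps at zero.
    df1 = df.sample(frac=_frac).reset_index(drop=True).sort_values('timestamp')
    ref1 = df1['timestamp'].min()
    df1['timestamp'] -= ref1

    start = time.time()
    # num_roles=3, num_segments=5, num_levels=3 as in the scripts;
    # set algo_ver to 2 for the naive-segmentation baseline instead of 3 (SMAWK).
    exp_obj = experiment.Experiment(df1, 3, 5, 3, './Results/bikes/',
                                    tuning_params, 3, ref1)
    exp_obj.execute()
    print('frac %.1f: %d edges, %.2f s' % (_frac, df1.shape[0], time.time() - start))
```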