From 821013fc02115f6db4d550b37deb98ee0505d03e Mon Sep 17 00:00:00 2001
From: chamalee <chamalee.wickramaarachch@helsinki.fi>
Date: Thu, 2 Feb 2023 13:05:42 +0200
Subject: [PATCH] level-dependent-membership variant added:

---
 README.md             |  14 +-
 experiment.py         |  25 +-
 ld-1-syn.py           | 299 ++++++++++++++++++++++++
 ld-2-syn.py           | 318 +++++++++++++++++++++++++
 ld-3-syn.py           | 298 ++++++++++++++++++++++++
 ld-4-syn.py           | 326 ++++++++++++++++++++++++++
 ld-5-syn.py           | 304 ++++++++++++++++++++++++
 ld-bikes_santander.py | 105 +++++++++
 ld-bitcoin.py         |  75 ++++++
 ld-eu_email_dep1.py   |  69 ++++++
 ld-eu_email_dep2.py   |  81 +++++++
 ld-mathoverflow.py    |  71 ++++++
 ld-mooc.py            |  75 ++++++
 optimize.py           |  83 +++++++
 sbm_core.py           | 530 +++++++++++++++++++++++++++++++++++++++++-
 15 files changed, 2663 insertions(+), 10 deletions(-)
 create mode 100644 ld-1-syn.py
 create mode 100644 ld-2-syn.py
 create mode 100644 ld-3-syn.py
 create mode 100644 ld-4-syn.py
 create mode 100644 ld-5-syn.py
 create mode 100755 ld-bikes_santander.py
 create mode 100755 ld-bitcoin.py
 create mode 100755 ld-eu_email_dep1.py
 create mode 100755 ld-eu_email_dep2.py
 create mode 100755 ld-mathoverflow.py
 create mode 100755 ld-mooc.py

diff --git a/README.md b/README.md
index b2ed5d9..c97bf06 100755
--- a/README.md
+++ b/README.md
@@ -20,7 +20,7 @@ num_levels=1
 algo_ver= 3
 ```
 
-The files `bikes.py` , `collegeMsg.py` , `bitcoin.py` , `eu_email_dep1.py`,`eu_email_dep2.py`, `mathoverflow.py`, and `mooc.py`
+The files `bikes_santander.py` , `collegeMsg.py` , `bitcoin.py` , `eu_email_dep1.py`,`eu_email_dep2.py`, `mathoverflow.py`, and `mooc.py`
 provide examples on real world dynamic networks.
 
 This zip folder consists experiments for both for Synthetic and real-world datasets.
@@ -62,6 +62,18 @@ To switch the algorithm you choose, change `algo_ver` parameter to either 2 or 3
 
 The file `synthetic_experiment_time_vs_edges.py` returns the running time and edges for given `NO_SAMPLES`.
 To reproduce the results of the paper, eg. set `NO_SAMPLES` = 50 from the list_samples = [50,100,150,200] ; one at a time.
 
+## Level-dependent membership extension
+
+Setting `algo_ver` to `4` runs the new level-dependent (K,H)-segmentation variant of the algorithm.
+
+The files `ld-1-syn.py` through `ld-5-syn.py` run the level-dependent algorithms on simulated data;
+each file also contains the code that generates its synthetic dataset.
+
+The files `ld-bikes_santander.py`, `ld-bitcoin.py`, `ld-eu_email_dep1.py`, `ld-eu_email_dep2.py`, `ld-mathoverflow.py`, and `ld-mooc.py`
+provide examples on real-world dynamic networks for the level-dependent variant.
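For orientation, here is a minimal sketch of how the level-dependent variant is invoked, following the `experiment.Experiment(...)` / `execute()` pattern used throughout the new `ld-*` scripts in this patch. The input path and the parameter values below are illustrative only; the individual scripts set their own values and do additional preprocessing.

```python
# Minimal sketch (illustrative values) of running the level-dependent
# (K,H)-segmentation variant on a temporal edge list with columns
# 'source', 'timestamp', 'target'.
import pandas as pd
import experiment

df = pd.read_csv('./Data/synthetic_ground_truth_g1.csv')  # hypothetical input file

# zero the timestamps against the earliest event, as the scripts do
refValue = df['timestamp'].min()
df['timestamp'] -= refValue

num_roles = 2        # K: number of groups
num_segments = 4     # number of segments
num_levels = 2       # H: number of levels (H <= number of segments)
algo_ver = 4         # 4 selects the level-dependent (K,H)-segmentation variant
dest_folder = './Results/synthetic/'
tuning_params = {'theta': 1e-7, 'eta': 1}

exp_obj = experiment.Experiment(df, num_roles, num_segments, algo_ver,
                                dest_folder, tuning_params, num_levels, refValue)
likelihood = exp_obj.execute()
print(likelihood)
```

The real-data scripts apply the same call after making the edge list undirected (the `_swap` helper) and dropping self-loops.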
+ + + ## References SMAWK code: https://www.ics.uci.edu/~eppstein/PADS/SMAWK.py diff --git a/experiment.py b/experiment.py index 2dd87b4..3c7c74c 100755 --- a/experiment.py +++ b/experiment.py @@ -29,7 +29,8 @@ class Experiment: # Time stamps t_df = self.df["timestamp"] t_df = sorted(t_df) - + print('TIME') + print(len(t_df)) # initilaize group assignment randomly group_assignment= np.random.randint(num_roles, size=(num_vertices)) # node-group dictionary @@ -41,6 +42,8 @@ class Experiment: # create a new dictionary - key: node-pair , value: list of timestamps dic=self.df.groupby(['source','target'])['timestamp'].apply(list).to_dict() + print(len(dic.values())) + # initialize change points change_points_arr = np.zeros((num_roles, num_roles, num_segments+1) , dtype=int) @@ -61,15 +64,27 @@ class Experiment: elif self.algo_ver == 3: opt = optimize.Optimize( group_dic,lambda_estimates,change_points_arr,nodes,num_roles,num_segments,dic,self.num_levels,self.tuning_params) [group_dic,lambda_estimates,change_points_arr,likelihood] = opt.k_h_seg_var_2() - + + ### Level-dependent (K,H)-segmentation variant-2 ### + elif self.algo_ver == 4: + opt = optimize.Optimize( group_dic,lambda_estimates,change_points_arr,nodes,num_roles,num_segments,dic,self.num_levels,self.tuning_params) + [group_dic,lambda_estimates,change_points_arr,likelihood,g_mapping] = opt.mm_k_h_seg_var_2() + # print('g_mapping_discoverd {}'.format(g_mapping)) + # for e_h in range(0,self.num_levels): + # g_a = group_dic[e_h] + # list_of_groups= [[] for _ in range(num_roles)] + # for idx, val in g_a.items(): + # list_of_groups[val].append(idx) + # print('group assignments {}: {}'.format(e_h,list_of_groups)) # Plotting # dest_folder= self.dest + str(self.algo_ver)+'/' # utils.generate_plots(G,group_dic,lambda_estimates,num_roles,num_segments,dest_folder,nodes,change_points_arr,t_df,self.refValue) - list_of_groups= [[] for _ in range(num_roles)] - for idx, val in group_dic.items(): - list_of_groups[val].append(idx) + # list_of_groups= [[] for _ in range(num_roles)] + # for idx, val in group_dic.items(): + # list_of_groups[val].append(idx) # print('group assignments: {}'.format(list_of_groups)) # print('lambdas: {}'.format(lambda_estimates)) + # return [likelihood,group_dic] return likelihood \ No newline at end of file diff --git a/ld-1-syn.py b/ld-1-syn.py new file mode 100644 index 0000000..c1e4f11 --- /dev/null +++ b/ld-1-syn.py @@ -0,0 +1,299 @@ + +""" +Generate piecewise non-homogeneous poisson point process (NHPPP) +To check the ground truth +Dataset-1 +""" +import numpy as np +import pandas as pd +import utils +import sbm_core +import math +from itertools import combinations +import itertools + +# Initilaize +np.random.seed(155) + +num_roles=2 +num_vertices=40 +num_segments = 4 +num_levels = 2 + +NO_SAMPLES= 200 +nodes = np.arange(num_vertices) +lamda_arr_act = np.zeros((num_roles, num_roles,num_levels) , dtype=float) + + +H =num_levels +print('k-h levels %d'%(num_levels)) + +# h-level lambda estimates +lambda_estimates_h = np.random.rand(num_roles, num_roles, H) +lambda_estimates_h = 1e-2*np.random.randint(11,99, size=(num_roles, num_roles, H)) +# Make high variant lambdas +lambda_estimates_h[0,0,:] = [0.7, 0.1] +lambda_estimates_h[0,1,:] = [0.1, 0.9] +lambda_estimates_h[1,0,:] = lambda_estimates_h[0,1,:] +lambda_estimates_h[1,1,:] = [0.8, 0.1] + +l1 =list(range(0, H)) +l2 = [] +if num_segments > num_levels: + l2 = [np.random.randint(0,H) for i in range(num_segments-H)] + +# Mapping from segment to a level +g_mapping= np.array(l1 + 
l2) +print('g mapping {}'.format(g_mapping)) + +# initilaize group assignment randomly +group_assignment_arr= np.random.randint(num_roles, size=(num_levels,num_vertices)) + # node-group dictionary +group_dic = {} +# keys = (list(range(self.num_segments)),nodes) +# values = list(group_assignment) +# group_dic = dict(zip(keys,values)) +for i in range(0,num_levels ): + level = i + + group_dic_level = {} + keys = nodes + values = list(group_assignment_arr[level]) + group_dic_level = dict(zip(keys,values)) + group_dic[i] = group_dic_level +print('initial') +# print(group_dic) + +for e_h in range(0,num_segments): + g_a = group_dic[g_mapping[e_h]] + list_of_groups= [[] for _ in range(num_roles)] + for idx, val in g_a.items(): + list_of_groups[val].append(idx) + print('group assignments {}: {}'.format(e_h,list_of_groups)) + # Plotting + +#Initialize lamda +lamda_arr = np.zeros((num_roles, num_roles,num_segments) , dtype=float) +for d in range(0, num_segments): + for k in range(0, num_roles): + for g in range(k, num_roles): + lamda_arr[k,g, d]= lambda_estimates_h[k,g,g_mapping[d]] + lamda_arr[g,k, d]= lamda_arr[k,g, d] +change_points_arr = np.zeros((num_roles, num_roles, num_segments+1) , dtype=int) +df_all= None + +points= list(range(0, (num_segments+1)*NO_SAMPLES, NO_SAMPLES)) +list1 = [] + +level_seg_mapping = {} +for d in range(num_segments): + level = g_mapping[d] + if level in level_seg_mapping: + level_seg_mapping[level].append(d) + else: + level_seg_mapping[level] = [] + level_seg_mapping[level].append(d) +# %% +# Generate piecewise non-homogeneous poisson process + + +tot_count = np.zeros((num_levels) , dtype=float) +com_len = np.zeros((num_levels) , dtype=float) +# for pair in comb: + +for i in range(0,num_levels): + # i = g_mapping[d] + group_assignment = group_assignment_arr[i] + + print(group_assignment) + + list_of_groups= [[] for _ in range(num_roles)] + + for idx, val in enumerate(group_assignment): + list_of_groups[val].append(nodes[idx]) + + # print(list_of_groups) + + size_all_pairs = {} + + for kk in range(0, num_roles): + for gg in range(kk, num_roles): + U=list_of_groups[kk] + W=list_of_groups[gg] + + if kk == gg: + size_all_pairs[kk,gg] = math.comb(len(U), 2) + if kk != gg: + size_all_pairs[kk,gg] = len(U)*len(W) + + for k in range(0, num_roles): + for g in range(k, num_roles): + + + change_points_arr[k,g,:] = points + lamda_arr[k,g,:] = lamda_arr[g,k,:] + + + + comb = [] + if k == g: + comb = list(combinations(list_of_groups[k], 2)) + # print(type(comb)) + else: + # comb = [] + key_data = [list_of_groups[k],list_of_groups[g],] + comb = list(itertools.product(*key_data)) + # print(comb) + if len(comb) != size_all_pairs[k,g]: + print('not equal..') + + + print('d val {}'.format( d)) + com_len[i] = len(comb) + # print('comb len {}'.format( com_len[d])) + tot_count[i] = 0 + + for pair in comb: + s = np.random.poisson(lamda_arr[k,g,d], NO_SAMPLES) + # print(np.count_nonzero(s)) + tot_count[i] += np.count_nonzero(s) + + list_org=[i for i, e in enumerate(s) if e != 0] + + if len(list_org) == 0: + print('zero') + + for d in level_seg_mapping[i]: + + + list1 = [x+points[d] for x in list_org] + + df = pd.DataFrame(data=list1) + df.columns =['timestamp'] + + # print(list1) + # if max(list1) > 799: + # print('{} {}'.format(d, max(list1))) + N= df.size + + # print(pair) + # print(pair[0]) + + list_start_stations =[pair[0]] * N + list_end_stations =[pair[1]] * N + + df['source'] = list_start_stations + df['target'] = list_end_stations + + df_all=pd.concat([df_all, df], ignore_index=True) 
+ + # for dd in level_seg_mapping: + # dd = d + lamda_arr_act[k,g,i] = round(((tot_count[i])/(NO_SAMPLES*com_len[i])),1) + lamda_arr_act[g,k,i] = lamda_arr_act[k,g,i] + # print('tot count') + # print(tot_count[dd]) + # print(' {} {} {} {} : k g d :lamb'.format(k,g,d,lamda_arr_act[g,k,dd])) + print(' {} {} {} {} : k g d :lamb'.format(k,g,i,lamda_arr_act[g,k,i])) + +# Remove self loops +df_all = df_all[((df_all['source'] ) != (df_all['target']))] +#sort +df_all=df_all.sort_values('timestamp') +df_all = df_all[['target', 'timestamp','source']] + +# Save as .csv file +# df_all.to_csv('./Data/synthetic_ground_truth_g1.csv') + + + +df=df_all +dest_folder='./Results/synthetic/3' +t_df = df['timestamp'] + +nodes_arr = np.union1d(df['target'],df['source']).astype(int) +# list of nodes +nodes = nodes_arr.tolist() +num_vertices = len(nodes) + + +# create a new dictionary - key: node-pair , value: list of timestamps +dic=df.groupby(['source','target'])['timestamp'].apply(list).to_dict() +print('{} {} {} '.format(group_dic, lamda_arr_act,change_points_arr)) + + +# liklihood_sum = sbm_core.mm_compute_cost(group_dic,lamda_arr_act,change_points_arr,num_roles,num_segments,dic,g_mapping) +# print(' Initial Actual likelihood .......%f'%liklihood_sum) + +def _swap (row): + if row['source'] > row['target']: + row['source'] , row['target'] =row['target'] , row['source'] + return row + +# Undirected graph +df=df.apply(lambda row: _swap(row), axis=1) +#scale timestamps for zeroth reference point +refValue = df['timestamp'].min() +df['timestamp'] -= refValue + +# Experiment +import experiment + +# User parameters +# num_roles=2 +# num_segments=10 +# num_levels=5# Optional arg +algo_ver=4 +dest_folder='./Results/synthetic/' + +# tuning parameters +theta = 1e-7 +eta = 1 +tuning_params= {'theta':theta,'eta':eta} + +import time +start_time = time.time() + +exp_obj = experiment.Experiment(df,num_roles,num_segments,algo_ver,dest_folder,tuning_params,num_levels,refValue) +# [likelihood_f,group_dic_f] = exp_obj.execute() +likelihood_f= exp_obj.execute() +print("--- %s seconds ---" % (time.time() - start_time)) + +liklihood_sum = sbm_core.mm_compute_cost(group_dic,lamda_arr_act,change_points_arr,num_roles,num_segments,dic,g_mapping) +print(' Initial Actual likelihood .......%f'%liklihood_sum) + +print('g mapping {}'.format(g_mapping)) + +for e_h in range(0,num_segments): + g_a = group_dic[g_mapping[e_h]] + list_of_groups= [[] for _ in range(num_roles)] + for idx, val in g_a.items(): + list_of_groups[val].append(idx) + print('group assignments {}: {}'.format(e_h,list_of_groups)) + + +# adjusted_rand_score([0,4,5],[0,5,4]) +# adjusted_mutual_info_score([0,4,5],[0,5,4]) + +# likelihood for single group and single segment # Normlaized likelihood +# num_roles=1 +# num_segments=1 +# num_levels=1 +# exp_obj = experiment.Experiment(df,num_roles,num_segments,algo_ver,dest_folder,tuning_params,num_levels,refValue) +# exp_obj.execute() + + + + + +# group assignments 0: [[2, 3, 4, 6, 8, 9, 10, 12, 13, 19, 21, 22, 25, 26, 27, 29, 30, 31, 32, 33, 35, 36, 37, 39], [0, 1, 5, 7, 11, 14, 15, 16, 17, 18, 20, 23, 24, 28, 34, 38]] +# group assignments 1: [[0, 1, 3, 6, 7, 8, 9, 15, 18, 19, 20, 21, 23, 24, 27, 28, 30, 31, 33, 34, 35, 37, 38], [2, 4, 5, 10, 11, 12, 13, 14, 16, 17, 22, 25, 26, 29, 32, 36, 39]] +# --- 14.958946704864502 seconds --- + + + # Plotting +# group assignments 0: [[0, 1, 5, 7, 11, 14, 15, 16, 17, 18, 20, 23, 24, 28, 34, 38], [2, 3, 4, 6, 8, 9, 10, 12, 13, 19, 21, 22, 25, 26, 27, 29, 30, 31, 32, 33, 35, 36, 37, 39]] +# 
group assignments 1: [[2, 4, 5, 10, 11, 12, 13, 14, 16, 17, 22, 25, 26, 29, 32, 36, 39], [0, 1, 3, 6, 7, 8, 9, 15, 18, 19, 20, 21, 23, 24, 27, 28, 30, 31, 33, 34, 35, 37, 38]] +# group assignments 2: [[0, 1, 5, 7, 11, 14, 15, 16, 17, 18, 20, 23, 24, 28, 34, 38], [2, 3, 4, 6, 8, 9, 10, 12, 13, 19, 21, 22, 25, 26, 27, 29, 30, 31, 32, 33, 35, 36, 37, 39]] +# group assignments 3: [[2, 4, 5, 10, 11, 12, 13, 14, 16, 17, 22, 25, 26, 29, 32, 36, 39], [0, 1, 3, 6, 7, 8, 9, 15, 18, 19, 20, 21, 23, 24, 27, 28, 30, 31, 33, 34, 35, 37, 38]] + diff --git a/ld-2-syn.py b/ld-2-syn.py new file mode 100644 index 0000000..7e130d8 --- /dev/null +++ b/ld-2-syn.py @@ -0,0 +1,318 @@ + + + + + +""" +Generate piecewise non-homogeneous poisson point process (NHPPP) +To check the ground truth +Dataset-2 +""" +import numpy as np +import pandas as pd +import utils +import sbm_core +import math +from itertools import combinations +import itertools +# from sklearn.metrics.cluster import adjusted_rand_score +# from sklearn.metrics.cluster import adjusted_mutual_info_score +# Initilaize +np.random.seed(14232321) + +num_roles=2 +num_vertices=20 +num_segments = 3 +num_levels = 2 + +# NO_SAMPLES= 700 +NO_SAMPLES= 250 + +nodes = np.arange(num_vertices) +lamda_arr_act = np.zeros((num_roles, num_roles,num_levels) , dtype=float) +H =num_levels +print('k-h levels %d'%(num_levels)) + +# h-level lambda estimates +lambda_estimates_h = np.random.rand(num_roles, num_roles, H) +lambda_estimates_h = 1e-2*np.random.randint(11,99, size=(num_roles, num_roles, H)) +# Make high variant lambdas +lambda_estimates_h[0,0,:] = [0.9, 0.1] +lambda_estimates_h[0,1,:] = [0.1, 0.9] +lambda_estimates_h[1,0,:] = lambda_estimates_h[0,1,:] +lambda_estimates_h[1,1,:] = [0.9, 0.1] + +l1 =list(range(0, H)) +l2 = [] +if num_segments > num_levels: + l2 = [np.random.randint(0,H) for i in range(num_segments-H)] + +# Mapping from segment to a level +g_mapping= np.array(l1 + l2) +print('g mapping {}'.format(g_mapping)) + + + +# initilaize group assignment randomly +group_assignment_arr= np.random.randint(num_roles, size=(num_levels,num_vertices)) + # node-group dictionary +group_dic = {} +# keys = (list(range(self.num_segments)),nodes) +# values = list(group_assignment) +# group_dic = dict(zip(keys,values)) +for i in range(0,num_levels ): + level = i + + group_dic_level = {} + keys = nodes + values = list(group_assignment_arr[level]) + group_dic_level = dict(zip(keys,values)) + group_dic[i] = group_dic_level +print('initial') +# print(group_dic) + +for e_h in range(0,num_segments): + g_a = group_dic[g_mapping[e_h]] + list_of_groups= [[] for _ in range(num_roles)] + for idx, val in g_a.items(): + list_of_groups[val].append(idx) + print('group assignments {}: {}'.format(e_h,list_of_groups)) + # Plotting + +#Initialize lamda +lamda_arr = np.zeros((num_roles, num_roles,num_segments) , dtype=float) +for d in range(0, num_segments): + for k in range(0, num_roles): + for g in range(k, num_roles): + lamda_arr[k,g, d]= lambda_estimates_h[k,g,g_mapping[d]] + lamda_arr[g,k, d]= lamda_arr[k,g, d] +change_points_arr = np.zeros((num_roles, num_roles, num_segments+1) , dtype=int) +df_all= None + +points= list(range(0, (num_segments+1)*NO_SAMPLES, NO_SAMPLES)) +list1 = [] + +level_seg_mapping = {} +for d in range(num_segments): + level = g_mapping[d] + if level in level_seg_mapping: + level_seg_mapping[level].append(d) + else: + level_seg_mapping[level] = [] + level_seg_mapping[level].append(d) +# %% +# Generate piecewise non-homogeneous poisson process + + +tot_count = 
np.zeros((num_levels) , dtype=float) +com_len = np.zeros((num_levels) , dtype=float) +# for pair in comb: + +for i in range(0,num_levels): + # i = g_mapping[d] + group_assignment = group_assignment_arr[i] + + print(group_assignment) + + list_of_groups= [[] for _ in range(num_roles)] + + for idx, val in enumerate(group_assignment): + list_of_groups[val].append(nodes[idx]) + + # print(list_of_groups) + + size_all_pairs = {} + + for kk in range(0, num_roles): + for gg in range(kk, num_roles): + U=list_of_groups[kk] + W=list_of_groups[gg] + + if kk == gg: + size_all_pairs[kk,gg] = math.comb(len(U), 2) + if kk != gg: + size_all_pairs[kk,gg] = len(U)*len(W) + + for k in range(0, num_roles): + for g in range(k, num_roles): + + + change_points_arr[k,g,:] = points + lamda_arr[k,g,:] = lamda_arr[g,k,:] + + + + comb = [] + if k == g: + comb = list(combinations(list_of_groups[k], 2)) + # print(type(comb)) + else: + # comb = [] + key_data = [list_of_groups[k],list_of_groups[g],] + comb = list(itertools.product(*key_data)) + # print(comb) + if len(comb) != size_all_pairs[k,g]: + print('not equal..') + + + print('d val {}'.format( d)) + com_len[i] = len(comb) + # print('comb len {}'.format( com_len[d])) + tot_count[i] = 0 + + for pair in comb: + s = np.random.poisson(lamda_arr[k,g,d], NO_SAMPLES) + # print(np.count_nonzero(s)) + tot_count[i] += np.count_nonzero(s) + + list_org=[i for i, e in enumerate(s) if e != 0] + + if len(list_org) == 0: + print('zero') + + for d in level_seg_mapping[i]: + + + list1 = [x+points[d] for x in list_org] + + df = pd.DataFrame(data=list1) + df.columns =['timestamp'] + + N= df.size + + # print(pair) + # print(pair[0]) + + list_start_stations =[pair[0]] * N + list_end_stations =[pair[1]] * N + + df['source'] = list_start_stations + df['target'] = list_end_stations + + df_all=pd.concat([df_all, df], ignore_index=True) + + # for dd in level_seg_mapping: + # dd = d + lamda_arr_act[k,g,i] = round(((tot_count[i])/(NO_SAMPLES*com_len[i])),1) + lamda_arr_act[g,k,i] = lamda_arr_act[k,g,i] + # print('tot count') + # print(tot_count[dd]) + # print(' {} {} {} {} : k g d :lamb'.format(k,g,d,lamda_arr_act[g,k,dd])) + print(' {} {} {} {} : k g d :lamb'.format(k,g,i,lamda_arr_act[g,k,i])) +## Other preparations +# g_mapping +# Out[167]: array([0, 1, 0, 1]) +# d val 0 +# 0 0 0 0.5 : k g d :lamb +# d val 2 +# 0 0 2 0.5 : k g d :lamb +# d val 0 +# 0 1 0 0.1 : k g d :lamb +# d val 2 +# 0 1 2 0.1 : k g d :lamb +# d val 0 +# 1 1 0 0.5 : k g d :lamb +# d val 2 +# 1 1 2 0.6 : k g d :lamb +# Remove self loops +df_all = df_all[((df_all['source'] ) != (df_all['target']))] +#sort +df_all=df_all.sort_values('timestamp') +df_all = df_all[['target', 'timestamp','source']] + +# Save as .csv file +# df_all.to_csv('./Data/synthetic_ground_truth_g1.csv') + + + +df=df_all +dest_folder='./Results/synthetic/3' +t_df = df['timestamp'] + +nodes_arr = np.union1d(df['target'],df['source']).astype(int) +# list of nodes +nodes = nodes_arr.tolist() +num_vertices = len(nodes) + +# node-group dictionary +# group_dic = {} +# keys = nodes +# values = list(group_assignment) +# group_dic = dict(zip(keys,values)) + +# Extract graph +# G = utils.getGraph(nodes_arr,df.values) +# utils.generate_plots(G,group_dic,lamda_arr_act,num_roles,num_segments,dest_folder,nodes,change_points_arr,t_df,0) + + +# create a new dictionary - key: node-pair , value: list of timestamps +dic=df.groupby(['source','target'])['timestamp'].apply(list).to_dict() +print('{} {} {} '.format(group_dic, lamda_arr_act,change_points_arr)) + + +# liklihood_sum 
= sbm_core.mm_compute_cost(group_dic,lamda_arr_act,change_points_arr,num_roles,num_segments,dic,g_mapping) +# print(' Initial Actual likelihood .......%f'%liklihood_sum) + +def _swap (row): + if row['source'] > row['target']: + row['source'] , row['target'] =row['target'] , row['source'] + return row + +# Undirected graph +df=df.apply(lambda row: _swap(row), axis=1) +#scale timestamps for zeroth reference point +refValue = df['timestamp'].min() +df['timestamp'] -= refValue + +# Experiment +import experiment + +# User parameters +# num_roles=2 +# num_segments=10 +# num_levels=5# Optional arg +algo_ver=3 +dest_folder='./Results/synthetic/' + +# tuning parameters +theta = 1e-7 +eta = 1 +tuning_params= {'theta':theta,'eta':eta} + +import time +start_time = time.time() + +# exp_obj = experiment.Experiment(df,num_roles,num_segments,algo_ver,dest_folder,tuning_params,num_levels,refValue) +# [likelihood_f,group_dic_f] = exp_obj.execute() +# likelihood_f= exp_obj.execute() +print("--- %s seconds ---" % (time.time() - start_time)) + +liklihood_sum = sbm_core.mm_compute_cost(group_dic,lamda_arr_act,change_points_arr,num_roles,num_segments,dic,g_mapping) +print(' Initial Actual likelihood .......%f'%liklihood_sum) + +print('g mapping {}'.format(g_mapping)) + +for e_h in range(0,num_segments): + g_a = group_dic[g_mapping[e_h]] + list_of_groups= [[] for _ in range(num_roles)] + for idx, val in g_a.items(): + list_of_groups[val].append(idx) + print('group assignments {}: {}'.format(e_h,list_of_groups)) + +# adjusted_rand_score([0,4,5],[0,5,4]) +# adjusted_mutual_info_score([0,4,5],[0,5,4]) + +# likelihood for single group and single segment # Normlaized likelihood +# num_roles=1 +# num_segments=1 +# num_levels=1 +# exp_obj = experiment.Experiment(df,num_roles,num_segments,algo_ver,dest_folder,tuning_params,num_levels,refValue) +# exp_obj.execute() + +num_segments=10 +num_levels=5 +algo_ver= 4 + + +exp_obj = experiment.Experiment(df,num_roles,num_segments,algo_ver,dest_folder,tuning_params,num_levels,refValue) +exp_obj.execute() \ No newline at end of file diff --git a/ld-3-syn.py b/ld-3-syn.py new file mode 100644 index 0000000..b638cd4 --- /dev/null +++ b/ld-3-syn.py @@ -0,0 +1,298 @@ + + + + + +""" +Generate piecewise non-homogeneous poisson point process (NHPPP) +To check the ground truth +Dataset-3 +""" +import numpy as np +import pandas as pd +import utils +import sbm_core +import math +from itertools import combinations +import itertools +# from sklearn.metrics.cluster import adjusted_rand_score +# from sklearn.metrics.cluster import adjusted_mutual_info_score +# Initilaize +np.random.seed(1494232321) + +num_roles=2 +num_vertices=20 +num_segments = 5 +num_levels = 3 + +NO_SAMPLES= 450 + +nodes = np.arange(num_vertices) + +lamda_arr_act = np.zeros((num_roles, num_roles,num_levels) , dtype=float) + + +H =num_levels +print('k-h levels %d'%(num_levels)) + +# h-level lambda estimates +lambda_estimates_h = np.random.rand(num_roles, num_roles, H) +lambda_estimates_h = 1e-2*np.random.randint(11,99, size=(num_roles, num_roles, H)) + +# Make high variant lambdas +lambda_estimates_h[0,0,:] = [0.3, 0.9,0.6] +lambda_estimates_h[0,1,:] = [0.1, 0.5, .9] +lambda_estimates_h[1,0,:] = lambda_estimates_h[0,1,:] +lambda_estimates_h[1,1,:] = [0.1, 0.9,.5] + +# print(lambda_estimates_h) + +l1 =list(range(0, H)) +l2 = [] +if num_segments > num_levels: + l2 = [np.random.randint(0,H) for i in range(num_segments-H)] + +# Mapping from segment to a level +g_mapping= np.array(l1 + l2) +print('g mapping {}'.format(g_mapping)) + 
+ + +# initilaize group assignment randomly +group_assignment_arr= np.random.randint(num_roles, size=(num_levels,num_vertices)) + # node-group dictionary +group_dic = {} + +for i in range(0,num_levels ): + level = i + + group_dic_level = {} + keys = nodes + values = list(group_assignment_arr[level]) + group_dic_level = dict(zip(keys,values)) + group_dic[i] = group_dic_level +print('initial') +# print(group_dic) + +for e_h in range(0,num_segments): + g_a = group_dic[g_mapping[e_h]] + list_of_groups= [[] for _ in range(num_roles)] + for idx, val in g_a.items(): + list_of_groups[val].append(idx) + print('group assignments {}: {}'.format(e_h,list_of_groups)) + # Plotting + +#Initialize lamda +lamda_arr = np.zeros((num_roles, num_roles,num_segments) , dtype=float) +for d in range(0, num_segments): + for k in range(0, num_roles): + for g in range(k, num_roles): + lamda_arr[k,g, d]= lambda_estimates_h[k,g,g_mapping[d]] + lamda_arr[g,k, d]= lamda_arr[k,g, d] +change_points_arr = np.zeros((num_roles, num_roles, num_segments+1) , dtype=int) +df_all= None + +points= list(range(0, (num_segments+1)*NO_SAMPLES, NO_SAMPLES)) +list1 = [] + +level_seg_mapping = {} +for d in range(num_segments): + level = g_mapping[d] + if level in level_seg_mapping: + level_seg_mapping[level].append(d) + else: + level_seg_mapping[level] = [] + level_seg_mapping[level].append(d) +# %% +# Generate piecewise non-homogeneous poisson process + + +tot_count = np.zeros((num_levels) , dtype=float) +com_len = np.zeros((num_levels) , dtype=float) +# for pair in comb: + +for i in range(0,num_levels): + # i = g_mapping[d] + group_assignment = group_assignment_arr[i] + + print(group_assignment) + + list_of_groups= [[] for _ in range(num_roles)] + + for idx, val in enumerate(group_assignment): + list_of_groups[val].append(nodes[idx]) + + # print(list_of_groups) + + size_all_pairs = {} + + for kk in range(0, num_roles): + for gg in range(kk, num_roles): + U=list_of_groups[kk] + W=list_of_groups[gg] + + if kk == gg: + size_all_pairs[kk,gg] = math.comb(len(U), 2) + if kk != gg: + size_all_pairs[kk,gg] = len(U)*len(W) + + for k in range(0, num_roles): + for g in range(k, num_roles): + + + change_points_arr[k,g,:] = points + lamda_arr[k,g,:] = lamda_arr[g,k,:] + + + + comb = [] + if k == g: + comb = list(combinations(list_of_groups[k], 2)) + # print(type(comb)) + else: + # comb = [] + key_data = [list_of_groups[k],list_of_groups[g],] + comb = list(itertools.product(*key_data)) + # print(comb) + if len(comb) != size_all_pairs[k,g]: + print('not equal..') + + + print('d val {}'.format( d)) + com_len[i] = len(comb) + # print('comb len {}'.format( com_len[d])) + tot_count[i] = 0 + + for pair in comb: + s = np.random.poisson(lamda_arr[k,g,d], NO_SAMPLES) + # print(np.count_nonzero(s)) + tot_count[i] += np.count_nonzero(s) + + list_org=[i for i, e in enumerate(s) if e != 0] + + if len(list_org) == 0: + print('zero') + + for d in level_seg_mapping[i]: + + list1 = [x+points[d] for x in list_org] + + df = pd.DataFrame(data=list1) + df.columns =['timestamp'] + + N= df.size + + # print(pair) + # print(pair[0]) + + list_start_stations =[pair[0]] * N + list_end_stations =[pair[1]] * N + + df['source'] = list_start_stations + df['target'] = list_end_stations + + df_all=pd.concat([df_all, df], ignore_index=True) + + # for dd in level_seg_mapping: + # dd = d + lamda_arr_act[k,g,i] = round(((tot_count[i])/(NO_SAMPLES*com_len[i])),1) + lamda_arr_act[g,k,i] = lamda_arr_act[k,g,i] + # print('tot count') + # print(tot_count[dd]) + # print(' {} {} {} {} : k g d 
:lamb'.format(k,g,d,lamda_arr_act[g,k,dd])) + print(' {} {} {} {} : k g d :lamb'.format(k,g,i,lamda_arr_act[g,k,i])) + +# Remove self loops +df_all = df_all[((df_all['source'] ) != (df_all['target']))] +#sort +df_all=df_all.sort_values('timestamp') +df_all = df_all[['target', 'timestamp','source']] + +# Save as .csv file +# df_all.to_csv('./Data/synthetic_ground_truth_g1.csv') + + + +df=df_all +dest_folder='./Results/synthetic/3' +t_df = df['timestamp'] + +nodes_arr = np.union1d(df['target'],df['source']).astype(int) +# list of nodes +nodes = nodes_arr.tolist() +num_vertices = len(nodes) + +# create a new dictionary - key: node-pair , value: list of timestamps +dic=df.groupby(['source','target'])['timestamp'].apply(list).to_dict() +print('{} {} {} '.format(group_dic, lamda_arr_act,change_points_arr)) + + +# liklihood_sum = sbm_core.mm_compute_cost(group_dic,lamda_arr_act,change_points_arr,num_roles,num_segments,dic,g_mapping) +# print(' Initial Actual likelihood .......%f'%liklihood_sum) + +def _swap (row): + if row['source'] > row['target']: + row['source'] , row['target'] =row['target'] , row['source'] + return row + +# Undirected graph +df=df.apply(lambda row: _swap(row), axis=1) +#scale timestamps for zeroth reference point +refValue = df['timestamp'].min() +df['timestamp'] -= refValue + +# Experiment +import experiment + +# User parameters +# num_roles=2 +# num_segments=10 +# num_levels=5# Optional arg +algo_ver=3 +dest_folder='./Results/synthetic/' + +# tuning parameters +theta = 1e-5 +eta = 1 +tuning_params= {'theta':theta,'eta':eta} + +import time +start_time = time.time() + +exp_obj = experiment.Experiment(df,num_roles,num_segments,algo_ver,dest_folder,tuning_params,num_levels,refValue) +# [likelihood_f,group_dic_f] = exp_obj.execute() +likelihood_f= exp_obj.execute() +print("--- %s seconds ---" % (time.time() - start_time)) + +liklihood_sum = sbm_core.mm_compute_cost(group_dic,lamda_arr_act,change_points_arr,num_roles,num_segments,dic,g_mapping) +print(' Initial Actual likelihood .......%f'%liklihood_sum) + +print('g mapping {}'.format(g_mapping)) + +for e_h in range(0,num_segments): + g_a = group_dic[g_mapping[e_h]] + list_of_groups= [[] for _ in range(num_roles)] + for idx, val in g_a.items(): + list_of_groups[val].append(idx) + print('group assignments {}: {}'.format(e_h,list_of_groups)) + +# likelihood for single group and single segment # Normlaized likelihood +num_roles=1 +num_segments=1 +num_levels=1 +exp_obj = experiment.Experiment(df,num_roles,num_segments,algo_ver,dest_folder,tuning_params,num_levels,refValue) +exp_obj.execute() + + +# group assignments 0: [[1, 4, 5, 6, 8, 9, 10, 11, 12, 14, 19], [0, 2, 3, 7, 13, 15, 16, 17, 18]] +# group assignments 1: [[0, 5, 6, 8, 9, 12, 14, 15, 16, 17], [1, 2, 3, 4, 7, 10, 11, 13, 18, 19]] +# group assignments 2: [[0, 1, 2, 4, 7, 11, 12, 13, 16], [3, 5, 6, 8, 9, 10, 14, 15, 17, 18, 19]] +# --- 18.024484872817993 seconds --- +# -317502.0167726356 +# Initial Actual likelihood .......-317502.016773 +# g mapping [0 1 2 1 1] +# group assignments 0: [[0, 2, 3, 7, 13, 15, 16, 17, 18], [1, 4, 5, 6, 8, 9, 10, 11, 12, 14, 19]] +# group assignments 1: [[3, 5, 6, 8, 9, 10, 14, 15, 17, 18, 19], [0, 1, 2, 4, 7, 11, 12, 13, 16]] +# group assignments 2: [[0, 5, 6, 8, 9, 12, 14, 15, 16, 17], [1, 2, 3, 4, 7, 10, 11, 13, 18, 19]] +# group assignments 3: [[3, 5, 6, 8, 9, 10, 14, 15, 17, 18, 19], [0, 1, 2, 4, 7, 11, 12, 13, 16]] +# group assignments 4: [[3, 5, 6, 8, 9, 10, 14, 15, 17, 18, 19], [0, 1, 2, 4, 7, 11, 12, 13, 16]] \ No newline at end of file 
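The commented-out blocks at the end of these synthetic scripts compare the planted group assignments against the recovered ones by inspection; `ld-4-syn.py` additionally scores the agreement with scikit-learn's adjusted Rand index. A small sketch of that scoring step is shown below, reusing the illustrative label vectors from `ld-4-syn.py`; in practice the recovered labels would come from the per-level `group_dic` returned by the experiment.

```python
# Sketch: compare a planted membership vector with a recovered one at a single level.
# adjusted_rand_score is invariant to label permutation, so the algorithm's arbitrary
# group numbering does not affect the score (1.0 = identical partitions).
from sklearn.metrics.cluster import adjusted_rand_score

labels_true = [1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1]  # planted assignment
labels_pred = [1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1]  # recovered assignment

print(adjusted_rand_score(labels_true, labels_pred))  # ~0.719 for these vectors, as noted in ld-4-syn.py
```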
diff --git a/ld-4-syn.py b/ld-4-syn.py new file mode 100644 index 0000000..5707f3a --- /dev/null +++ b/ld-4-syn.py @@ -0,0 +1,326 @@ + + + + + +""" +Generate piecewise non-homogeneous poisson point process (NHPPP) +To check the ground truth +Dataset-4 +""" +import numpy as np +import pandas as pd +import utils +import sbm_core +import math +from itertools import combinations +import itertools +# from sklearn.metrics.cluster import adjusted_rand_score +# from sklearn.metrics.cluster import adjusted_mutual_info_score +# Initilaize +np.random.seed(153114) + +num_roles=2 +num_vertices=15 +num_segments = 8 +num_levels = 4 + +NO_SAMPLES= 500 + +nodes = np.arange(num_vertices) + +lamda_arr_act = np.zeros((num_roles, num_roles,num_levels) , dtype=float) + +H =num_levels +print('k-h levels %d'%(num_levels)) + +# h-level lambda estimates +lambda_estimates_h = np.random.rand(num_roles, num_roles, H) +lambda_estimates_h = 1e-2*np.random.randint(11,99, size=(num_roles, num_roles, H)) + +# Make high variant lambdas +lambda_estimates_h[0,0,:] = [0.3, 0.9,0.1,.6] +lambda_estimates_h[0,1,:] = [0.1, 0.5, .9, .3] +lambda_estimates_h[1,0,:] = lambda_estimates_h[0,1,:] +lambda_estimates_h[1,1,:] = [0.1, 0.5,.9, .3] + +# print(lambda_estimates_h) + +l1 =list(range(0, H)) +l2 = [] +if num_segments > num_levels: + l2 = [np.random.randint(0,H) for i in range(num_segments-H)] + +# Mapping from segment to a level +g_mapping= np.array(l1 + l2) +print('g mapping {}'.format(g_mapping)) + +# initilaize group assignment randomly +group_assignment_arr= np.random.randint(num_roles, size=(num_levels,num_vertices)) + # node-group dictionary +group_dic = {} + +for i in range(0,num_levels ): + level = i + + group_dic_level = {} + keys = nodes + values = list(group_assignment_arr[level]) + group_dic_level = dict(zip(keys,values)) + group_dic[i] = group_dic_level +print('initial') +# print(group_dic) + +for e_h in range(0,num_segments): + g_a = group_dic[g_mapping[e_h]] + list_of_groups= [[] for _ in range(num_roles)] + for idx, val in g_a.items(): + list_of_groups[val].append(idx) + print('group assignments {}: {}'.format(e_h,list_of_groups)) + # Plotting + +#Initialize lamda +lamda_arr = np.zeros((num_roles, num_roles,num_segments) , dtype=float) +for d in range(0, num_segments): + for k in range(0, num_roles): + for g in range(k, num_roles): + lamda_arr[k,g, d]= lambda_estimates_h[k,g,g_mapping[d]] + lamda_arr[g,k, d]= lamda_arr[k,g, d] +change_points_arr = np.zeros((num_roles, num_roles, num_segments+1) , dtype=int) +df_all= None + +points= list(range(0, (num_segments+1)*NO_SAMPLES, NO_SAMPLES)) +list1 = [] + +level_seg_mapping = {} +for d in range(num_segments): + level = g_mapping[d] + if level in level_seg_mapping: + level_seg_mapping[level].append(d) + else: + level_seg_mapping[level] = [] + level_seg_mapping[level].append(d) +# %% +# Generate piecewise non-homogeneous poisson process + + +tot_count = np.zeros((num_levels) , dtype=float) +com_len = np.zeros((num_levels) , dtype=float) +# for pair in comb: + +for i in range(0,num_levels): + # i = g_mapping[d] + group_assignment = group_assignment_arr[i] + + print(group_assignment) + + list_of_groups= [[] for _ in range(num_roles)] + + for idx, val in enumerate(group_assignment): + list_of_groups[val].append(nodes[idx]) + + # print(list_of_groups) + + size_all_pairs = {} + + for kk in range(0, num_roles): + for gg in range(kk, num_roles): + U=list_of_groups[kk] + W=list_of_groups[gg] + + if kk == gg: + size_all_pairs[kk,gg] = math.comb(len(U), 2) + if kk != gg: + 
size_all_pairs[kk,gg] = len(U)*len(W) + + for k in range(0, num_roles): + for g in range(k, num_roles): + + + change_points_arr[k,g,:] = points + lamda_arr[k,g,:] = lamda_arr[g,k,:] + + comb = [] + if k == g: + comb = list(combinations(list_of_groups[k], 2)) + # print(type(comb)) + else: + # comb = [] + key_data = [list_of_groups[k],list_of_groups[g],] + comb = list(itertools.product(*key_data)) + # print(comb) + if len(comb) != size_all_pairs[k,g]: + print('not equal..') + + + print('d val {}'.format( d)) + com_len[i] = len(comb) + # print('comb len {}'.format( com_len[d])) + tot_count[i] = 0 + + for pair in comb: + s = np.random.poisson(lamda_arr[k,g,d], NO_SAMPLES) + # print(np.count_nonzero(s)) + tot_count[i] += np.count_nonzero(s) + + list_org=[i for i, e in enumerate(s) if e != 0] + + if len(list_org) == 0: + print('zero') + + + + for d in level_seg_mapping[i]: + + + list1 = [x+points[d] for x in list_org] + + df = pd.DataFrame(data=list1) + df.columns =['timestamp'] + + N= df.size + + # print(pair) + # print(pair[0]) + + list_start_stations =[pair[0]] * N + list_end_stations =[pair[1]] * N + + df['source'] = list_start_stations + df['target'] = list_end_stations + + df_all=pd.concat([df_all, df], ignore_index=True) + + # for dd in level_seg_mapping: + # dd = d + lamda_arr_act[k,g,i] = round(((tot_count[i])/(NO_SAMPLES*com_len[i])),1) + lamda_arr_act[g,k,i] = lamda_arr_act[k,g,i] + # print('tot count') + # print(tot_count[dd]) + # print(' {} {} {} {} : k g d :lamb'.format(k,g,d,lamda_arr_act[g,k,dd])) + print(' {} {} {} {} : k g d :lamb'.format(k,g,i,lamda_arr_act[g,k,i])) + +# Remove self loops +df_all = df_all[((df_all['source'] ) != (df_all['target']))] +#sort +df_all=df_all.sort_values('timestamp') +df_all = df_all[['target', 'timestamp','source']] + + + +df=df_all +dest_folder='./Results/synthetic/3' +t_df = df['timestamp'] + +nodes_arr = np.union1d(df['target'],df['source']).astype(int) +# list of nodes +nodes = nodes_arr.tolist() +num_vertices = len(nodes) + +# create a new dictionary - key: node-pair , value: list of timestamps +dic=df.groupby(['source','target'])['timestamp'].apply(list).to_dict() +print('{} {} {} '.format(group_dic, lamda_arr_act,change_points_arr)) + + +# liklihood_sum = sbm_core.mm_compute_cost(group_dic,lamda_arr_act,change_points_arr,num_roles,num_segments,dic,g_mapping) +# print(' Initial Actual likelihood .......%f'%liklihood_sum) + +def _swap (row): + if row['source'] > row['target']: + row['source'] , row['target'] =row['target'] , row['source'] + return row + +# Undirected graph +df=df.apply(lambda row: _swap(row), axis=1) +#scale timestamps for zeroth reference point +refValue = df['timestamp'].min() +df['timestamp'] -= refValue + +# Experiment +import experiment + +# User parameters +# num_roles=2 +# num_segments=10 +# num_levels=5# Optional arg +algo_ver=4 +dest_folder='./Results/synthetic/' + +# tuning parameters +theta = 1e-5 +eta = 1 +tuning_params= {'theta':theta,'eta':eta} + +import time +start_time = time.time() + +exp_obj = experiment.Experiment(df,num_roles,num_segments,algo_ver,dest_folder,tuning_params,num_levels,refValue) +# [likelihood_f,group_dic_f] = exp_obj.execute() +likelihood_f= exp_obj.execute() +print("--- %s seconds ---" % (time.time() - start_time)) + +liklihood_sum = sbm_core.mm_compute_cost(group_dic,lamda_arr_act,change_points_arr,num_roles,num_segments,dic,g_mapping) +print(' Initial Actual likelihood .......%f'%liklihood_sum) + +print('g mapping {}'.format(g_mapping)) + +for e_h in range(0,num_segments): + g_a = 
group_dic[g_mapping[e_h]] + list_of_groups= [[] for _ in range(num_roles)] + for idx, val in g_a.items(): + list_of_groups[val].append(idx) + print('group assignments {}: {}'.format(e_h,list_of_groups)) + +# likelihood for single group and single segment # Normlaized likelihood +# num_roles=1 +# num_segments=1 +# num_levels=1 +# exp_obj = experiment.Experiment(df,num_roles,num_segments,algo_ver,dest_folder,tuning_params,num_levels,refValue) +# exp_obj.execute() + + +# -293495.2248026046 +# 5 -293495.224803 +# g_mapping_discoverd [0, 1, 0, 3, 2, 0, 2, 0] +# group assignments 0: [[1, 3, 4, 6, 7, 9, 12], [0, 2, 5, 8, 10, 11, 13, 14]] +# group assignments 1: [[0, 2, 5, 6, 7, 8, 9, 10, 12, 14], [1, 3, 4, 11, 13]] +# group assignments 2: [[1, 7, 14], [0, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13]] +# group assignments 3: [[0, 1, 4, 5, 7, 9, 11, 12], [2, 3, 6, 8, 10, 13, 14]] +# --- 33.311949253082275 seconds --- +# -293958.6558912792 +# Initial Actual likelihood .......-293958.655891 +# g mapping [0 1 2 3 1 1 3 1] +# group assignments 0: [[0, 2, 5, 6, 7, 8, 9, 10, 12, 14], [1, 3, 4, 11, 13]] +# group assignments 1: [[1, 3, 4, 6, 7, 9, 12], [0, 2, 5, 8, 10, 11, 13, 14]] +# group assignments 2: [[0, 1, 4, 5, 7, 9, 11, 12], [2, 3, 6, 8, 10, 13, 14]] +# group assignments 3: [[1, 4, 7, 14], [0, 2, 3, 5, 6, 8, 9, 10, 11, 12, 13]] +# group assignments 4: [[1, 3, 4, 6, 7, 9, 12], [0, 2, 5, 8, 10, 11, 13, 14]] +# group assignments 5: [[1, 3, 4, 6, 7, 9, 12], [0, 2, 5, 8, 10, 11, 13, 14]] +# group assignments 6: [[1, 4, 7, 14], [0, 2, 3, 5, 6, 8, 9, 10, 11, 12, 13]] +# group assignments 7: [[1, 3, 4, 6, 7, 9, 12], [0, 2, 5, 8, 10, 11, 13, 14]] + +from sklearn.metrics.cluster import adjusted_rand_score +# adjusted_rand_score([1, 7, 14],[1, 4, 7, 14]) +# adjusted_rand_score([0, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13],[0, 2, 3, 5, 6, 8, 9, 10, 11, 12, 13]) + + +from sklearn.metrics.cluster import adjusted_rand_score + +labels_true = [1, 0, 0,0,0,0,0,1,0,0,0,0,0,0,1] +labels_pred = [1, 0, 0,0,1,0,0,1,0,0,0,0,0,0,1] + +adjusted_rand_score(labels_true, labels_pred) +# 0.7190366972477065 + +labels_true = [1, 0, 1,1,1,1,1,0,1,1,1,1,1,1,0] +labels_pred = [1, 0, 1,0,1,1,1,0,1,1,1,1,1,1,0] + +adjusted_rand_score(labels_true, labels_pred) + + # 0.7190366972477065 + +# ( 1+1+1+0.7190366972477065)/4 = 0.9297591743119267 + + + + diff --git a/ld-5-syn.py b/ld-5-syn.py new file mode 100644 index 0000000..1d21719 --- /dev/null +++ b/ld-5-syn.py @@ -0,0 +1,304 @@ + + + +""" +Generate piecewise non-homogeneous poisson point process (NHPPP) +To check the ground truth +Dataset-5- modified +""" +import numpy as np +import pandas as pd +import utils +import sbm_core +import math +from itertools import combinations +import itertools +# from sklearn.metrics.cluster import adjusted_rand_score +# from sklearn.metrics.cluster import adjusted_mutual_info_score +# Initilaize +np.random.seed(165) + +num_roles=3 +num_vertices=30 +num_segments = 6 +num_levels = 3 +NO_SAMPLES= 500 + +nodes = np.arange(num_vertices) + +lamda_arr_act = np.zeros((num_roles, num_roles,num_levels) , dtype=float) + + +H =num_levels +print('k-h levels %d'%(num_levels)) + +# h-level lambda estimates +lambda_estimates_h = np.random.rand(num_roles, num_roles, H) +lambda_estimates_h = 1e-2*np.random.randint(11,99, size=(num_roles, num_roles, H)) +# Make high variant lambdas +lambda_estimates_h[0,0,:] = [0.3, 0.9,0.1] +lambda_estimates_h[0,1,:] = [0.1, .9, .4] +lambda_estimates_h[1,0,:] = lambda_estimates_h[0,1,:] +lambda_estimates_h[1,1,:] = [0.1, .9, .5] 
+lambda_estimates_h[0,2,:] = [0.1, .6, .1] +lambda_estimates_h[1,2,:] = [0.8, .1, .1] +lambda_estimates_h[2,2,:] = [0.9, .9, .1] + + +l1 =list(range(0, H)) +l2 = [] +if num_segments > num_levels: + l2 = [np.random.randint(0,H) for i in range(num_segments-H)] + +# Mapping from segment to a level +g_mapping= np.array(l1 + l2) +print('g mapping {}'.format(g_mapping)) + + + +# initilaize group assignment randomly +group_assignment_arr= np.random.randint(num_roles, size=(num_levels,num_vertices)) + # node-group dictionary +group_dic = {} + +for i in range(0,num_levels ): + level = i + + group_dic_level = {} + keys = nodes + values = list(group_assignment_arr[level]) + group_dic_level = dict(zip(keys,values)) + group_dic[i] = group_dic_level +print('initial') +# print(group_dic) + +for e_h in range(0,num_segments): + g_a = group_dic[g_mapping[e_h]] + list_of_groups= [[] for _ in range(num_roles)] + for idx, val in g_a.items(): + list_of_groups[val].append(idx) + print('group assignments {}: {}'.format(e_h,list_of_groups)) + # Plotting + +#Initialize lamda +lamda_arr = np.zeros((num_roles, num_roles,num_segments) , dtype=float) +for d in range(0, num_segments): + for k in range(0, num_roles): + for g in range(k, num_roles): + lamda_arr[k,g, d]= lambda_estimates_h[k,g,g_mapping[d]] + lamda_arr[g,k, d]= lamda_arr[k,g, d] +change_points_arr = np.zeros((num_roles, num_roles, num_segments+1) , dtype=int) +df_all= None + +points= list(range(0, (num_segments+1)*NO_SAMPLES, NO_SAMPLES)) +list1 = [] + +level_seg_mapping = {} +for d in range(num_segments): + level = g_mapping[d] + if level in level_seg_mapping: + level_seg_mapping[level].append(d) + else: + level_seg_mapping[level] = [] + level_seg_mapping[level].append(d) +# %% +# Generate piecewise non-homogeneous poisson process + + +tot_count = np.zeros((num_levels) , dtype=float) +com_len = np.zeros((num_levels) , dtype=float) +# for pair in comb: + +for i in range(0,num_levels): + # i = g_mapping[d] + group_assignment = group_assignment_arr[i] + + print(group_assignment) + + list_of_groups= [[] for _ in range(num_roles)] + + for idx, val in enumerate(group_assignment): + list_of_groups[val].append(nodes[idx]) + + # print(list_of_groups) + + size_all_pairs = {} + + for kk in range(0, num_roles): + for gg in range(kk, num_roles): + U=list_of_groups[kk] + W=list_of_groups[gg] + + if kk == gg: + size_all_pairs[kk,gg] = math.comb(len(U), 2) + if kk != gg: + size_all_pairs[kk,gg] = len(U)*len(W) + + for k in range(0, num_roles): + for g in range(k, num_roles): + + + change_points_arr[k,g,:] = points + lamda_arr[k,g,:] = lamda_arr[g,k,:] + + + + comb = [] + if k == g: + comb = list(combinations(list_of_groups[k], 2)) + # print(type(comb)) + else: + # comb = [] + key_data = [list_of_groups[k],list_of_groups[g],] + comb = list(itertools.product(*key_data)) + # print(comb) + if len(comb) != size_all_pairs[k,g]: + print('not equal..') + + + print('d val {}'.format( d)) + com_len[i] = len(comb) + # print('comb len {}'.format( com_len[d])) + tot_count[i] = 0 + + for pair in comb: + s = np.random.poisson(lamda_arr[k,g,d], NO_SAMPLES) + # print(np.count_nonzero(s)) + tot_count[i] += np.count_nonzero(s) + + list_org=[i for i, e in enumerate(s) if e != 0] + + if len(list_org) == 0: + print('zero') + + for d in level_seg_mapping[i]: + + + list1 = [x+points[d] for x in list_org] + + df = pd.DataFrame(data=list1) + df.columns =['timestamp'] + + N= df.size + + # print(pair) + # print(pair[0]) + + list_start_stations =[pair[0]] * N + list_end_stations =[pair[1]] * N + + 
df['source'] = list_start_stations + df['target'] = list_end_stations + + df_all=pd.concat([df_all, df], ignore_index=True) + + # for dd in level_seg_mapping: + # dd = d + lamda_arr_act[k,g,i] = round(((tot_count[i])/(NO_SAMPLES*com_len[i])),1) + lamda_arr_act[g,k,i] = lamda_arr_act[k,g,i] + # print('tot count') + # print(tot_count[dd]) + # print(' {} {} {} {} : k g d :lamb'.format(k,g,d,lamda_arr_act[g,k,dd])) + print(' {} {} {} {} : k g d :lamb'.format(k,g,i,lamda_arr_act[g,k,i])) + +# Remove self loops +df_all = df_all[((df_all['source'] ) != (df_all['target']))] +#sort +df_all=df_all.sort_values('timestamp') +df_all = df_all[['target', 'timestamp','source']] + + +df=df_all +dest_folder='./Results/synthetic/3' +t_df = df['timestamp'] + +nodes_arr = np.union1d(df['target'],df['source']).astype(int) +# list of nodes +nodes = nodes_arr.tolist() +num_vertices = len(nodes) + + +# create a new dictionary - key: node-pair , value: list of timestamps +dic=df.groupby(['source','target'])['timestamp'].apply(list).to_dict() +print('{} {} {} '.format(group_dic, lamda_arr_act,change_points_arr)) + + +# liklihood_sum = sbm_core.mm_compute_cost(group_dic,lamda_arr_act,change_points_arr,num_roles,num_segments,dic,g_mapping) +# print(' Initial Actual likelihood .......%f'%liklihood_sum) + +def _swap (row): + if row['source'] > row['target']: + row['source'] , row['target'] =row['target'] , row['source'] + return row + +# Undirected graph +df=df.apply(lambda row: _swap(row), axis=1) +#scale timestamps for zeroth reference point +refValue = df['timestamp'].min() +df['timestamp'] -= refValue + +# Experiment +import experiment + +# User parameters +# num_roles=2 +# num_segments=10 +# num_levels=5# Optional arg +algo_ver=3 +dest_folder='./Results/synthetic/' + +# tuning parameters +theta = 1e-5 +eta = 1 +tuning_params= {'theta':theta,'eta':eta} + +import time +start_time = time.time() + +exp_obj = experiment.Experiment(df,num_roles,num_segments,algo_ver,dest_folder,tuning_params,num_levels,refValue) +# [likelihood_f,group_dic_f] = exp_obj.execute() +likelihood_f= exp_obj.execute() +print("--- %s seconds ---" % (time.time() - start_time)) + +liklihood_sum = sbm_core.mm_compute_cost(group_dic,lamda_arr_act,change_points_arr,num_roles,num_segments,dic,g_mapping) +print(' Initial Actual likelihood .......%f'%liklihood_sum) + +print('g mapping {}'.format(g_mapping)) + +for e_h in range(0,num_segments): + g_a = group_dic[g_mapping[e_h]] + list_of_groups= [[] for _ in range(num_roles)] + for idx, val in g_a.items(): + list_of_groups[val].append(idx) + print('group assignments {}: {}'.format(e_h,list_of_groups)) + +# likelihood for single group and single segment # Normlaized likelihood +num_roles=1 +num_segments=1 +num_levels=1 +exp_obj = experiment.Experiment(df,num_roles,num_segments,algo_ver,dest_folder,tuning_params,num_levels,refValue) +exp_obj.execute() + +# g_mapping_discoverd [1, 0, 2, 1, 2, 1] +# group assignments 0: [[2, 4, 14, 16, 19, 29], [1, 3, 5, 6, 7, 9, 10, 12, 17, 20, 21, 23, 24, 25, 26, 27], [0, 8, 11, 13, 15, 18, 22, 28]] +# group assignments 1: [[3, 5, 9, 15, 16, 18, 20, 24, 28, 29], [1, 4, 6, 7, 8, 11, 13, 19, 23, 26], [0, 2, 10, 12, 14, 17, 21, 22, 25, 27]] +# group assignments 2: [[0, 4, 5, 7, 8, 12, 15, 16, 18, 22, 29], [2, 3, 10, 11, 13, 14, 20, 21, 23, 24, 25, 27, 28], [1, 6, 9, 17, 19, 26]] +# --- 45.705310344696045 seconds --- +# -745779.9721702943 +# Initial Actual likelihood .......-745779.972170 +# g mapping [0 1 2 0 2 0] +# group assignments 0: [[1, 4, 6, 7, 8, 11, 13, 19, 23, 26], 
[0, 2, 10, 12, 14, 17, 21, 22, 25, 27], [3, 5, 9, 15, 16, 18, 20, 24, 28, 29]] +# group assignments 1: [[0, 8, 11, 13, 15, 18, 22, 28], [1, 3, 5, 6, 7, 9, 10, 12, 17, 20, 21, 23, 24, 25, 26, 27], [2, 4, 14, 16, 19, 29]] +# group assignments 2: [[0, 4, 5, 7, 8, 12, 15, 16, 18, 22, 29], [2, 3, 10, 11, 13, 14, 20, 21, 23, 24, 25, 27, 28], [1, 6, 9, 17, 19, 26]] +# group assignments 3: [[1, 4, 6, 7, 8, 11, 13, 19, 23, 26], [0, 2, 10, 12, 14, 17, 21, 22, 25, 27], [3, 5, 9, 15, 16, 18, 20, 24, 28, 29]] +# group assignments 4: [[0, 4, 5, 7, 8, 12, 15, 16, 18, 22, 29], [2, 3, 10, 11, 13, 14, 20, 21, 23, 24, 25, 27, 28], [1, 6, 9, 17, 19, 26]] +# group assignments 5: [[1, 4, 6, 7, 8, 11, 13, 19, 23, 26], [0, 2, 10, 12, 14, 17, 21, 22, 25, 27], [3, 5, 9, 15, 16, 18, 20, 24, 28, 29]] + + + + + + + + + diff --git a/ld-bikes_santander.py b/ld-bikes_santander.py new file mode 100755 index 0000000..7ec20ca --- /dev/null +++ b/ld-bikes_santander.py @@ -0,0 +1,105 @@ + + +""" +Experiments for London Cycling Dataset + +Nodes 735 +Temporal Edges 32,258 +Time span 1 day +""" + +import experiment +import os +import pandas as pd + +# read data + +filepath = os.path.join("Data","9b-Journey-Data-Extract-06Sep15-19Sep15.csv") +# pick 9th of September-2015 +start_date = "2015-9-9 0:00:00" +end_date = "2015-9-9 23:59:59" + +# Read data +df = pd.read_csv(filepath, dtype={'StartStation Id': 'Int64', 'EndStation Id': 'Int64'}, usecols=lambda x: x in ['Start Date', 'StartStation Id', 'EndStation Id'], parse_dates=['Start Date']) +df=df.set_axis(['source', 'timestamp', 'target'], axis=1) + + +# Remove null value +df = df[df['target'].isnull() != True] +#sort +df=df.sort_values('timestamp') + +# Filter dates +if start_date and end_date: + after_start_date = df["timestamp"] >= start_date + before_end_date = df["timestamp"] <= end_date + between_two_dates = after_start_date & before_end_date + df = df.loc[between_two_dates] + +# Remove self-loops +df = df[((df['source'] ) != (df['target']))] + +# convert datetime to epoch +df['timestamp'] = df['timestamp'].astype('int64')//1e9 + +def _swap (row): + if row['source'] > row['target']: + row['source'] , row['target'] =row['target'] , row['source'] + return row + +# Undirected graph +df=df.apply(lambda row: _swap(row), axis=1) +#scale timestamps for zeroth reference point +refValue = df['timestamp'].min() +df['timestamp'] -= refValue + +df = df[:1000] + +# # Experiments + +# num_roles=1 +# num_segments=1 +# num_levels=1 +# User parameters +num_roles=2 +num_segments=3 +num_levels=2 +algo_ver= 4 +dest_folder='./Results/bikes/' + +# tuning parameters +# theta = 1e-5 +theta = 1e-7 +eta = 1 +tuning_params= {'theta':theta,'eta':eta} + +import time +start_time = time.time() + + +# num_segments=20 +# num_levels=11 +# algo_ver= 4 + + +# exp_obj = experiment.Experiment(df,num_roles,num_segments,algo_ver,dest_folder,tuning_params,num_levels,refValue) +# exp_obj.execute() + +# print("--- %s seconds ---" % (time.time() - start_time)) + + +# likelihood for single group and single segment # Normlaized likelihood +# num_roles=1 +# num_segments=1 +# num_levels=1 +# exp_obj = experiment.Experiment(df,num_roles,num_segments,algo_ver,dest_folder,tuning_params,num_levels,refValue) +# exp_obj.execute() + + +num_segments=10 +num_levels=7 +algo_ver= 4 + + +exp_obj = experiment.Experiment(df,num_roles,num_segments,algo_ver,dest_folder,tuning_params,num_levels,refValue) +exp_obj.execute() \ No newline at end of file diff --git a/ld-bitcoin.py b/ld-bitcoin.py new file mode 100755 index 0000000..81cc92d 
--- /dev/null +++ b/ld-bitcoin.py @@ -0,0 +1,75 @@ + + + +""" +Experiments for Bitcoin Dataset +SOURCE, TARGET, RATING, TIME + +Dataset statistics +Nodes 3,783 +Edges 24,186 +""" + +import experiment +import os +import pandas as pd + +# read data +filepath = os.path.join("Data","soc-sign-bitcoinalpha.csv") +header_list = ['source','target','Rate', 'timestamp'] +df = pd.read_csv(filepath, names=header_list) +df = df[['source', 'timestamp','target']] + +def _swap (row): + if row['source'] > row['target']: + row['source'] , row['target'] =row['target'] , row['source'] + return row + +# Undirected graph +df=df.apply(lambda row: _swap(row), axis=1) +#scale timestamps for zeroth reference point +refValue = df['timestamp'].min() +df['timestamp'] -= refValue + + # Remove self-loops +df = df[((df['source'] ) != (df['target']))] + + +# # Experiments + + +# User parameters +num_roles=3 +num_segments=5 +num_levels=3 +algo_ver=4 +dest_folder='./Results/bikes/' + +# tuning parameters +theta = 1e-7 +eta = 1 +tuning_params= {'theta':theta,'eta':eta} + +import time +start_time = time.time() + +exp_obj = experiment.Experiment(df,num_roles,num_segments,algo_ver,dest_folder,tuning_params,num_levels,refValue) +exp_obj.execute() + +print("--- %s seconds ---" % (time.time() - start_time)) + +# likelihood for single group and single segment # Normlaized likelihood +# num_roles=1 +# num_segments=1 +# num_levels=1 +# exp_obj = experiment.Experiment(df,num_roles,num_segments,algo_ver,dest_folder,tuning_params,num_levels,refValue) +# exp_obj.execute() + +num_segments=10 +num_levels=9 +algo_ver= 4 + + +exp_obj = experiment.Experiment(df,num_roles,num_segments,algo_ver,dest_folder,tuning_params,num_levels,refValue) +exp_obj.execute() + diff --git a/ld-eu_email_dep1.py b/ld-eu_email_dep1.py new file mode 100755 index 0000000..fd75df5 --- /dev/null +++ b/ld-eu_email_dep1.py @@ -0,0 +1,69 @@ + + +""" +Experiments for EU +SRC: id of the source node (a user) +TGT: id of the target node (a user) +TS: timestamp (in seconds), starting from 0 +Nodes 309 +Temporal Edges 61046 +Edges in static graph 3031 +Time span 803 days +""" + +import experiment +import os +import pandas as pd + +# read data +filepath = os.path.join("Data","email-Eu-core-temporal-Dept1.txt") +header_list = ['source','target', 'timestamp'] +df = pd.read_table(filepath,sep=' ',names=header_list) +df = df[['source', 'timestamp','target']] + + +def _swap (row): + if row['source'] > row['target']: + row['source'] , row['target'] =row['target'] , row['source'] + return row + +# Undirected graph +df=df.apply(lambda row: _swap(row), axis=1) +#scale timestamps for zeroth reference point +refValue = df['timestamp'].min() +df['timestamp'] -= refValue + + # Remove self-loops +df = df[((df['source'] ) != (df['target']))] + + + +# # Experiments + + +# User parameters +num_roles=3 +num_segments=6 +num_levels=3 +algo_ver= 4 +dest_folder='./Results/bikes/' + +# tuning parameters +theta = 1e-5 +eta = 1 +tuning_params= {'theta':theta,'eta':eta} + +import time +start_time = time.time() + +exp_obj = experiment.Experiment(df,num_roles,num_segments,algo_ver,dest_folder,tuning_params,num_levels,refValue) +exp_obj.execute() + +print("--- %s seconds ---" % (time.time() - start_time)) + +# likelihood for single group and single segment # Normlaized likelihood +num_roles=1 +num_segments=1 +num_levels=1 +exp_obj = experiment.Experiment(df,num_roles,num_segments,algo_ver,dest_folder,tuning_params,num_levels,refValue) +exp_obj.execute() diff --git a/ld-eu_email_dep2.py 
b/ld-eu_email_dep2.py new file mode 100755 index 0000000..94f7077 --- /dev/null +++ b/ld-eu_email_dep2.py @@ -0,0 +1,81 @@ + + +""" +Experiments for EU +SRC: id of the source node (a user) +TGT: id of the target node (a user) +TS: timestamp (in seconds), starting from 0 + +Nodes 162 +Temporal Edges 46772 +Edges in static graph 1772 +Time span 803 days +""" + +import experiment +import os +import pandas as pd + +# read data +filepath = os.path.join("Data","email-Eu-core-temporal-Dept2.txt") +header_list = ['source','target', 'timestamp'] +df = pd.read_table(filepath,sep='\t',names=header_list) +df = df[['source', 'timestamp','target']] + + +def _swap (row): + if row['source'] > row['target']: + row['source'] , row['target'] =row['target'] , row['source'] + return row + +# Undirected graph +df=df.apply(lambda row: _swap(row), axis=1) +#scale timestamps for zeroth reference point +refValue = df['timestamp'].min() +df['timestamp'] -= refValue + + # Remove self-loops +df = df[((df['source'] ) != (df['target']))] + + + +# # Experiments + + +# User parameters +num_roles=2 +num_segments=4 +num_levels=2 +algo_ver= 4 +dest_folder='./Results/bikes/' + +# tuning parameters +theta = 1e-7 +eta = 1 +tuning_params= {'theta':theta,'eta':eta} + +import time +start_time = time.time() + +exp_obj = experiment.Experiment(df,num_roles,num_segments,algo_ver,dest_folder,tuning_params,num_levels,refValue) +exp_obj.execute() + +print("--- %s seconds ---" % (time.time() - start_time)) + +# likelihood for single group and single segment # Normlaized likelihood +# num_roles=1 +# num_segments=1 +# num_levels=1 +# exp_obj = experiment.Experiment(df,num_roles,num_segments,algo_ver,dest_folder,tuning_params,num_levels,refValue) +# exp_obj.execute() + + +num_segments=20 +num_levels=18 +algo_ver= 4 + + +exp_obj = experiment.Experiment(df,num_roles,num_segments,algo_ver,dest_folder,tuning_params,num_levels,refValue) +exp_obj.execute() + + diff --git a/ld-mathoverflow.py b/ld-mathoverflow.py new file mode 100755 index 0000000..93fa10c --- /dev/null +++ b/ld-mathoverflow.py @@ -0,0 +1,71 @@ + +""" +Experiments for mathoverflow + +Nodes 21688 +Temporal Edges 107581 +Edges in static graph 90489 +Time span 2350 days + +SRC: id of the source node (a user) +TGT: id of the target node (a user) +UNIXTS: Unix timestamp (seconds since the epoch) +""" + +import experiment +import os +import pandas as pd + +# read data +filepath = os.path.join("Data","sx-mathoverflow-a2q.txt") +header_list = ['source','target', 'timestamp'] +df = pd.read_table(filepath,sep=' ',names=header_list) +df = df[['target', 'timestamp','source']] + +def _swap (row): + if row['source'] > row['target']: + row['source'] , row['target'] =row['target'] , row['source'] + return row + +# Undirected graph +df=df.apply(lambda row: _swap(row), axis=1) +#scale timestamps for zeroth reference point +refValue = df['timestamp'].min() +df['timestamp'] -= refValue + +# Remove self-loops +df = df[((df['source'] ) != (df['target']))] + + +# # Experiments + + +# User parameters +num_roles=2 +num_segments=3 +num_levels=2 +algo_ver= 4 +dest_folder='./Results/bikes/' + +# tuning parameters +theta = 1e-5 +eta = 1 +tuning_params= {'theta':theta,'eta':eta} + +import time +start_time = time.time() + +exp_obj = experiment.Experiment(df,num_roles,num_segments,algo_ver,dest_folder,tuning_params,num_levels,refValue) +exp_obj.execute() + +print("--- %s seconds ---" % (time.time() - start_time)) + + +# likelihood for single group and single segment # Normlaized likelihood +num_roles=1 
+num_segments=1 +num_levels=1 +exp_obj = experiment.Experiment(df,num_roles,num_segments,algo_ver,dest_folder,tuning_params,num_levels,refValue) +exp_obj.execute() + + diff --git a/ld-mooc.py b/ld-mooc.py new file mode 100755 index 0000000..ee4a494 --- /dev/null +++ b/ld-mooc.py @@ -0,0 +1,75 @@ +""" +Experiments for MOOC Dataset +ACTIONID USERID TARGETID TIMESTAMP + + +Number of users 7,047 +Number of targets 97 +Number of actions 411,749 +Number of positive action labels 4,066 +Timestamp seconds +""" + +import experiment +import os +import pandas as pd + +# read data +filepath = os.path.join("Data","mooc_actions.tsv") +df = pd.read_table(filepath) +df = df[['TARGETID', 'TIMESTAMP','USERID']] + +header_list = ['target', 'timestamp','source'] +df.columns = header_list + +# filter vertices +#num_vertices = 100000 +#df = df[((df['source'] < num_vertices) & (df['target'] < num_vertices))] + +def _swap (row): + if row['source'] > row['target']: + row['source'] , row['target'] =row['target'] , row['source'] + return row + +# Undirected graph +df=df.apply(lambda row: _swap(row), axis=1) +#scale timestamps for zeroth reference point +refValue = df['timestamp'].min() +df['timestamp'] -= refValue + +# Remove self-loops +df = df[((df['source'] ) != (df['target']))] + +# # Experiments + +# User parameters +num_roles=2 +num_segments=3 +num_levels=2 +algo_ver= 4 +dest_folder='./Results/bikes/' + +# tuning parameters +theta = 1e-5 +eta = 1 +tuning_params= {'theta':theta,'eta':eta} + +import time +start_time = time.time() + +exp_obj = experiment.Experiment(df,num_roles,num_segments,algo_ver,dest_folder,tuning_params,num_levels,refValue) +exp_obj.execute() + +print("--- %s seconds ---" % (time.time() - start_time)) + +# import numpy as np +# nodes_arr = np.union1d(df['target'],df['source']).astype(int) +# # list of nodes +# nodes = nodes_arr.tolist() + +# likelihood for single group and single segment # Normalized likelihood +# num_roles=1 +# num_segments=1 +# num_levels=1 +# exp_obj = experiment.Experiment(df,num_roles,num_segments,algo_ver,dest_folder,tuning_params,num_levels,refValue) +# likelihood_f = exp_obj.execute() diff --git a/optimize.py b/optimize.py index c77497d..b675353 100755 --- a/optimize.py +++ b/optimize.py @@ -19,6 +19,7 @@ class Optimize: self.df=df self.num_levels=num_levels self.tuning_params = tuning_params + self.g_mapping = None def k_seg(self): """K-segmentation""" @@ -155,3 +156,85 @@ class Optimize: _curr_val = liklihood_sum _itr+=1 return [self.group_assignment,self.lambda_estimates,self.change_points_arr,liklihood_sum] + + + def com_h_lvl_lambda_group(self): + H = self.num_levels + print('k-h levels %d'%(self.num_levels)) + + # h-level lambda estimates + # lambda_estimates_h = 1e-1*np.random.rand(self.num_roles, self.num_roles, H) + lambda_estimates_h = np.random.rand(self.num_roles, self.num_roles, H) + # print(lambda_estimates_h) + + l1 =list(range(0, H)) + l2 = [] + if self.num_segments > self.num_levels: + l2 = [np.random.randint(0,H) for i in range(self.num_segments-H)] + + # Mapping from segment to a level + g_mapping= np.array(l1 + l2) + print('g mapping {}'.format(g_mapping)) + + R = int(self.num_roles) + for k in range(0, R): + for g in range(0, R): + for d in range(0, self.num_levels): + self.lambda_estimates[k,g, d]= lambda_estimates_h[k,g,d] + self.lambda_estimates[g,k, d]= lambda_estimates_h[k,g,d] + + num_vertices = len(self.nodes) + group_assignment_arr= np.random.randint(self.num_roles, size=(self.num_levels,num_vertices)) + # node-group dictionary +
group_dic = {} + + for i in range(0,self.num_levels): + level = i + + group_dic_level = {} + keys = self.nodes + values = list(group_assignment_arr[level]) + group_dic_level = dict(zip(keys,values)) + group_dic[i] = group_dic_level + + + self.group_assignment = group_dic + self.g_mapping = g_mapping + + def mm_k_h_seg_var_2(self): + """(K,H)-segmentation variant-2""" + + # initialization algorithm: initialize lambdas randomly and segments through linear seg. ver 2. + self.com_h_lvl_lambda_group() + + [self.change_points_arr,self.g_mapping,self.group_assignment]=sbm_core.mm_linear_seg_ver_2(self.num_roles,self.num_segments,self.group_assignment,self.lambda_estimates,self.change_points_arr,self.df,self.g_mapping) + self.lambda_estimates=sbm_core.mm_estimate_lamda_kh(self.num_roles,self.num_segments,self.lambda_estimates,self.group_assignment,self.change_points_arr,self.df,self.g_mapping,self.tuning_params) + liklihood_sum = sbm_core.mm_compute_cost(self.group_assignment,self.lambda_estimates,self.change_points_arr,self.num_roles,self.num_segments,self.df,self.g_mapping) + _prev_val = math.inf + _curr_val = liklihood_sum + _itr = 0 + + while round(_prev_val,2) != round(_curr_val,2): + print("iteration no........... %d " %(_itr+1)) + + self.group_assignment=sbm_core.mm_group_assignment_ver2_2(self.nodes,self.num_roles,self.num_segments,self.lambda_estimates,self.group_assignment,self.change_points_arr,self.df,self.g_mapping) + print('after grouping') + liklihood_sum = sbm_core.mm_compute_cost(self.group_assignment,self.lambda_estimates,self.change_points_arr,self.num_roles,self.num_segments,self.df,self.g_mapping) + + self.lambda_estimates=sbm_core.mm_estimate_lamda_kh(self.num_roles,self.num_segments,self.lambda_estimates,self.group_assignment,self.change_points_arr,self.df,self.g_mapping,self.tuning_params) + print('after lambda estimate') + liklihood_sum = sbm_core.mm_compute_cost(self.group_assignment,self.lambda_estimates,self.change_points_arr,self.num_roles,self.num_segments,self.df,self.g_mapping) + + print('after seg') + # self.change_points_arr = sbm_core.dyn_prog_seg(self.group_assignment,self.lambda_estimates,self.change_points_arr,self.num_roles,self.num_segments,self.df) + [self.change_points_arr,self.g_mapping,self.group_assignment]=sbm_core.mm_linear_seg_ver_2(self.num_roles,self.num_segments,self.group_assignment,self.lambda_estimates,self.change_points_arr,self.df,self.g_mapping) + liklihood_sum = sbm_core.mm_compute_cost(self.group_assignment,self.lambda_estimates,self.change_points_arr,self.num_roles,self.num_segments,self.df,self.g_mapping) + + self.lambda_estimates=sbm_core.mm_estimate_lamda_kh(self.num_roles,self.num_segments,self.lambda_estimates,self.group_assignment,self.change_points_arr,self.df,self.g_mapping,self.tuning_params) + print('after lambda estimate') + liklihood_sum = sbm_core.mm_compute_cost(self.group_assignment,self.lambda_estimates,self.change_points_arr,self.num_roles,self.num_segments,self.df,self.g_mapping) + print(' %d %f'%(_itr+1,liklihood_sum)) + _prev_val = _curr_val + _curr_val = liklihood_sum + _itr+=1 + return [self.group_assignment,self.lambda_estimates,self.change_points_arr,liklihood_sum,self.g_mapping] \ No newline at end of file diff --git a/sbm_core.py b/sbm_core.py index 5117cbc..2ec7199 100755 --- a/sbm_core.py +++ b/sbm_core.py @@ -1,7 +1,8 @@ -################################################################################### -### Utility Functions for Maximum Likelihood Estimation (MLE) and Segmentation ### -### ( based on
stochastic blockmodels ) ### -################################################################################### +# ################################################################################### +# ### Utility Functions for Maximum Likelihood Estimation (MLE) and Segmentation ### +# ### ( based on stochastic blockmodels ) ### +# ################################################################################### + + import math import numpy as np @@ -997,3 +998,524 @@ def linear_seg_ver_2(num_roles,num_segments,group_assignment,lambda_estimates,ch # com_cost(num_roles,num_segments,m_lambda_estimates,change_points_arr,group_assignment,dic) lambda_estimates = m_lambda_estimates return [lambda_estimates,change_points_arr] + +################################## + +# level-dependent group membership +def mm_group_assignment_ver2_2(nodes,num_roles,num_segments,lambda_estimates,group_assignment,change_points_arr,dic,g_mapping): + + current_g_mapping = g_mapping + level_seg_mapping = {} + + for d in range(num_segments): + level = current_g_mapping[d] + + if level in level_seg_mapping: + level_seg_mapping[level].append(d) + else: + level_seg_mapping[level] = [] + level_seg_mapping[level].append(d) + + + h = len(set(g_mapping)) + print('h levels %d'%h) + + + # MultiDiGraph object + G = nx.MultiDiGraph() + G.add_nodes_from(nodes) + G.add_edges_from(list(dic)) + G = G.to_undirected() + + for i_h in range(0,h): + + l_seg = level_seg_mapping[i_h] + # print('i: {} seg_level_dic : {}'.format(i_h,l_seg)) + + grp = group_assignment[i_h] + + list_of_groups= [[] for _ in range(num_roles)] + + for idx, val in grp.items(): + list_of_groups[val].append(idx) + + t = 0 + + for d in l_seg: + delta_t = change_points_arr[0,0,d+1] - change_points_arr[0,0,d] + + if d == 0: + delta_t += 1 + + t = t + delta_t + + ss= level_seg_mapping[i_h] + + # iterate over all nodes + for v in nodes: + # remove v + for g in range(0, num_roles): + #Extract Group g + _list_g = list_of_groups[g] + + if v in _list_g : + _list_g.remove(v) + + + # store current group of v + max_i = grp[v] + + edge_counts = np.zeros(num_roles) + # find neighbours of node v + neighbour_nodes = [n for n in G.neighbors(v)] + # print('neighbours {}'.format(neighbour_nodes)) + + # iterate over all neighbour nodes of v + for neigh in neighbour_nodes: + + v1 = v + v2 = neigh + + # order of tuple : undirected + if v1 > v2: + v1, v2 = v2, v1 + + # i=group_assignment.get(v1) + jk=grp[neigh] + + # change-points of (i,j) group pair; in this case, equally partitioned + chg_points = change_points_arr[0,0,:] + + ranges_arr = [ [chg_points[s]+1,chg_points[s+1]] for s in range(0,len(chg_points)-1)] + ranges_arr[0][0] = 0 + + # list of timestamps corresponding to (v1,v2) pair + list_time_stamps = dic.get((v1,v2)) + + # iterate over timestamps list + for item in list_time_stamps: + d = _findSegment(ranges_arr, len(ranges_arr) , int(item)) + # print(' {} {} {} '.format(d, chg_points, item)) + if d in ss: + # print('actual d {} ss {}'.format(d, ss)) + edge_counts[jk] += 1 + # print('h: {} , cnt : {}'.format(i_h, edge_counts)) + likelihood_sum = np.zeros(num_roles) + + + # likelihood if node v is assigned to group l + for l in range(0, num_roles): + + likelihood_sum[l] = 0 + + for b in range(0, num_roles): + W = list_of_groups[b] + + # maximum possible number of ways of interacting with node v + factor = len(W) + + likelihood_sum[l] += (edge_counts[b]*math.log(lambda_estimates[l,b,i_h]) - factor*lambda_estimates[l,b,i_h]*t) + #SANITY CHECK + # if max_i != np.argmax(likelihood_sum): + # 
print('{} : {} {} : {} : '.format(max_i, np.argmax(likelihood_sum),max(likelihood_sum),likelihood_sum)) + # l1 = mm_compute_cost(group_assignment,lambda_estimates,change_points_arr,num_roles,num_segments,dic,g_mapping) + # group_assignment[i_h][v] = np.argmax(likelihood_sum) + # l2 = mm_compute_cost(group_assignment,lambda_estimates,change_points_arr,num_roles,num_segments,dic,g_mapping) + # p1= round(abs(l2-l1),2) + # p2= round(abs(likelihood_sum[0]-likelihood_sum[1]),2) + # if p1 != p2: + # print('p: {} : {} {} : {} : {} {}'.format(p1,p2, likelihood_sum,edge_counts,lambda_estimates[0,:,i_h],lambda_estimates[1,:,i_h] )) + # if l2 < l1: + # print('l : {} : {} {} : {} : '.format(max_i, np.argmax(likelihood_sum),max(likelihood_sum),likelihood_sum)) + + max_i = np.argmax(likelihood_sum) + group_assignment[i_h][v] = max_i + list_of_groups[max_i].append(v) + + # mm_compute_cost(group_assignment,lambda_estimates,change_points_arr,num_roles,num_segments,dic,g_mapping) + return group_assignment + +# #level-dependent estimate (k-h) lamda +def mm_estimate_lamda_kh(num_roles,num_segments,lambda_estimates,group_assignment,change_points_arr,dic,g_mapping,tuning_params=None): + # create dictionary to store interaction counts + # ( no of interactions between ith and jth group within dth segment) + key_data = [ + range(0,num_roles), + range(0,num_roles), + range(0,num_segments), + ] + keys = list(itertools.product(*key_data)) + i_j_d = {key: 0 for key in keys} + + current_g_mapping = g_mapping + + + # print(lambda_estimates) + level_seg_mapping = {} + for d in range(num_segments): + level = current_g_mapping[d] + if level in level_seg_mapping: + level_seg_mapping[level].append(d) + else: + level_seg_mapping[level] = [] + level_seg_mapping[level].append(d) + + # change-points of (i,j) group pair + chg_points = change_points_arr[0,0,:] + + ranges_arr = [ [chg_points[s]+1,chg_points[s+1]] for s in range(0,len(chg_points)-1)] + ranges_arr[0][0]=0 + + n = len(ranges_arr) + + + for key, val in dic.items(): + + for item in val: + d = _findSegment(ranges_arr, n, int(item)) + # l = current_g_mapping[d] + g_a = group_assignment[current_g_mapping[d]] + + i=g_a.get(key[0]) #group of v1 + j=g_a.get(key[1]) #group of v2 + + # undirected graph + if i>j: + i,j=j,i + + i_j_d[(i,j,d)] += 1 + + # liklihood_sum = 0 + h = len(set(g_mapping)) + + # iterate over levels + for i in range(0,h): + + g_a = group_assignment[i] + + list_of_groups= [[] for _ in range(num_roles)] + for idx, val in g_a.items(): + list_of_groups[val].append(idx) + + for k in range(0, num_roles): + for g in range(k,num_roles): + + + U=list_of_groups[k] + W=list_of_groups[g] + + size_all_pairs = 0 + if k == g: + size_all_pairs = math.comb(len(U), 2) + if k != g: + size_all_pairs = len(U)*len(W) + + inter_count = 0 + delta_t = 0 + + for d in level_seg_mapping[i]: + # print('i {} d {}'.format(i, d)) + + delta_t += (change_points_arr[k,g,d+1] - change_points_arr[k,g,d]) + + if d == 0: + delta_t += 1 + + inter_count += i_j_d[(k,g,d)] + + alpha = inter_count + beeta = size_all_pairs*delta_t + lamda = (alpha+ random.random()*tuning_params['theta'])/(beeta+tuning_params['eta']) + + lambda_estimates[k,g,i] = lamda + lambda_estimates[g,k,i] = lamda + + return lambda_estimates + + +# Level-dependent : Estimate change points : LINEAR algorithm ( Fast segmentation with SMAWK ) - Ver 2 +def mm_linear_seg_ver_2(num_roles,num_segments,group_assignment,lambda_estimates,change_points_arr,dic,g_mapping): + + + current_g_mapping = g_mapping + h = len(set(current_g_mapping)) + 
+ time_edges = {} + time_stamps=[] + + for key, val in dic.items(): + # print('{} {}'.format(key[0],key[1])) + + for item in val: + if item not in time_edges: + time_edges[item] = [] + time_edges[(item)] += {key} + time_stamps.append(item) + + # find unique time-stamps + # (there are cases where multiple edges occur at the same timestamp ) + time_stamps = sorted(time_stamps) + time_stamps_unique = sorted(list(set(time_stamps))) + + n = len(time_stamps_unique) + + startsfrom = np.zeros((n,num_segments) , dtype=int) + lamda_value = np.zeros((n,num_segments) , dtype=int ) + + c = np.zeros((n, h) , dtype=float) + o_ek = np.zeros((n, num_segments) , dtype=float) + + + # iterate over levels + for d in range(0, h): + + # level-specific group assignments + g_a = group_assignment[d] + list_of_groups= [[] for _ in range(num_roles)] + for idx, val in g_a.items(): + list_of_groups[val].append(idx) + + alpha = 0 + cnt = 0 + + for k in range(0, num_roles): + for g in range(k, num_roles): + + U=list_of_groups[k] + W=list_of_groups[g] + + size_all_pairs = 0 + if k == g: + size_all_pairs = math.comb(len(U), 2) + if k != g: + size_all_pairs = len(U)*len(W) + + if lambda_estimates[k,g,d] != 0: + alpha += (size_all_pairs * lambda_estimates[k,g,d]) + elif not U or not W: + print('list is empty...') + + for ind, tmsp in enumerate(time_stamps_unique): + lst = time_edges.get(tmsp) + + for key in lst: + + i=g_a.get(key[0]) #group of v1 + j=g_a.get(key[1]) #group of v2 + + cnt += math.log(lambda_estimates[i,j,d]) + + c[ind,d] = cnt - alpha*tmsp + + for e in range(0, n): + # print('{} : {} {}'.format(c[e,:],max(c[e,:]),np.argmax(c[e,:]))) + o_ek[e,0] = max(c[e,:]) + lamda_value[e,0] = np.argmax(c[e,:]) + + + LARGE_VAL = pow(10, 10) + + for kk in range(1, num_segments): + max_m = np.zeros((n, h) , dtype=float) + indices = np.zeros((n, num_segments) , dtype=int) + + for a in range(0, h): + def lookup(i,j): + x = -LARGE_VAL + + if not ((j < kk) | (j >= i)): + x = o_ek[j,kk-1] + c[i,a] - c[j,a] + + return x + + rows = list(range(0, n)) + cols = list(range(0, n)) + + col_set = utils.smawk(rows,cols,lookup) + # col_set = utils.Maxcompute(rows,cols,lookup) + + for edg in range(0, n): + indices[edg,a] = col_set[edg] + max_m[edg,a] = lookup(edg,indices[edg,a]) + + for e in range(0, n): + o_ek[e,kk] = max(max_m[e,:]) + lamda_value[e,kk] = np.argmax(max_m[e,:]) + startsfrom[e,kk] = indices[e,lamda_value[e,kk]] + + boundary_point_array=[n-1] + counter = num_segments-1 + boundary_point = n-1 + + # new_lambda_estimates = np.zeros((num_roles, num_roles,num_segments) , dtype=float) + # print(o_ek[boundary_point,counter]) + new_g = np.zeros( num_segments, dtype=int) + while counter > -1: + # print(o_ek[boundary_point,counter]) + + d= int(lamda_value[boundary_point,counter]) + + new_g[counter] = int(d) + + + boundary_point = startsfrom[boundary_point, counter] + boundary_point_array.append(boundary_point) + counter -= 1 + + b_list = [] + + for b in boundary_point_array: + b_list.append(time_stamps_unique[b]) + + + change_points_arr_ = list(reversed(b_list)) + change_points_arr[:,:,:] = change_points_arr_ + # print(change_points_arr_) + + # mm_compute_cost(group_assignment,lambda_estimates,change_points_arr,num_roles,num_segments,dic,g_mapping) + h_current = len(set(list(new_g))) + + current_g_mapping = list(new_g) + # mm_compute_cost(group_assignment,lambda_estimates,change_points_arr,num_roles,num_segments,dic,current_g_mapping) + print('h : {} , h_cur : {}'.format(h, h_current)) + if h > h_current: + print('Required number of h-levels not
satisfied... (h:{} , h-new:{})'.format(h, h_current)) + + + list_all = [ele for ele in range(0,h)] + + a = set(current_g_mapping) + b = set(list_all) + + diff = list(b - a) + n = h - len(a) + missing_list = random.sample(diff, n) + # print(missing_list) + + duplicate_entries = [item for item, count in collections.Counter(current_g_mapping).items() if count > 1] + # print(duplicate_entries) + + new_g_mapping = copy.copy(current_g_mapping) + # we have duplicate and missing list + + changes= [] + + for idx,m_e in enumerate(missing_list): + # find index of duplicate entries + if idx < len(duplicate_entries) : + ind = current_g_mapping.index(duplicate_entries[idx]) + else: + new_duplicate_entries = [item for item, count in collections.Counter(new_g_mapping).items() if count > 1] + # print(len(new_duplicate_entries)) + duplicate_entries.append(new_duplicate_entries[0]) + ind = new_g_mapping.index(duplicate_entries[idx]) + # update duplicate by a missing entry + new_g_mapping[ind] = m_e + # add (index,val) pair + changes.append([ind,m_e]) + + + current_g_mapping=new_g_mapping + + print(current_g_mapping) + return [change_points_arr,current_g_mapping,group_assignment] + +# Level-dependent : Compute cost +def mm_compute_cost(group_assignment,lambda_estimates,change_points_arr,num_roles,num_segments,dic,g_mapping): + + + # create dictionary to store interaction counts + # ( no of interactions between ith and jth group within dth segment) + key_data = [ + range(0,num_roles), + range(0,num_roles), + range(0,num_segments), + ] + keys = list(itertools.product(*key_data)) + i_j_d = {key: 0 for key in keys} + + current_g_mapping = g_mapping + + level_seg_mapping = {} + for d in range(0,num_segments): + + level = current_g_mapping[d] + # print(level_seg_mapping) + if level in level_seg_mapping: + # print(d) + level_seg_mapping[level].append(d) + else: + # print(d) + level_seg_mapping[level] = [] + level_seg_mapping[level].append(d) + + # change-points of (i,j) group pair + chg_points = change_points_arr[0,0,:] + + ranges_arr = [ [chg_points[s]+1,chg_points[s+1]] for s in range(0,len(chg_points)-1)] + ranges_arr[0][0]=0 + + n = len(ranges_arr) + + + for key, val in dic.items(): + + for item in val: + d = _findSegment(ranges_arr, n, int(item)) + + # l = current_g_mapping[d] + g_a = group_assignment[g_mapping[d]] + + i=g_a.get(key[0]) #group of v1 + j=g_a.get(key[1]) #group of v2 + + # undirected graph + if i>j: + i,j=j,i + + i_j_d[(i,j,d)] += 1 + + liklihood_sum = 0 + h = len(set(g_mapping)) + + # iterate over levels + for i in range(0,h): + g_a = group_assignment[i] + + list_of_groups= [[] for _ in range(num_roles)] + for idx, val in g_a.items(): + list_of_groups[val].append(idx) + + for k in range(0, num_roles): + for g in range(k,num_roles): + + + U=list_of_groups[k] + W=list_of_groups[g] + + size_all_pairs = 0 + if k == g: + size_all_pairs = math.comb(len(U), 2) + if k != g: + size_all_pairs = len(U)*len(W) + + + inter_count = 0 + delta_t = 0 + + for d in level_seg_mapping[i]: + # print('i {} d {}'.format(i, d)) + + delta_t += (change_points_arr[k,g,d+1] - change_points_arr[k,g,d]) + # delta_t += (change_points_arr[0,0,d+1] - change_points_arr[0,0,d]) + if d == 0: + delta_t += 1 + + inter_count += i_j_d[(k,g,d)] + + # jj = level_seg_mapping[i] + lamda = lambda_estimates[k,g,i] + liklihood_sum += (inter_count*math.log(lamda) - size_all_pairs*lamda*delta_t) + # print(liklihood_sum) + return liklihood_sum + -- GitLab
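
For readers trying out the new `algo_ver=4` mode, the `ld-*` driver scripts added above all follow one pattern: build an undirected `(source, timestamp, target)` DataFrame, shift timestamps to a zero reference, drop self-loops, and hand everything to `experiment.Experiment`. Below is a minimal sketch of that pattern on a randomly generated toy event list; the toy data, parameter values and output folder are illustrative assumptions and not part of the patch, and very small or sparse inputs may still fail or give degenerate segmentations.

```
# Sketch of the driver pattern shared by the ld-* scripts (toy data assumed).
import numpy as np
import pandas as pd
import experiment

rng = np.random.default_rng(0)
n_events, n_nodes = 300, 12

src = rng.integers(0, n_nodes, size=n_events)
tgt = rng.integers(0, n_nodes, size=n_events)
df = pd.DataFrame({'source': np.minimum(src, tgt),                 # undirected ordering
                   'timestamp': np.sort(rng.integers(0, 10000, size=n_events)),
                   'target': np.maximum(src, tgt)})
df = df[df['source'] != df['target']]                              # remove self-loops

refValue = df['timestamp'].min()
df['timestamp'] -= refValue                                        # zero-based timestamps

num_roles, num_segments, num_levels = 2, 4, 2                      # K groups, segments, H levels
algo_ver = 4                                                       # level-dependent (K,H)-segmentation
tuning_params = {'theta': 1e-5, 'eta': 1}

exp_obj = experiment.Experiment(df, num_roles, num_segments, algo_ver,
                                './Results/toy/', tuning_params, num_levels, refValue)
likelihood = exp_obj.execute()
print(likelihood)
```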
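The core new idea in `Optimize.com_h_lvl_lambda_group` and the `mm_*` routines of `sbm_core.py` is that the K x K rate matrix and the node-to-group assignment are stored once per level h rather than once per segment, while `g_mapping` sends each of the `num_segments` segments to one of the `num_levels` levels. A small self-contained sketch of that state layout is given below; sizes, the seed and the variable values are illustrative assumptions, only the names mirror the patch.

```
# Sketch of the level-dependent state initialised by com_h_lvl_lambda_group:
# a per-level rate matrix, a segment-to-level mapping and a per-level
# node -> group dictionary.
import numpy as np

np.random.seed(0)
num_roles, num_segments, num_levels = 2, 5, 3
nodes = list(range(10))

# lambda_estimates[i, j, h]: Poisson rate between groups i and j at level h
lambda_estimates = np.random.rand(num_roles, num_roles, num_levels)
for h in range(num_levels):                      # keep each level symmetric
    lambda_estimates[:, :, h] = np.triu(lambda_estimates[:, :, h]) \
                                + np.triu(lambda_estimates[:, :, h], 1).T

# every level is used at least once; remaining segments get a random level
g_mapping = np.array(list(range(num_levels))
                     + [np.random.randint(num_levels)
                        for _ in range(num_segments - num_levels)])

# level-dependent membership: one node -> group dictionary per level
group_dic = {h: dict(zip(nodes, np.random.randint(num_roles, size=len(nodes))))
             for h in range(num_levels)}

print(g_mapping)      # first H entries are 0..H-1, the rest are random levels
print(group_dic[0])   # groups used while the process is in level 0
```

This mirrors how `mm_estimate_lamda_kh` and `mm_compute_cost` index rates by level (`lambda_estimates[k, g, level]`) while counting events per segment and then pooling the segments that share a level.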
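For reference, the quantity summed by `mm_compute_cost` is the usual piecewise-constant Poisson log-likelihood, except that a group pair's rate is tied to the level of each segment rather than to the segment itself: events and durations are accumulated per segment and then pooled over the segments mapped to the same level. The helper below (the function name and the toy numbers are hypothetical, not from the patch) sketches the per-pair term `N*log(lambda) - P*lambda*T` with that pooling.

```
# Sketch of the pooled per-(group pair) log-likelihood term used in mm_compute_cost.
import math

def level_term(event_counts, n_pairs, seg_durations, seg_levels, rate_per_level):
    """Pooled Poisson log-likelihood for one group pair: segments are grouped
    by their level (seg_levels plays the role of g_mapping) and segments that
    share a level share that level's rate."""
    total = 0.0
    for lvl, rate in enumerate(rate_per_level):
        n = sum(c for c, l in zip(event_counts, seg_levels) if l == lvl)   # pooled count N
        t = sum(d for d, l in zip(seg_durations, seg_levels) if l == lvl)  # pooled duration T
        total += n * math.log(rate) - n_pairs * rate * t
    return total

# toy numbers: 4 segments mapped onto 2 levels, 6 possible node pairs
print(level_term(event_counts=[5, 1, 7, 0], n_pairs=6,
                 seg_durations=[10.0, 12.0, 8.0, 15.0],
                 seg_levels=[0, 1, 0, 1], rate_per_level=[0.09, 0.01]))
```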
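Finally, `mm_linear_seg_ver_2` ends with a level-repair step: the dynamic program may assign fewer than H distinct levels to the segments, in which case occurrences of duplicated levels are overwritten by the missing ones so that all H levels stay in use. The snippet below is a simplified, deterministic sketch of that idea with an assumed toy mapping; the patch draws the missing levels in random order and additionally handles the case where there are more missing levels than initially duplicated ones.

```
# Simplified sketch of the level-repair step at the end of mm_linear_seg_ver_2.
import collections

H = 4
new_g = [0, 2, 2, 0, 0]                      # levels chosen by the DP; H = 4 expected

missing = list(set(range(H)) - set(new_g))   # levels that disappeared, here [1, 3]
dupes = [lvl for lvl, cnt in collections.Counter(new_g).items() if cnt > 1]

for m_e, dup in zip(missing, dupes):
    new_g[new_g.index(dup)] = m_e            # replace one occurrence of a duplicate

print(new_g)                                 # [1, 3, 2, 0, 0]: all H levels in use again
```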