diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..dabd1b0ea9842e979373f59b048bc78ccbbd9085 Binary files /dev/null and b/.DS_Store differ diff --git a/aff-lam-fixed.py b/aff-lam-fixed.py new file mode 100644 index 0000000000000000000000000000000000000000..c3a7ba8720e2fe607f9edd96768ea9813d36d411 --- /dev/null +++ b/aff-lam-fixed.py @@ -0,0 +1,275 @@ + + +""" +Effect of lambda: Fixed +Dataset-1 +""" +import numpy as np +import pandas as pd +import utils +import sbm_core +import math +from itertools import combinations +import itertools +from sklearn.metrics.cluster import adjusted_rand_score + +# Initilaize +np.random.seed(34573251) + +results = np.zeros((50,3) , dtype=float) + +for itr_no in range(0,50): + + num_roles=2 + num_vertices=20 + num_segments = 4 + + NO_SAMPLES= 1850 + group_assignment= np.random.randint(num_roles, size=(num_vertices)) + + nodes = np.arange(num_vertices) + + list_of_groups= [[] for _ in range(num_roles)] + + for idx, val in enumerate(group_assignment): + list_of_groups[val].append(nodes[idx]) + + # print(list_of_groups) + + size_all_pairs = {} + for k in range(0, num_roles): + for g in range(k, num_roles): + U=list_of_groups[k] + W=list_of_groups[g] + + if k == g: + size_all_pairs[k,g] = math.comb(len(U), 2) + if k != g: + size_all_pairs[k,g] = len(U)*len(W) + + lamda_arr = np.ones((num_roles, num_roles,num_segments) , dtype=float) + lamda_arr_act = np.zeros((num_roles, num_roles,num_segments) , dtype=float) + + num_levels = 2 + H =num_levels + + # h-level lambda estimates + lambda_estimates_h = np.random.rand(num_roles, num_roles, H) + + # _itr = 8 # Change _itr from 0 to 8 for large lambda differences + # _itr = 1 + # yu = (9-_itr)*.1 + # lambda_estimates_h[0,0,:] = [yu, 0.01] + # lambda_estimates_h[0,1,:] = [0.01, yu] + # lambda_estimates_h[1,0,:] = lambda_estimates_h[0,1,:] + # lambda_estimates_h[1,1,:] = [yu, yu] + + _itr = 5 # Change _itr from 0 to 8 for smaller lambda differences + yu = (9-_itr)*.01 + lambda_estimates_h[0,0,:] = [yu, 0.01] + lambda_estimates_h[0,1,:] = [0.01, yu] + lambda_estimates_h[1,0,:] = lambda_estimates_h[0,1,:] + lambda_estimates_h[1,1,:] = [yu, yu] + + + l1 =list(range(0, H)) + l2 = [] + if num_segments > num_levels: + l2 = [np.random.randint(0,H) for i in range(num_segments-H)] + + # Mapping from segment to a level + g_mapping= np.array(l1 + l2) + # print('g mapping {}'.format(g_mapping)) + #Initialize lamda + lamda_arr = np.zeros((num_roles, num_roles,num_segments) , dtype=float) + for d in range(0, num_segments): + lamda_arr[:,:, d]= lambda_estimates_h[:,:,g_mapping[d]] + + change_points_arr = np.zeros((num_roles, num_roles, num_segments+1) , dtype=int) + df_all= None + + points= list(range(0, (num_segments+1)*NO_SAMPLES, NO_SAMPLES)) + list1 = [] + + # Generate piecewise non-homogeneous poisson process + for k in range(0, num_roles): + for g in range(k, num_roles): + comb = [] + if k == g: + comb = list(combinations(list_of_groups[k], 2)) + # print(type(comb)) + else: + # comb = [] + key_data = [list_of_groups[k],list_of_groups[g],] + comb = list(itertools.product(*key_data)) + # print(comb) + if len(comb) != size_all_pairs[k,g]: + print('not equal..') + + change_points_arr[k,g,:] = points + lamda_arr[k,g,:] = lamda_arr[g,k,:] + + tot_count = np.zeros((num_segments) , dtype=float) + + for pair in comb: + + for d in range(0,num_segments): + + s = np.random.poisson(lamda_arr[k,g,d], NO_SAMPLES) + # print(np.count_nonzero(s)) + tot_count[d] += np.count_nonzero(s) + + list1=[i 
for i, e in enumerate(s) if e != 0] + + if len(list1) == 0: + print('zero') + + list1 = [x+points[d] for x in list1] + + df = pd.DataFrame(data=list1) + df.columns =['timestamp'] + + + N= df.size + + list_start_stations =[pair[0]] * N + list_end_stations =[pair[1]] * N + + df['source'] = list_start_stations + df['target'] = list_end_stations + + df_all=pd.concat([df_all, df], ignore_index=True) + + for d in range(0,num_segments): + lamda_arr_act[k,g,d] = tot_count[d]/(NO_SAMPLES*len(comb)) + # print(tot_count[d]) + ## Other preparations + + # Remove self loops + df_all = df_all[((df_all['source'] ) != (df_all['target']))] + #sort + df_all=df_all.sort_values('timestamp') + df_all = df_all[['target', 'timestamp','source']] + + # Save as .csv file + # df_all.to_csv('./Data/synthetic_ground_truth_g1.csv') + + df=df_all + dest_folder='./Results/synthetic/3' + t_df = df['timestamp'] + + nodes_arr = np.union1d(df['target'],df['source']).astype(int) + # list of nodes + nodes = nodes_arr.tolist() + num_vertices = len(nodes) + + # node-group dictionary + group_dic = {} + keys = nodes + values = list(group_assignment) + group_dic = dict(zip(keys,values)) + + + # create a new dictionary - key: node-pair , value: list of timestamps + dic=df.groupby(['source','target'])['timestamp'].apply(list).to_dict() + # print('{} {} {} '.format(group_dic, lamda_arr_act,change_points_arr)) + + + def _swap (row): + if row['source'] > row['target']: + row['source'] , row['target'] =row['target'] , row['source'] + return row + + # Undirected graph + df=df.apply(lambda row: _swap(row), axis=1) + #scale timestamps for zeroth reference point + refValue = df['timestamp'].min() + df['timestamp'] -= refValue + + # Experiment + import experiment + + # User parameters + num_roles=2 + # num_segments=4 + # num_levels=2# Optional arg + algo_ver=3 + dest_folder='./Results/synthetic/' + + # tuning parameters + theta = 1e-7 + eta = 1 + tuning_params= {'theta':theta,'eta':eta} + + + exp_obj = experiment.Experiment(df,num_roles,num_segments,algo_ver,dest_folder,tuning_params,num_levels,refValue) + [itr_d,likelihood_d,group_dic_d,lambda_estimates_d,change_points_arr_d] = exp_obj.execute() + + + t_df = sorted(t_df) + + chg_points = change_points_arr_d[0,0,:] + ranges_arr = [ [chg_points[s]+1,chg_points[s+1]] for s in range(0,len(chg_points)-1)] + ranges_arr[0][0] = 0 + list_time_stamps = list(t_df) + + + # iterate over timestamps list + dis_arr = list() + gt_arr = list() + + + for item in list_time_stamps: + + # find the segment which the timestamp belongs + # (is dependent on which groups the two nodes belong) + d = sbm_core._findSegment(ranges_arr, len(ranges_arr) , int(item)) + dis_arr.append(d) + + + chg_points = change_points_arr[0,0,:] + ranges_arr = [ [chg_points[s]+1,chg_points[s+1]] for s in range(0,len(chg_points)-1)] + ranges_arr[0][0] = 0 + list_time_stamps = list(t_df) + + + # iterate over timestamps list + + for item in list_time_stamps: + + # find the segment which the timestamp belongs + # (is dependent on which groups the two nodes belong) + d = sbm_core._findSegment(ranges_arr, len(ranges_arr) , int(item)) + gt_arr.append(d) + + + ind = adjusted_rand_score(gt_arr,dis_arr) + # print('rand index: seg {} : {}'.format(_itr, ind)) + + g1= group_dic_d + g2= group_dic_d[1] + + ds= list(group_dic_d.values() ) + gt1 = list(g1.values()) + + ind_grp=adjusted_rand_score(ds,gt1) + # print('rand index: group {} : {}'.format(_itr, ind_grp)) + + results[itr_no][0] = ind + results[itr_no][1] = itr_d + results[itr_no][2] = ind_grp + 
+import pickle +pickle.dump(results, open('large-fixed-file-{}.pickle'.format(_itr), 'wb')) + +arr = results +ll_avg_val = (sum(arr)/len(arr)) + +#FINAL RESULTS + +print(ll_avg_val) +print(max(arr[:,0])) +print(min(arr[:,0])) + +print(max(arr[:,1])) +print(min(arr[:,1])) diff --git a/lam-aff-2-ld.py b/aff-lam-ld.py similarity index 89% rename from lam-aff-2-ld.py rename to aff-lam-ld.py index 1d4c07bf7c73728106f2c95f0eb8527b5e4c7eab..deb26c6ca48be316e9dff966d36ee2f9837f29de 100644 --- a/lam-aff-2-ld.py +++ b/aff-lam-ld.py @@ -1,10 +1,10 @@ + """ Effect of lambda: LD Dataset-2 """ - import numpy as np import pandas as pd import utils @@ -14,19 +14,18 @@ from itertools import combinations import itertools from sklearn.metrics.cluster import adjusted_rand_score # Initilaize -np.random.seed(3457325) - -res = np.zeros((9,5) , dtype=float) +np.random.seed(34573251) +results = np.zeros((50,3) , dtype=float) -for _itr in range(0,1): +for itr_no in range(0,50): num_roles=2 num_vertices=20 num_segments = 4 num_levels = 2 - NO_SAMPLES= 200 + NO_SAMPLES= 1850 nodes = np.arange(num_vertices) lamda_arr_act = np.zeros((num_roles, num_roles,num_levels) , dtype=float) @@ -36,12 +35,18 @@ for _itr in range(0,1): # h-level lambda estimates lambda_estimates_h = np.random.rand(num_roles, num_roles, H) - lambda_estimates_h = 1e-2*np.random.randint(11,99, size=(num_roles, num_roles, H)) - # Make high variant lambdas + _itr = 3 yu = (9-_itr)*.1 - lambda_estimates_h[0,0,:] = [yu, 0.1] - lambda_estimates_h[0,1,:] = [0.1, yu] + lambda_estimates_h[0,0,:] = [yu, 0.01] + lambda_estimates_h[0,1,:] = [0.01, yu] + lambda_estimates_h[1,0,:] = lambda_estimates_h[0,1,:] + lambda_estimates_h[1,1,:] = [yu, yu] + + _itr = 4 + yu = (9-_itr)*.01 + lambda_estimates_h[0,0,:] = [yu, 0.01] + lambda_estimates_h[0,1,:] = [0.01, yu] lambda_estimates_h[1,0,:] = lambda_estimates_h[0,1,:] lambda_estimates_h[1,1,:] = [yu, yu] @@ -188,13 +193,10 @@ for _itr in range(0,1): df_all=pd.concat([df_all, df], ignore_index=True) - # for dd in level_seg_mapping: - # dd = d + lamda_arr_act[k,g,i] = round(((tot_count[i])/(NO_SAMPLES*com_len[i])),1) lamda_arr_act[g,k,i] = lamda_arr_act[k,g,i] - # print('tot count') - # print(tot_count[dd]) - # print(' {} {} {} {} : k g d :lamb'.format(k,g,d,lamda_arr_act[g,k,dd])) + print(' {} {} {} {} : k g d :lamb'.format(k,g,i,lamda_arr_act[g,k,i])) # Remove self loops @@ -205,8 +207,7 @@ for _itr in range(0,1): # Save as .csv file # df_all.to_csv('./Data/synthetic_ground_truth_g1.csv') - - + df= None df=df_all dest_folder='./Results/synthetic/3' @@ -296,8 +297,6 @@ for _itr in range(0,1): ind_seg = adjusted_rand_score(gt_arr,dis_arr) print('ind {} : {}'.format(_itr, ind_seg)) - liklihood_sum = sbm_core.mm_compute_cost(group_dic,lamda_arr_act,change_points_arr,num_roles,num_segments,dic,g_mapping) - print(' Initial Actual likelihood .......%f'%liklihood_sum) print('g mapping {}'.format(g_mapping)) @@ -308,13 +307,12 @@ for _itr in range(0,1): list_of_groups[val].append(idx) print('group assignments {}: {}'.format(e_h,list_of_groups)) - #group ass, of level 1 - list_of_groups_1= [[] for _ in range(num_roles)] - #group ass, of level 2 - list_of_groups_2= [[] for _ in range(num_roles)] + g1= group_dic_d[0] g2= group_dic_d[1] + # print('rand index: group {} : {}'.format(_itr, ind_grp)) + found_cont = 0 for i_h in range(0,num_levels): @@ -329,8 +327,7 @@ for _itr in range(0,1): ds= list(group_dic_d[i_h].values() ) gt1 = list(g1.values()) gt2 = list(g2.values()) - - + ind1=adjusted_rand_score(ds,gt1) 
ind2=adjusted_rand_score(ds,gt2) @@ -340,35 +337,25 @@ for _itr in range(0,1): ind = found_cont/2 - res[_itr][1] = ind - res[_itr][4] = ind_seg -print('end') - - - -# 0.9898951352373943 -# 0.9904822820772498 -# 0.9894069501702982 -# 0.9892811884102554 -# 0.9893223431465236 -# 0.9886669698061425 -# 0.4041524218474172 -# 0.4968349779236352 -# 0.49583738728791915 - + results[itr_no][0] = ind_seg + results[itr_no][1] = it + results[itr_no][2] = ind +print('end') + +import pickle +# pickle.dump(results, open('max-small-file-{}.pickle'.format(_itr), 'wb')) +#FINAL RESULTS -# 3 -# 3 -# 3 -# 6 -# 4 -# 3 -# 5 -# 4 -# 3 +arr = results +ll_avg_val = (sum(arr)/len(arr)) +print(ll_avg_val) +print(max(arr[:,0])) +print(min(arr[:,0])) +print(max(arr[:,1])) +print(min(arr[:,1])) diff --git a/bikes_santander.py b/bikes_santander.py index 003aadb2082b4187139ec9472f499835da4fef93..ef260be70bfac719397be52f5495f6e5b6ec51cc 100755 --- a/bikes_santander.py +++ b/bikes_santander.py @@ -62,7 +62,7 @@ df = df[:1000] num_roles=3 num_segments=7 num_levels=5 -algo_ver= 2 +algo_ver= 3 dest_folder='./Results/bikes/' # tuning parameters diff --git a/lam-aff-1-fixed.py b/lam-aff-1-fixed.py deleted file mode 100644 index 33f914a9c85b2138054b272cc45f2c68a6329567..0000000000000000000000000000000000000000 --- a/lam-aff-1-fixed.py +++ /dev/null @@ -1,269 +0,0 @@ - -""" -Affect of Lambda -Dataset-1 -""" - -import numpy as np -import pandas as pd -import utils -import sbm_core -import math -from itertools import combinations -import itertools -from sklearn.metrics.cluster import adjusted_rand_score - -# Initilaize -np.random.seed(107) - -num_roles=2 -num_vertices=25 -num_segments = 2 - -NO_SAMPLES= 95 -group_assignment= np.random.randint(num_roles, size=(num_vertices)) - -nodes = np.arange(num_vertices) - -list_of_groups= [[] for _ in range(num_roles)] - -for idx, val in enumerate(group_assignment): - list_of_groups[val].append(nodes[idx]) - -print(list_of_groups) - -size_all_pairs = {} -for k in range(0, num_roles): - for g in range(k, num_roles): - U=list_of_groups[k] - W=list_of_groups[g] - - if k == g: - size_all_pairs[k,g] = math.comb(len(U), 2) - if k != g: - size_all_pairs[k,g] = len(U)*len(W) - -lamda_arr = np.ones((num_roles, num_roles,num_segments) , dtype=float) -lamda_arr = 1e-1* np.random.randint(1,9, size=(num_roles, num_roles,num_segments)) - -#set value for each iteration ( 0 - 8 ) -_itr = 8 -_itr = 0 - -yu = (9-_itr)*.1 -lamda_arr[0,0]=[yu, 0.1] -lamda_arr[0,1]= [0.1, yu] -lamda_arr[1,0]=lamda_arr[0,1] -lamda_arr[1,1]=[yu, yu] - -lamda_arr_act = np.zeros((num_roles, num_roles,num_segments) , dtype=float) - -change_points_arr = np.zeros((num_roles, num_roles, num_segments+1) , dtype=int) -df_all= None - -points= list(range(0, (num_segments+1)*NO_SAMPLES, NO_SAMPLES)) -list1 = [] - -# Generate piecewise non-homogeneous poisson process -for k in range(0, num_roles): - for g in range(k, num_roles): - comb = [] - if k == g: - comb = list(combinations(list_of_groups[k], 2)) - # print(type(comb)) - else: - # comb = [] - key_data = [list_of_groups[k],list_of_groups[g],] - comb = list(itertools.product(*key_data)) - # print(comb) - if len(comb) != size_all_pairs[k,g]: - print('not equal..') - - change_points_arr[k,g,:] = points - lamda_arr[k,g,:] = lamda_arr[g,k,:] - - tot_count = np.zeros((num_segments) , dtype=float) - - for pair in comb: - - for d in range(0,num_segments): - - s = np.random.poisson(lamda_arr[k,g,d], NO_SAMPLES) - # print(np.count_nonzero(s)) - tot_count[d] += np.count_nonzero(s) - - list1=[i for i, e 
in enumerate(s) if e != 0] - - if len(list1) == 0: - print('zero') - - list1 = [x+points[d] for x in list1] - - df = pd.DataFrame(data=list1) - df.columns =['timestamp'] - - N= df.size - - list_start_stations =[pair[0]] * N - list_end_stations =[pair[1]] * N - - df['source'] = list_start_stations - df['target'] = list_end_stations - - df_all=pd.concat([df_all, df], ignore_index=True) - - for d in range(0,num_segments): - lamda_arr_act[k,g,d] = tot_count[d]/(NO_SAMPLES*len(comb)) - # print(tot_count[d]) -## Other preparations - -# Remove self loops -df_all = df_all[((df_all['source'] ) != (df_all['target']))] -#sort -df_all=df_all.sort_values('timestamp') -df_all = df_all[['target', 'timestamp','source']] - -# Save as .csv file -# df_all.to_csv('./Data/synthetic_ground_truth_g1.csv') - -df=df_all -dest_folder='./Results/synthetic/3' -t_df = df['timestamp'] - -nodes_arr = np.union1d(df['target'],df['source']).astype(int) -# list of nodes -nodes = nodes_arr.tolist() -num_vertices = len(nodes) - -# node-group dictionary -group_dic = {} -keys = nodes -values = list(group_assignment) -group_dic = dict(zip(keys,values)) - - -# create a new dictionary - key: node-pair , value: list of timestamps -dic=df.groupby(['source','target'])['timestamp'].apply(list).to_dict() -print('{} {} {} '.format(group_dic, lamda_arr_act,change_points_arr)) - - -liklihood_sum = sbm_core.compute_cost(group_dic,lamda_arr_act,change_points_arr,num_roles,num_segments,dic) -print(' Initial Actual likelihood .......%f'%liklihood_sum) - -def _swap (row): - if row['source'] > row['target']: - row['source'] , row['target'] =row['target'] , row['source'] - return row - -# Undirected graph -df=df.apply(lambda row: _swap(row), axis=1) -#scale timestamps for zeroth reference point -refValue = df['timestamp'].min() -df['timestamp'] -= refValue - -# Experiment - -import experiment - - -# User parameters -num_roles=2 -num_segments=2 -num_levels=2# Optional arg -algo_ver=3 -dest_folder='./Results/synthetic/' - -# tuning parameters -theta = 0 -eta = 1 -tuning_params= {'theta':theta,'eta':eta} - -import time -start_time = time.time() - -exp_obj = experiment.Experiment(df,num_roles,num_segments,algo_ver,dest_folder,tuning_params,num_levels,refValue) -[itr_d,likelihood_d,group_dic_d,lambda_estimates_d,change_points_arr_d] = exp_obj.execute() - -print("--- %s seconds ---" % (time.time() - start_time)) - - -t_df = sorted(t_df) - -chg_points = change_points_arr_d[0,0,:] -ranges_arr = [ [chg_points[s]+1,chg_points[s+1]] for s in range(0,len(chg_points)-1)] -ranges_arr[0][0] = 0 -list_time_stamps = list(t_df) - - -# iterate over timestamps list -dis_arr = list() -gt_arr = list() - - -for item in list_time_stamps: - - # find the segment which the timestamp belongs - # (is dependent on which groups the two nodes belong) - d = sbm_core._findSegment(ranges_arr, len(ranges_arr) , int(item)) - dis_arr.append(d) - - -chg_points = change_points_arr[0,0,:] -ranges_arr = [ [chg_points[s]+1,chg_points[s+1]] for s in range(0,len(chg_points)-1)] -ranges_arr[0][0] = 0 -list_time_stamps = list(t_df) - - -# iterate over timestamps list - -for item in list_time_stamps: - - # find the segment which the timestamp belongs - # (is dependent on which groups the two nodes belong) - d = sbm_core._findSegment(ranges_arr, len(ranges_arr) , int(item)) - gt_arr.append(d) - - -ind = adjusted_rand_score(gt_arr,dis_arr) -print('rand index: seg {} : {}'.format(_itr, ind)) - -g1= group_dic_d -g2= group_dic_d[1] - -ds= list(group_dic_d.values() ) -gt1 = list(g1.values()) - 
-ind_grp=adjusted_rand_score(ds,gt1) -print('rand index: group {} : {}'.format(_itr, ind_grp)) - -# likelihood for single group and single segment # Normlaized likelihood -# num_roles=1 -# num_segments=1 -# num_levels=1 -# exp_obj = experiment.Experiment(df,num_roles,num_segments,algo_ver,dest_folder,tuning_params,num_levels,refValue) -# exp_obj.execute() - - -# 0.9757628970136779 -# 0.9766007299178403 -# 0.976928378298833 -# 0.9768191326709813 -# 0.9786828903097778 -# 0.9762078453564311 -# 0.9731767168042805 -# 0.9753286095154328 -# 0.5773249110735138 - - -# 3 -# 3 -# 2 -# 3 -# 3 -# 3 -# 3 -# 4 -# 4 - - - diff --git a/lam-aff-1-ld.py b/lam-aff-1-ld.py deleted file mode 100644 index 52c3e70f748d4a1c6fd8c8b8aeecf21640092d4c..0000000000000000000000000000000000000000 --- a/lam-aff-1-ld.py +++ /dev/null @@ -1,386 +0,0 @@ - - -""" -Effect of lambda: LD -Dataset-1 -""" -import numpy as np -import pandas as pd -import utils -import sbm_core -import math -from itertools import combinations -import itertools -from sklearn.metrics.cluster import adjusted_rand_score -# Initilaize -np.random.seed(155) - -res = np.zeros((9,5) , dtype=float) - - -for _itr in range(8,9): - - num_roles=2 - num_vertices=20 - num_segments = 4 - num_levels = 2 - - NO_SAMPLES= 200 - nodes = np.arange(num_vertices) - lamda_arr_act = np.zeros((num_roles, num_roles,num_levels) , dtype=float) - - - H =num_levels - print('k-h levels %d'%(num_levels)) - - # h-level lambda estimates - lambda_estimates_h = np.random.rand(num_roles, num_roles, H) - lambda_estimates_h = 1e-2*np.random.randint(11,99, size=(num_roles, num_roles, H)) - - # Make high variant lambdas - - yu = (9-_itr)*.1 - lambda_estimates_h[0,0,:] = [yu, 0.1] - lambda_estimates_h[0,1,:] = [0.1, yu] - lambda_estimates_h[1,0,:] = lambda_estimates_h[0,1,:] - lambda_estimates_h[1,1,:] = [yu, yu] - - - l1 =list(range(0, H)) - l2 = [] - if num_segments > num_levels: - l2 = [np.random.randint(0,H) for i in range(num_segments-H)] - - # Mapping from segment to a level - g_mapping= np.array(l1 + l2) - print('g mapping {}'.format(g_mapping)) - - # initilaize group assignment randomly - group_assignment_arr= np.random.randint(num_roles, size=(num_levels,num_vertices)) - # node-group dictionary - group_dic = {} - - for i in range(0,num_levels ): - level = i - - group_dic_level = {} - keys = nodes - values = list(group_assignment_arr[level]) - group_dic_level = dict(zip(keys,values)) - group_dic[i] = group_dic_level - print('initial') - # print(group_dic) - - for e_h in range(0,num_segments): - g_a = group_dic[g_mapping[e_h]] - list_of_groups= [[] for _ in range(num_roles)] - for idx, val in g_a.items(): - list_of_groups[val].append(idx) - print('group assignments {}: {}'.format(e_h,list_of_groups)) - # Plotting - - #Initialize lamda - lamda_arr = np.zeros((num_roles, num_roles,num_segments) , dtype=float) - for d in range(0, num_segments): - for k in range(0, num_roles): - for g in range(k, num_roles): - lamda_arr[k,g, d]= lambda_estimates_h[k,g,g_mapping[d]] - lamda_arr[g,k, d]= lamda_arr[k,g, d] - change_points_arr = np.zeros((num_roles, num_roles, num_segments+1) , dtype=int) - df_all= None - - points= list(range(0, (num_segments+1)*NO_SAMPLES, NO_SAMPLES)) - list1 = [] - - level_seg_mapping = {} - for d in range(num_segments): - level = g_mapping[d] - if level in level_seg_mapping: - level_seg_mapping[level].append(d) - else: - level_seg_mapping[level] = [] - level_seg_mapping[level].append(d) - # %% - # Generate piecewise non-homogeneous poisson process - - - tot_count = 
np.zeros((num_levels) , dtype=float) - com_len = np.zeros((num_levels) , dtype=float) - # for pair in comb: - - for i in range(0,num_levels): - # i = g_mapping[d] - group_assignment = group_assignment_arr[i] - - print(group_assignment) - - list_of_groups= [[] for _ in range(num_roles)] - - for idx, val in enumerate(group_assignment): - list_of_groups[val].append(nodes[idx]) - - # print(list_of_groups) - - size_all_pairs = {} - - for kk in range(0, num_roles): - for gg in range(kk, num_roles): - U=list_of_groups[kk] - W=list_of_groups[gg] - - if kk == gg: - size_all_pairs[kk,gg] = math.comb(len(U), 2) - if kk != gg: - size_all_pairs[kk,gg] = len(U)*len(W) - - for k in range(0, num_roles): - for g in range(k, num_roles): - - - change_points_arr[k,g,:] = points - lamda_arr[k,g,:] = lamda_arr[g,k,:] - - - - comb = [] - if k == g: - comb = list(combinations(list_of_groups[k], 2)) - # print(type(comb)) - else: - # comb = [] - key_data = [list_of_groups[k],list_of_groups[g],] - comb = list(itertools.product(*key_data)) - # print(comb) - if len(comb) != size_all_pairs[k,g]: - print('not equal..') - - - print('d val {}'.format( d)) - com_len[i] = len(comb) - # print('comb len {}'.format( com_len[d])) - tot_count[i] = 0 - - for pair in comb: - s = np.random.poisson(lamda_arr[k,g,d], NO_SAMPLES) - # print(np.count_nonzero(s)) - tot_count[i] += np.count_nonzero(s) - - list_org=[i for i, e in enumerate(s) if e != 0] - - if len(list_org) == 0: - print('zero') - - for d in level_seg_mapping[i]: - - - list1 = [x+points[d] for x in list_org] - df= None - df = pd.DataFrame(data=list1) - df.columns =['timestamp'] - - # print(list1) - # if max(list1) > 799: - # print('{} {}'.format(d, max(list1))) - N= df.size - - # print(pair) - # print(pair[0]) - - list_start_stations =[pair[0]] * N - list_end_stations =[pair[1]] * N - - df['source'] = list_start_stations - df['target'] = list_end_stations - - df_all=pd.concat([df_all, df], ignore_index=True) - - # for dd in level_seg_mapping: - # dd = d - lamda_arr_act[k,g,i] = round(((tot_count[i])/(NO_SAMPLES*com_len[i])),1) - lamda_arr_act[g,k,i] = lamda_arr_act[k,g,i] - # print('tot count') - # print(tot_count[dd]) - # print(' {} {} {} {} : k g d :lamb'.format(k,g,d,lamda_arr_act[g,k,dd])) - print(' {} {} {} {} : k g d :lamb'.format(k,g,i,lamda_arr_act[g,k,i])) - - # Remove self loops - df_all = df_all[((df_all['source'] ) != (df_all['target']))] - #sort - df_all=df_all.sort_values('timestamp') - df_all = df_all[['target', 'timestamp','source']] - - # Save as .csv file - # df_all.to_csv('./Data/synthetic_ground_truth_g1.csv') - - - df= None - df=df_all - dest_folder='./Results/synthetic/3' - t_df = df['timestamp'] - - nodes_arr = np.union1d(df['target'],df['source']).astype(int) - # list of nodes - nodes = nodes_arr.tolist() - num_vertices = len(nodes) - - - # create a new dictionary - key: node-pair , value: list of timestamps - dic=df.groupby(['source','target'])['timestamp'].apply(list).to_dict() - print('{} {} {} '.format(group_dic, lamda_arr_act,change_points_arr)) - - - # liklihood_sum = sbm_core.mm_compute_cost(group_dic,lamda_arr_act,change_points_arr,num_roles,num_segments,dic,g_mapping) - # print(' Initial Actual likelihood .......%f'%liklihood_sum) - - def _swap (row): - if row['source'] > row['target']: - row['source'] , row['target'] =row['target'] , row['source'] - return row - - # Undirected graph - df=df.apply(lambda row: _swap(row), axis=1) - #scale timestamps for zeroth reference point - refValue = df['timestamp'].min() - df['timestamp'] -= refValue - 
- - - chg_points = change_points_arr[0,0,:] - ranges_arr = [ [chg_points[s]+1,chg_points[s+1]] for s in range(0,len(chg_points)-1)] - ranges_arr[0][0] = 0 - list_time_stamps = list(t_df) - - - # iterate over timestamps list - gt_arr = list() - for item in list_time_stamps: - - # find the segment which the timestamp belongs - # (is dependent on which groups the two nodes belong) - d = sbm_core._findSegment(ranges_arr, len(ranges_arr) , int(item)) - gt_arr.append(d) - - # Experiment - import experiment - - # User parameters - # num_roles=2 - # num_segments=10 - # num_levels=5# Optional arg - algo_ver=4 - dest_folder='./Results/synthetic/' - - # tuning parameters - theta = 1e-7 - eta = 1 - tuning_params= {'theta':theta,'eta':eta} - - - - exp_obj = experiment.Experiment(df,num_roles,num_segments,algo_ver,dest_folder,tuning_params,num_levels,refValue) - # [likelihood_f,group_dic_f] = exp_obj.execute() - [it,ll1,group_dic_d,lambda_estimates,change_points_arr_d]= exp_obj.execute() - - - # SEGMENTATION ACCURACY - - t_df = sorted(t_df) - - chg_points = change_points_arr_d[0,0,:] - ranges_arr = [ [chg_points[s]+1,chg_points[s+1]] for s in range(0,len(chg_points)-1)] - ranges_arr[0][0] = 0 - list_time_stamps = list(t_df) - - - # iterate over timestamps list - dis_arr = list() - for item in list_time_stamps: - - # find the segment which the timestamp belongs - # (is dependent on which groups the two nodes belong) - d = sbm_core._findSegment(ranges_arr, len(ranges_arr) , int(item)) - dis_arr.append(d) - - - gt_arr= np.array(gt_arr, dtype=np.float64) - dis_arr= np.array(dis_arr, dtype=np.float64) - ind_seg = adjusted_rand_score(gt_arr,dis_arr) - print('ind {} : {}'.format(_itr, ind_seg)) - - - liklihood_sum = sbm_core.mm_compute_cost(group_dic,lamda_arr_act,change_points_arr,num_roles,num_segments,dic,g_mapping) - print(' Initial Actual likelihood .......%f'%liklihood_sum) - - print('g mapping {}'.format(g_mapping)) - - for e_h in range(0,num_segments): - g_a = group_dic[g_mapping[e_h]] - list_of_groups= [[] for _ in range(num_roles)] - for idx, val in g_a.items(): - list_of_groups[val].append(idx) - print('group assignments {}: {}'.format(e_h,list_of_groups)) - - #group ass, of level 1 - list_of_groups_1= [[] for _ in range(num_roles)] - #group ass, of level 2 - list_of_groups_2= [[] for _ in range(num_roles)] - g1= group_dic_d[0] - g2= group_dic_d[1] - - - found_cont = 0 - - for i_h in range(0,num_levels): - # i_h level - grp = group_dic_d[i_h] - - list_of_groups_d= [[] for _ in range(num_roles)] - - for idx, val in grp.items(): - list_of_groups_d[val].append(idx) - - ds= list(group_dic_d[i_h].values() ) - gt1 = list(g1.values()) - gt2 = list(g2.values()) - - - ind1=adjusted_rand_score(ds,gt1) - ind2=adjusted_rand_score(ds,gt2) - - - d_in = max(ind1,ind2) - found_cont += d_in - - ind = found_cont/2 - - res[_itr][1] = ind - res[_itr][4] = ind_seg -print('end') - - - -# 0.989349 -# 0.9899235585218414 -# 0.9887209171780673 -# 0.9900141929986654 -# 0.9900915114849232 -# 0.9895393785077311 -# 0.9890441642420313 -# 0.5056343918828786 -# 0.489279 - - - -# 2 -# 3 -# 2 -# 3 -# 3 -# 3 -# 4 -# 3 -# 3 - - - - diff --git a/lam-aff-2-fixed.py b/lam-aff-2-fixed.py deleted file mode 100644 index 0ec886437aa08f047ddba664695d07c02d368fcf..0000000000000000000000000000000000000000 --- a/lam-aff-2-fixed.py +++ /dev/null @@ -1,262 +0,0 @@ - - -""" -Affect of Lambda -Dataset-2 -""" - -import numpy as np -import pandas as pd -import utils -import sbm_core -import math -from itertools import combinations -import itertools 
-from sklearn.metrics.cluster import adjusted_rand_score -# Initilaize -np.random.seed(1137) - -num_roles=2 -num_vertices=25 -num_segments = 2 - -NO_SAMPLES= 100 -group_assignment= np.random.randint(num_roles, size=(num_vertices)) - -nodes = np.arange(num_vertices) - -list_of_groups= [[] for _ in range(num_roles)] - -for idx, val in enumerate(group_assignment): - list_of_groups[val].append(nodes[idx]) - -print(list_of_groups) - -size_all_pairs = {} -for k in range(0, num_roles): - for g in range(k, num_roles): - U=list_of_groups[k] - W=list_of_groups[g] - - if k == g: - size_all_pairs[k,g] = math.comb(len(U), 2) - if k != g: - size_all_pairs[k,g] = len(U)*len(W) - -lamda_arr = np.ones((num_roles, num_roles,num_segments) , dtype=float) -lamda_arr = 1e-1* np.random.randint(1,9, size=(num_roles, num_roles,num_segments)) - -#set value for each iteration ( 0 - 8 ) -_itr = 8 -_itr = 0 - -yu = (9-_itr)*.1 -lamda_arr[0,0]=[yu, 0.1] -lamda_arr[0,1]= [0.1, yu] -lamda_arr[1,0]=lamda_arr[0,1] -lamda_arr[1,1]=[yu, yu] - -lamda_arr_act = np.zeros((num_roles, num_roles,num_segments) , dtype=float) - -change_points_arr = np.zeros((num_roles, num_roles, num_segments+1) , dtype=int) -df_all= None - -points= list(range(0, (num_segments+1)*NO_SAMPLES, NO_SAMPLES)) -list1 = [] - -# Generate piecewise non-homogeneous poisson process -for k in range(0, num_roles): - for g in range(k, num_roles): - comb = [] - if k == g: - comb = list(combinations(list_of_groups[k], 2)) - # print(type(comb)) - else: - # comb = [] - key_data = [list_of_groups[k],list_of_groups[g],] - comb = list(itertools.product(*key_data)) - # print(comb) - if len(comb) != size_all_pairs[k,g]: - print('not equal..') - - change_points_arr[k,g,:] = points - lamda_arr[k,g,:] = lamda_arr[g,k,:] - - tot_count = np.zeros((num_segments) , dtype=float) - - for pair in comb: - - for d in range(0,num_segments): - - s = np.random.poisson(lamda_arr[k,g,d], NO_SAMPLES) - # print(np.count_nonzero(s)) - tot_count[d] += np.count_nonzero(s) - - list1=[i for i, e in enumerate(s) if e != 0] - - if len(list1) == 0: - print('zero') - - list1 = [x+points[d] for x in list1] - - df = pd.DataFrame(data=list1) - df.columns =['timestamp'] - - N= df.size - - list_start_stations =[pair[0]] * N - list_end_stations =[pair[1]] * N - - df['source'] = list_start_stations - df['target'] = list_end_stations - - df_all=pd.concat([df_all, df], ignore_index=True) - - for d in range(0,num_segments): - lamda_arr_act[k,g,d] = tot_count[d]/(NO_SAMPLES*len(comb)) - # print(tot_count[d]) -## Other preparations - -# Remove self loops -df_all = df_all[((df_all['source'] ) != (df_all['target']))] -#sort -df_all=df_all.sort_values('timestamp') -df_all = df_all[['target', 'timestamp','source']] - -# Save as .csv file -# df_all.to_csv('./Data/synthetic_ground_truth_g1.csv') - -df=df_all -dest_folder='./Results/synthetic/3' -t_df = df['timestamp'] - -nodes_arr = np.union1d(df['target'],df['source']).astype(int) -# list of nodes -nodes = nodes_arr.tolist() -num_vertices = len(nodes) - -# node-group dictionary -group_dic = {} -keys = nodes -values = list(group_assignment) -group_dic = dict(zip(keys,values)) - - -# create a new dictionary - key: node-pair , value: list of timestamps -dic=df.groupby(['source','target'])['timestamp'].apply(list).to_dict() -print('{} {} {} '.format(group_dic, lamda_arr_act,change_points_arr)) - - -liklihood_sum = sbm_core.compute_cost(group_dic,lamda_arr_act,change_points_arr,num_roles,num_segments,dic) -print(' Initial Actual likelihood .......%f'%liklihood_sum) - 
-def _swap (row): - if row['source'] > row['target']: - row['source'] , row['target'] =row['target'] , row['source'] - return row - -# Undirected graph -df=df.apply(lambda row: _swap(row), axis=1) -#scale timestamps for zeroth reference point -refValue = df['timestamp'].min() -df['timestamp'] -= refValue - -# Experiment - -import experiment - - -# User parameters -num_roles=2 -num_segments=2 -num_levels=2# Optional arg -algo_ver=3 -dest_folder='./Results/synthetic/' - -# tuning parameters -theta = 0 -eta = 1 -tuning_params= {'theta':theta,'eta':eta} - -import time -start_time = time.time() - -exp_obj = experiment.Experiment(df,num_roles,num_segments,algo_ver,dest_folder,tuning_params,num_levels,refValue) -[itr_d,likelihood_d,group_dic_d,lambda_estimates_d,change_points_arr_d] = exp_obj.execute() - -print("--- %s seconds ---" % (time.time() - start_time)) - - -t_df = sorted(t_df) - -chg_points = change_points_arr_d[0,0,:] -ranges_arr = [ [chg_points[s]+1,chg_points[s+1]] for s in range(0,len(chg_points)-1)] -ranges_arr[0][0] = 0 -list_time_stamps = list(t_df) - - -# iterate over timestamps list -dis_arr = list() -gt_arr = list() - - -for item in list_time_stamps: - - # find the segment which the timestamp belongs - # (is dependent on which groups the two nodes belong) - d = sbm_core._findSegment(ranges_arr, len(ranges_arr) , int(item)) - dis_arr.append(d) - - -chg_points = change_points_arr[0,0,:] -ranges_arr = [ [chg_points[s]+1,chg_points[s+1]] for s in range(0,len(chg_points)-1)] -ranges_arr[0][0] = 0 -list_time_stamps = list(t_df) - - -# iterate over timestamps list - -for item in list_time_stamps: - - # find the segment which the timestamp belongs - # (is dependent on which groups the two nodes belong) - d = sbm_core._findSegment(ranges_arr, len(ranges_arr) , int(item)) - gt_arr.append(d) - - -ind = adjusted_rand_score(gt_arr,dis_arr) -print('rand index: seg {} : {}'.format(_itr, ind)) - -g1= group_dic_d -g2= group_dic_d[1] - -ds= list(group_dic_d.values() ) -gt1 = list(g1.values()) - -ind_grp=adjusted_rand_score(ds,gt1) -print('rand index: group {} : {}'.format(_itr, ind_grp)) - -# 0.9785444674036701 -# 0.9791525131372905 -# 0.981440657362889 -# 0.9780947193990287 -# 0.9785576050121263 -# 0.9768656988977588 -# 0.9794087578274921 -# 0.9785467310928326 -# 0.8326828222297133 - - -# 3 -# 3 -# 3 -# 3 -# 3 -# 3 -# 5 -# 5 -# 5 - - - - diff --git a/sbm_core.py b/sbm_core.py old mode 100755 new mode 100644 index 2ec71990ff3b4eb8540d75469639e4e8e34e5a41..2da8eaa3f346622c915f4694e65aa6317122f29a --- a/sbm_core.py +++ b/sbm_core.py @@ -1,3 +1,5 @@ + + # ################################################################################### # ### Utility Functions for Maximum Likelihood Estimation (MLE) and Segmentation ### # ### ( based on stochastic blockmodels ) ### @@ -21,26 +23,27 @@ def _findSegment(a, n, K): # Binary search while (end <= end): - - # mid point - mid = (start + end) >> 1 - - # element found - if (K >= a[mid][0] and K <= a[mid][1]): - return mid - - # first half - elif (K < a[mid][0]): - end = mid - 1 - - # second half - else: - start = mid + 1 - - # Not found - print('Not Found') - return -1 - + + if end >= start: + # mid point + mid = (start + end) //2 + # element found + if (K >= a[mid][0] and K <= a[mid][1]): + return mid + + # first half + elif (K > a[mid][1]): + start = mid + 1 + + # second half + elif (K < a[mid][0]): + end = mid - 1 + else: + # print('K : {} MID: {} START:{} END: {} ARRA : {}, n: {}'.format(K,mid, start,end, a,n)) + # Not found + print('Not Found') + 
return -1 + # split an empty group if it does exist def _split_the_empty_group(num_roles,list_of_groups,group_assignment):
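
For reference, below is a minimal standalone sketch (not part of the patch) of the segment lookup that the corrected `_findSegment` in `sbm_core.py` performs, together with the change-point-to-range construction used repeatedly in the driver scripts above. The function name `find_segment`, the `ranges` variable, and the example change points are illustrative assumptions, not the repository's API.

# Illustrative sketch only: mirrors the binary search performed by the patched
# sbm_core._findSegment and the ranges_arr construction in the driver scripts.
# Names here are assumptions for illustration.

def find_segment(ranges, k):
    """Return the index of the inclusive [start, end] range containing k, or -1."""
    start, end = 0, len(ranges) - 1
    while end >= start:
        mid = (start + end) // 2
        if ranges[mid][0] <= k <= ranges[mid][1]:
            return mid          # k falls inside this segment
        elif k > ranges[mid][1]:
            start = mid + 1     # continue in the right half
        else:
            end = mid - 1       # continue in the left half
    return -1                   # k lies outside every segment


if __name__ == "__main__":
    # Example change points as produced by the generators above,
    # e.g. 4 segments of 1850 samples each.
    chg_points = [0, 1850, 3700, 5550, 7400]
    ranges = [[chg_points[s] + 1, chg_points[s + 1]] for s in range(len(chg_points) - 1)]
    ranges[0][0] = 0            # first segment starts at the zeroth reference point

    assert find_segment(ranges, 0) == 0
    assert find_segment(ranges, 1850) == 0
    assert find_segment(ranges, 1851) == 1
    assert find_segment(ranges, 7400) == 3
    assert find_segment(ranges, 9999) == -1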