-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathscript_generation_affinity_and_incidence_matrix.py
executable file
·345 lines (247 loc) · 14.7 KB
/
script_generation_affinity_and_incidence_matrix.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
import networkx as nx
import numpy as np
import scipy.io as sio
import os
from multiprocessing import Pool
import argparse
import pickle5 as pickle
def gaussian_kernel(attribute_1, attribute_2, gamma=1):
''' Return the value of a gaussian kernel between two attributes '''
return np.exp(-gamma * np.power(np.linalg.norm(attribute_1 - attribute_2), 2))
def kernel(attribute_1, attribute_2, kernel_arg):
""" Calculate the kernel of the two attributes given
parameters in kernel_arg
"""
kernel_type, kernel_dict = kernel_arg
if kernel_type == "gaussian":
if kernel_dict["attribute_type"] == "coord":
return gaussian_kernel(attribute_1, attribute_2, gamma=kernel_dict["gaussian_gamma_coord"])
elif kernel_dict["attribute_type"] == "geodesic":
return gaussian_kernel(attribute_1, attribute_2, gamma=kernel_dict["gaussian_gamma_geodesic"])
else:
print("problème")
return 0
def compute_median_distances_all_pair(list_vectors, distance_type="euclidean", radius=100):
""" given a list of vectors, compute the median of the
distances of all pair of values in the list """
list_distances = []
for i, elem_i in enumerate(list_vectors):
for j in range(i+1, len(list_vectors)):
elem_j = list_vectors[j]
if distance_type == "euclidean":
distance = np.linalg.norm(elem_i - elem_j)
elif distance_type == "geodesic":
distance = radius * np.arccos(np.clip(np.dot(elem_i,elem_j) / np.power(radius,2),-1, 1))
else:
print("distance type not allowed !", distance_type)
list_distances.append(distance)
return np.median(list_distances)
def compute_heuristic_gamma(graph_1, graph_2):
""" Calculate a gamma value for the gaussian kernel
based on a heuristic (take the median of all
distances between pair of coordinates)
"""
# calcule the heuristic for graph_1
# we get all the coordinates for graph_1
full_coordinates = []
for node in graph_1.nodes:
full_coordinates.append(graph_1.nodes[node]["coord"])
graph_1_gamma_coord = compute_median_distances_all_pair(full_coordinates, "geodesic")
# we get all the coordinates for graph_2
full_coordinates = []
for node in graph_2.nodes:
full_coordinates.append(graph_2.nodes[node]["coord"])
graph_2_gamma_coord = compute_median_distances_all_pair(full_coordinates, "geodesic")
gamma_coord = 1/np.mean([graph_1_gamma_coord, graph_2_gamma_coord])
# we get all the geodesic distances for graph_1
full_coordinates = []
for edge in graph_1.edges:
full_coordinates.append(graph_1.edges[edge]["geodesic_distance"])
graph_1_gamma_geo = compute_median_distances_all_pair(full_coordinates, "euclidean")
# we get all the geodesic for graph_2
full_coordinates = []
for edge in graph_2.edges:
full_coordinates.append(graph_2.edges[edge]["geodesic_distance"])
graph_2_gamma_geo = compute_median_distances_all_pair(full_coordinates, "euclidean")
gamma_geodesic = 1/np.mean([graph_1_gamma_geo, graph_2_gamma_geo])
return gamma_coord, gamma_geodesic
def full_affinity(graph_1, graph_2, kernel_args):
""" Calculation of the affinity value of two graphs of same size with the kernel function provided
"""
print("full affinity matrix :",graph_1.number_of_nodes(), graph_2.number_of_nodes(), graph_1.number_of_edges(), graph_2.number_of_edges())
# Initialise affinity matrix with zeros
affinity_matrix = np.zeros((np.power(graph_1.number_of_nodes(), 2), np.power(graph_2.number_of_nodes(),2)))
# we fill the affinity matrix with the kernel values.
# we loop over all the possible permutations
for node_a in graph_1.nodes:
for node_i in graph_2.nodes:
for node_b in graph_1.nodes:
for node_j in graph_2.nodes:
# We check if we need to take the attributes of nodes or edge
if node_a == node_b and node_i == node_j:
# We take the node attributes.
attribute_1 = graph_1.nodes[node_a]["coord"]
attribute_2 = graph_2.nodes[node_i]["coord"]
# calculate the kernel value of these attributes
kernel_args[1]["attribute_type"] = "coord"
value_kernel = kernel(attribute_1, attribute_2, kernel_args)
# add this in the right place in the affinity_matrix
affinity_matrix[ node_a * graph_2.number_of_nodes() + node_i, node_b * graph_2.number_of_nodes() + node_j] \
= value_kernel
else:
# we check that the edges exist on both side and if so add the value to the affinity matrix
if (node_a, node_b) in graph_1.edges and (node_i, node_j) in graph_2.edges:
attribute_1 = graph_1.edges[(node_a, node_b)]["geodesic_distance"]
attribute_2 = graph_2.edges[(node_i, node_j)]["geodesic_distance"]
# get the kernel value
kernel_args[1]["attribute_type"] = "geodesic"
value_kernel = kernel(attribute_1, attribute_2, kernel_args)
affinity_matrix[ node_a * graph_2.number_of_nodes() + node_i, node_b * graph_2.number_of_nodes() + node_j] \
= value_kernel
return affinity_matrix
def edge_to_edge_affinity(graph_1, graph_2, kernel_args):
"""calculate the edge_to_edge affinity matrix for kerGM"""
# Initialise affinity matrix to zeros
affinity_matrix = np.zeros((graph_1.number_of_edges(), graph_2.number_of_edges()))
# we add the necessary attribute information
kernel_args[1]["attribute_type"] = "geodesic"
# We go through all combination of edges
for edge_1 in graph_1.edges:
for edge_2 in graph_2.edges:
kernel_value = kernel(graph_1.edges[edge_1]["geodesic_distance"], graph_2.edges[edge_2]["geodesic_distance"], kernel_args)
affinity_matrix[graph_1.edges[edge_1]["id"], graph_2.edges[edge_2]["id"]] = kernel_value
return affinity_matrix
def node_to_node_affinity(graph_1, graph_2, kernel_args):
"""calculate the edge_to_edge affinity matrix for kerGM"""
# Initialise affinity matrix to zeros
affinity_matrix = np.zeros((graph_1.number_of_nodes(), graph_2.number_of_nodes()))
# we add the necessary attribute arguments for the kernel
kernel_args[1]["attribute_type"] = "coord"
# We go through all combination of nodes
for node_1 in graph_1.nodes:
for node_2 in graph_2.nodes:
kernel_value = kernel(graph_1.nodes[node_1]["coord"], graph_2.nodes[node_2]["coord"], kernel_args)
affinity_matrix[node_1, node_2] = kernel_value
return affinity_matrix
def get_head_and_tail_incidence_matrix(graph):
""" From a given graph return a head and a tail incidence matrix where the direction
of the edges have been chosen randomly
"""
# initialize H (tail-incidence) and G(head-incidence) matrix. Notation are from KerGM paper
H = np.zeros((graph.number_of_nodes(), graph.number_of_edges()))
G = np.zeros((graph.number_of_nodes(), graph.number_of_edges()))
for edge in graph.edges:
# randomly select edge direction
if np.random.rand(1) > 0.5:
head = edge[0]
tail = edge[1]
else:
head = edge[1]
tail = edge[0]
# add the information to the matrices
edge_id = graph.edges[edge]["id"]
H[head, edge_id] = 1
G[tail, edge_id] = 1
return H, G
def generate_affinity_and_incidence_for_pair(graph_1, graph_2, kernel_args, cpt_full_matrix=False):
""" Generate the different affinity matrix as well as the incidences matrix
necessary for the testing of different algorithms (like KerGM)
"""
# Generate affinity matrices
if cpt_full_matrix:
full_affinity_matrix = full_affinity(graph_1, graph_2, kernel_args)
kE11 = edge_to_edge_affinity(graph_1, graph_1, kernel_args)
kE22 = edge_to_edge_affinity(graph_2, graph_2, kernel_args)
kE12 = edge_to_edge_affinity(graph_1, graph_2, kernel_args)
kN12 = node_to_node_affinity(graph_1, graph_2, kernel_args)
# generate head and tail incidence matrix
H1, G1 = get_head_and_tail_incidence_matrix(graph_1)
H2, G2 = get_head_and_tail_incidence_matrix(graph_2)
if cpt_full_matrix:
return full_affinity_matrix, kE11, kE22, kE12, kN12, H1, G1, H2, G2
else:
return kE11, kE22, kE12, kN12, H1, G1, H2, G2
def load_generate_and_save_affinity_and_incidence_for_pair(path_to_folder, graph_nb_1, graph_nb_2, kernel_args, cpt_full_matrix=False):
""" Generate the affinity and incidences matrix and save them
in a given repositery
"""
print(path_to_folder, graph_nb_1, graph_nb_2)
# get the two graphs
graph_1 = pickle.load(open(os.path.join(path_to_folder,"graphs","graph_"+str(graph_nb_1)+".gpickle"),'rb'))
graph_2 = pickle.load(open(os.path.join(path_to_folder,"graphs","graph_"+str(graph_nb_2)+".gpickle"),'rb'))
#graph_1 = nx.read_gpickle(os.path.join(path_to_folder, "graphs", "graph_"+str(graph_nb_1)+".gpickle"))
#graph_2 = nx.read_gpickle(os.path.join(path_to_folder, "graphs", "graph_"+str(graph_nb_2)+".gpickle"))
# if the kernel is gaussian get the gamma value for the coordinate
# and the geodesic distance
if kernel_args[0] == "gaussian" and kernel_args[1]["gaussian_gamma"] == 0:
gamma_coord, gamma_geodesic = compute_heuristic_gamma(graph_1, graph_2)
print("gamma coord:",gamma_coord,"gamma geo:", gamma_geodesic)
kernel_args[1]["gaussian_gamma_coord"] = gamma_coord
kernel_args[1]["gaussian_gamma_geodesic"] = gamma_geodesic
# generate all the necessary informations
if cpt_full_matrix:
full_affinity_matrix, kE11, kE22, kE12, kN12, H1, G1, H2, G2 \
= generate_affinity_and_incidence_for_pair(graph_1, graph_2, kernel_args, cpt_full_matrix=cpt_full_matrix)
# Create dictionnaries that hold the information
dict_affinity_to_save = {"full_affinity":full_affinity_matrix, "kE11":kE11, "kE22":kE22, "kE12": kE12, "kN12":kN12}
dict_incidence_matrix_to_save = {"H1":H1,"H2":H2,"G1":G1,"G2":G2}
else:
kE11, kE22, kE12, kN12, H1, G1, H2, G2 \
= generate_affinity_and_incidence_for_pair(graph_1, graph_2, kernel_args, cpt_full_matrix=cpt_full_matrix)
# Create dictionnaries that hold the information
dict_affinity_to_save = {"kE11":kE11, "kE22":kE22, "kE12": kE12, "kN12":kN12}
dict_incidence_matrix_to_save = {"H1":H1,"H2":H2,"G1":G1,"G2":G2}
# Save everything in an appropriate matlab format
sio.savemat(os.path.join(path_to_folder, "affinity", "affinity_"+str(graph_nb_1)+"_"+str(graph_nb_2)+".mat"), dict_affinity_to_save, do_compression=True)
sio.savemat(os.path.join(path_to_folder, "affinity", "incidence_"+str(graph_nb_1)+"_"+str(graph_nb_2)+".mat"), dict_incidence_matrix_to_save, do_compression=True)
def generate_and_save_all_affinity_and_incidence_in_path(path_to_folder, kernel_args, cpt_full_matrix=False, nb_workers=4):
""" Go through all folders and subfolders to load graphs and
generate the correponding affinity and incidence matrices.
This process is done using subprocesses to increase the
computation time
"""
# get all the informations to send to the processes
list_arguments = []
for file_name in os.listdir(path_to_folder):
long_file_name = os.path.join(path_to_folder, file_name)
if os.path.isdir(long_file_name):
for sub_folder in os.listdir(long_file_name):
full_folder_name = os.path.join(long_file_name,sub_folder)
# Create the directory that will hold the affinity results
if not os.path.isdir(os.path.join(full_folder_name,"affinity")):
os.mkdir(os.path.join(full_folder_name,"affinity"))
# get the number of graph
files = os.listdir(os.path.join(full_folder_name, "graphs"))
nb_tot_graphs = 0
for fil in files:
if 'graph' in fil:
nb_tot_graphs += 1
print('nb_tot_graphs=', nb_tot_graphs)
# For each pair of graphs
for i_graph in range(nb_tot_graphs):
for j_graph in range(i_graph+1, nb_tot_graphs):
list_arguments.append((full_folder_name, i_graph, j_graph, kernel_args, cpt_full_matrix))
# launch the processes
with Pool(processes=nb_workers) as pool:
pool.starmap(load_generate_and_save_affinity_and_incidence_for_pair, list_arguments)
####################
if __name__ == "__main__":
# We parse the argument from command line
parser = argparse.ArgumentParser(description="Generate the affinity and incidence matrices given a folder structure generated through the script to generate n graphs")
parser.add_argument("path_to_folder", help="path where the folders contains the graphs")
parser.add_argument("--nb_workers", help="number of processes to launch", default=1, type=int)
parser.add_argument("--cpt_full_matrix", help="Decide if we the file should include full affinity matrices or just the smaller one for KerGM (0=False, 1=True)", type=int, default=0)
parser.add_argument("--kernel_type", help="kernel type, only gaussian right now", default="gaussian")
parser.add_argument("--gaussian_gamma", help="gamma value for the gaussian kernel", default=0, type=float)
args = parser.parse_args()
path_to_folder = "../data/Oasis_original_new_with_dummy/modified_graphs" #args.path_to_folder
cpt_full_matrix = bool(args.cpt_full_matrix)
nb_workers = args.nb_workers
gaussian_gamma = args.gaussian_gamma
kernel_type = args.kernel_type
# We define the kernel arguments to be used
kernel_args = (kernel_type, {"gaussian_gamma":gaussian_gamma})
generate_and_save_all_affinity_and_incidence_in_path(path_to_folder,
kernel_args,
cpt_full_matrix=cpt_full_matrix,
nb_workers=nb_workers)