@@ -74,6 +74,7 @@ for ds in dslist:
             extra_params=(ds['extra_params'] if 'extra_params' in ds else None),
             ds_name=ds['name'],
             n_jobs=multiprocessing.cpu_count(),
+#            n_jobs=7,
             read_gm_from_file=False,
             verbose=True)
     print()
@@ -18,31 +18,44 @@ from scipy import optimize
 import cvxpy as cp
 
 import sys
-sys.path.insert(0, "../")
+#sys.path.insert(0, "../")
 from ged import GED, get_nb_edit_operations
 from utils import kernel_distance_matrix
 
-def fit_GED_to_kernel_distance(Gn, gkernel, itr_max):
+def fit_GED_to_kernel_distance(Gn, node_label, edge_label, gkernel, itr_max,
+                               fitkernel=None, gamma=1.0):
     # c_vi, c_vr, c_vs, c_ei, c_er, c_es or parts of them.
-    random.seed(1)
-    cost_rdm = random.sample(range(1, 10), 5)
-    edit_costs = cost_rdm + [0]
+#    random.seed(1)
     cost_rdm = random.sample(range(1, 10), 6)
+#    edit_costs = cost_rdm + [0]
+    edit_costs = cost_rdm
+#    edit_costs = [i * 0.01 for i in cost_rdm] + [0]
 #    edit_costs = [0.2, 0.2, 0.2, 0.2, 0.2, 0]
 #    edit_costs = [0, 0, 0.9544, 0.026, 0.0196, 0]
 #    edit_costs = [0.008429912251810438, 0.025461055985319694, 0.2047320869225948, 0.004148727085832133, 0.0, 0]
-    idx_nonzeros = [i for i, item in enumerate(edit_costs) if item != 0]
+    idx_cost_nonzeros = [i for i, item in enumerate(edit_costs) if item != 0]
 
     # compute distances in feature space.
-    dis_k_mat, _, _, _ = kernel_distance_matrix(Gn, gkernel=gkernel)
+    coef_dk = 1
+    dis_k_mat, _, _, _ = kernel_distance_matrix(Gn, node_label, edge_label, gkernel=gkernel)
     dis_k_vec = []
     for i in range(len(dis_k_mat)):
         for j in range(i, len(dis_k_mat)):
             dis_k_vec.append(dis_k_mat[i, j])
     dis_k_vec = np.array(dis_k_vec)
+    if fitkernel == None:
+        dis_k_vec_ajusted = dis_k_vec
+    elif fitkernel == 'gaussian':
+        coef_dk = 1 / np.max(dis_k_vec)
+        idx_dk_nonzeros = np.where(dis_k_vec != 0)[0]
+        # remove 0's and constrain d_k between 0 and 1.
+        dis_k_vec = dis_k_vec[idx_dk_nonzeros] * coef_dk
+        dis_k_vec_ajusted = np.sqrt(-np.log(dis_k_vec) / gamma)
 
     residual_list = []
     edit_cost_list = []
     time_list = []
+    nb_cost_mat_list = []
 
     for itr in range(itr_max):
         print('\niteration', itr)
@@ -52,15 +65,23 @@ def fit_GED_to_kernel_distance(Gn, gkernel, itr_max):
         edit_cost_list.append(edit_cost_constant)
         ged_all, ged_mat, n_edit_operations = compute_geds(Gn, edit_cost_constant,
-                                                           idx_nonzeros, parallel=True)
+                                                           idx_cost_nonzeros, parallel=True)
 
-        residual = np.sqrt(np.sum(np.square(np.array(ged_all) - dis_k_vec)))
+        if fitkernel == None:
+            residual = np.sqrt(np.sum(np.square(np.array(ged_all) - dis_k_vec)))
+        elif fitkernel == 'gaussian':
+            ged_all = np.array(ged_all)[idx_dk_nonzeros]
+            residual = np.sqrt(np.sum(np.square(
+                np.exp(-gamma * ged_all ** 2) / coef_dk - dis_k_vec)))
         residual_list.append(residual)
 
         # "fit" geds to distances in feature space by tuning edit costs using the
         # Least Squares Method.
         nb_cost_mat = np.array(n_edit_operations).T
-        edit_costs_new, residual = compute_better_costs(nb_cost_mat, dis_k_vec)
+        if fitkernel == 'gaussian':
+            nb_cost_mat = nb_cost_mat[idx_dk_nonzeros]
+        nb_cost_mat_list.append(nb_cost_mat)
+        edit_costs_new, residual = compute_better_costs(nb_cost_mat, dis_k_vec_ajusted)
 
         print('pseudo residual:', residual)
         for i in range(len(edit_costs_new)):
@@ -70,7 +91,7 @@ def fit_GED_to_kernel_distance(Gn, gkernel, itr_max):
             else:
                 raise ValueError('The edit cost is negative.')
 
-        for idx, item in enumerate(idx_nonzeros):
+        for idx, item in enumerate(idx_cost_nonzeros):
             edit_costs[item] = edit_costs_new[idx]
 
         time_list.append(time.time() - time0)
@@ -78,14 +99,21 @@ def fit_GED_to_kernel_distance(Gn, gkernel, itr_max):
         print('edit_costs:', edit_costs)
         print('residual_list:', residual_list)
+        print()
 
     edit_cost_list.append(edit_costs)
     ged_all, ged_mat, n_edit_operations = compute_geds(Gn, edit_costs,
-                                                       idx_nonzeros, parallel=True)
-    residual = np.sqrt(np.sum(np.square(np.array(ged_all) - dis_k_vec)))
+                                                       idx_cost_nonzeros, parallel=True)
+    if fitkernel == None:
+        residual = np.sqrt(np.sum(np.square(np.array(ged_all) - dis_k_vec)))
+    elif fitkernel == 'gaussian':
+        ged_all = np.array(ged_all)[idx_dk_nonzeros]
+        residual = np.sqrt(np.sum(np.square(
+            np.exp(-gamma * ged_all ** 2) / coef_dk - dis_k_vec)))
    residual_list.append(residual)
+    nb_cost_mat_list.append(np.array(n_edit_operations).T)
 
-    return edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list
+    return edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, \
+        time_list, nb_cost_mat_list, coef_dk
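Note on the 'gaussian' branch above: the adjusted targets assume kernel distance and GED are linked by coef_dk * d_k = exp(-gamma * ged^2), so inverting that relation gives the regression target sqrt(-log(coef_dk * d_k) / gamma). A minimal numpy sketch of the transform and its round trip (names mirror the diff; gamma stays a free parameter, and this is an illustration, not the module's code):

    import numpy as np

    def adjust_targets(dis_k_vec, gamma=1.0):
        # scale distances into (0, 1] and drop zeros, as the diff does.
        coef_dk = 1 / np.max(dis_k_vec)
        idx_nonzeros = np.where(dis_k_vec != 0)[0]
        d = dis_k_vec[idx_nonzeros] * coef_dk
        # invert d = exp(-gamma * ged**2): the GED values the solver should aim for.
        return np.sqrt(-np.log(d) / gamma), coef_dk, idx_nonzeros

    d_k = np.array([0.0, 0.3, 0.8, 1.2])
    targets, coef, idx = adjust_targets(d_k)
    # round trip: exp(-gamma * target**2) / coef recovers the original distances.
    assert np.allclose(np.exp(-1.0 * targets ** 2) / coef, d_k[idx])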
 def compute_geds(Gn, edit_cost_constant, idx_nonzeros, parallel=False):
@@ -193,7 +221,10 @@ def compute_better_costs(nb_cost_mat, dis_k_vec):
 #    h = np.array([0 for i in range(nb_cost_mat.shape[1])])
     x = cp.Variable(nb_cost_mat.shape[1])
     cost = cp.sum_squares(nb_cost_mat * x - dis_k_vec)
-    constraints = [x >= [0 for i in range(nb_cost_mat.shape[1])]]
+    constraints = [x >= [0.01 for i in range(nb_cost_mat.shape[1])],
+#                   np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0]
+                   np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T@x >= 0.0,
+                   np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0]
     prob = cp.Problem(cp.Minimize(cost), constraints)
     prob.solve()
     edit_costs_new = x.value
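The new constraints keep every edit cost at least 0.01 and add the two triangle inequalities c_vs <= c_vi + c_vr and c_es <= c_ei + c_er as the linear rows (1, 1, -1, 0, 0, 0)·x >= 0 and (0, 0, 0, 1, 1, -1)·x >= 0. A self-contained sketch of the same quadratic program on synthetic data (recent cvxpy spells the matrix product A @ x; the diff's nb_cost_mat * x is the older overloaded form):

    import numpy as np
    import cvxpy as cp

    rng = np.random.default_rng(0)
    nb_cost_mat = rng.integers(0, 5, size=(40, 6)).astype(float)  # edit-operation counts
    dis_k_vec = rng.random(40) * 10                               # target distances

    x = cp.Variable(6)  # [c_vi, c_vr, c_vs, c_ei, c_er, c_es]
    cost = cp.sum_squares(nb_cost_mat @ x - dis_k_vec)
    constraints = [x >= 0.01,
                   np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]) @ x >= 0.0,  # c_vs <= c_vi + c_vr
                   np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]) @ x >= 0.0]  # c_es <= c_ei + c_er
    prob = cp.Problem(cp.Minimize(cost), constraints)
    prob.solve()
    print(prob.status, x.value)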
@@ -9,11 +9,14 @@ import numpy as np
 import networkx as nx
 from tqdm import tqdm
 import sys
+import multiprocessing
+from multiprocessing import Pool
+from functools import partial
 
 from gedlibpy import librariesImport, gedlibpy
 
 def GED(g1, g2, lib='gedlibpy', cost='CHEM_1', method='IPFP',
-        edit_cost_constant=[], saveGXL='benoit', stabilizer='min', repeat=50):
+        edit_cost_constant=[], stabilizer='min', repeat=50):
     """
     Compute GED for 2 graphs.
     """
@@ -25,9 +28,11 @@ def GED(g1, g2, lib='gedlibpy', cost='CHEM_1', method='IPFP',
         G_new = nx.Graph()
         for nd, attrs in G.nodes(data=True):
             G_new.add_node(str(nd), chem=attrs['atom'])
+#            G_new.add_node(str(nd), x=str(attrs['attributes'][0]),
+#                           y=str(attrs['attributes'][1]))
         for nd1, nd2, attrs in G.edges(data=True):
-#            G_new.add_edge(str(nd1), str(nd2), valence=attrs['bond_type'])
-            G_new.add_edge(str(nd1), str(nd2))
+            G_new.add_edge(str(nd1), str(nd2), valence=attrs['bond_type'])
+#            G_new.add_edge(str(nd1), str(nd2))
         return G_new
@@ -49,6 +54,32 @@ def GED(g1, g2, lib='gedlibpy', cost='CHEM_1', method='IPFP',
         pi_backward = gedlibpy.get_backward_map(g, h)
         upper = gedlibpy.get_upper_bound(g, h)
         lower = gedlibpy.get_lower_bound(g, h)
+    elif stabilizer == 'mean':
+        # @todo: to be finished...
+        upper_list = [np.inf] * repeat
+        for itr in range(repeat):
+            gedlibpy.run_method(g, h)
+            upper_list[itr] = gedlibpy.get_upper_bound(g, h)
+            pi_forward = gedlibpy.get_forward_map(g, h)
+            pi_backward = gedlibpy.get_backward_map(g, h)
+            lower = gedlibpy.get_lower_bound(g, h)
+        upper = np.mean(upper_list)
+    elif stabilizer == 'median':
+        if repeat % 2 == 0:
+            repeat += 1
+        upper_list = [np.inf] * repeat
+        pi_forward_list = [0] * repeat
+        pi_backward_list = [0] * repeat
+        for itr in range(repeat):
+            gedlibpy.run_method(g, h)
+            upper_list[itr] = gedlibpy.get_upper_bound(g, h)
+            pi_forward_list[itr] = gedlibpy.get_forward_map(g, h)
+            pi_backward_list[itr] = gedlibpy.get_backward_map(g, h)
+            lower = gedlibpy.get_lower_bound(g, h)
+        upper = np.median(upper_list)
+        idx_median = upper_list.index(upper)
+        pi_forward = pi_forward_list[idx_median]
+        pi_backward = pi_backward_list[idx_median]
     elif stabilizer == 'min':
         upper = np.inf
         for itr in range(repeat):
@@ -61,6 +92,18 @@ def GED(g1, g2, lib='gedlibpy', cost='CHEM_1', method='IPFP',
             lower = gedlibpy.get_lower_bound(g, h)
             if upper == 0:
                 break
+    elif stabilizer == 'max':
+        upper = 0
+        for itr in range(repeat):
+            gedlibpy.run_method(g, h)
+            upper_tmp = gedlibpy.get_upper_bound(g, h)
+            if upper_tmp > upper:
+                upper = upper_tmp
+                pi_forward = gedlibpy.get_forward_map(g, h)
+                pi_backward = gedlibpy.get_backward_map(g, h)
+                lower = gedlibpy.get_lower_bound(g, h)
+    elif stabilizer == 'gaussian':
+        pass
 
     dis = upper
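The stabilizers re-run the randomized GED method and aggregate its upper bounds: 'min' keeps the best of repeat runs, 'mean' averages them, 'median' forces an odd repeat count so the median is an actual run whose node maps can be returned, 'max' keeps the worst, and 'gaussian' is left as a stub in the diff. A library-free sketch of that selection logic; run_once here is a placeholder standing in for one gedlibpy.run_method / get_upper_bound / get_forward_map cycle, not a gedlibpy call:

    import random
    import numpy as np

    def stabilized_upper(run_once, repeat=50, stabilizer='min'):
        # run_once() -> (upper_bound, node_map)
        if stabilizer == 'median' and repeat % 2 == 0:
            repeat += 1  # odd count: the median equals one observed run.
        results = [run_once() for _ in range(repeat)]
        uppers = [u for u, _ in results]
        if stabilizer == 'min':
            return results[int(np.argmin(uppers))]
        if stabilizer == 'max':
            return results[int(np.argmax(uppers))]
        if stabilizer == 'mean':
            # no single run matches the mean; keep the last map, as the diff does.
            return np.mean(uppers), results[-1][1]
        if stabilizer == 'median':
            med = np.median(uppers)
            return med, results[uppers.index(med)][1]
        raise ValueError(stabilizer)

    print(stabilized_upper(lambda: (random.randint(5, 15), {}), repeat=9,
                           stabilizer='median'))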
@@ -138,23 +181,69 @@ def GED_n(Gn, lib='gedlibpy', cost='CHEM_1', method='IPFP',
     return dis, pi_forward, pi_backward
 
-def ged_median(Gn, Gn_median, measure='ged', verbose=False,
-               ged_cost='CHEM_1', ged_method='IPFP', saveGXL='benoit'):
-    dis_list = []
-    pi_forward_list = []
-    for idx, G in tqdm(enumerate(Gn), desc='computing median distances',
-                       file=sys.stdout) if verbose else enumerate(Gn):
-        dis_sum = 0
-        pi_forward_list.append([])
-        for G_p in Gn_median:
-            dis_tmp, pi_tmp_forward, pi_tmp_backward = GED(G, G_p,
-                cost=ged_cost, method=ged_method, saveGXL=saveGXL)
-            pi_forward_list[idx].append(pi_tmp_forward)
-            dis_sum += dis_tmp
-        dis_list.append(dis_sum)
+def ged_median(Gn, Gn_median, verbose=False, params_ged={'lib': 'gedlibpy',
+               'cost': 'CHEM_1', 'method': 'IPFP', 'edit_cost_constant': [],
+               'stabilizer': 'min', 'repeat': 50}, parallel=False):
+    if parallel:
+        len_itr = int(len(Gn))
+        pi_forward_list = [[] for i in range(len_itr)]
+        dis_list = [0 for i in range(len_itr)]
+        itr = range(0, len_itr)
+        n_jobs = multiprocessing.cpu_count()
+        if len_itr < 100 * n_jobs:
+            chunksize = int(len_itr / n_jobs) + 1
+        else:
+            chunksize = 100
+        def init_worker(gn_toshare, gn_median_toshare):
+            global G_gn, G_gn_median
+            G_gn = gn_toshare
+            G_gn_median = gn_median_toshare
+        do_partial = partial(_compute_ged_median, params_ged)
+        pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(Gn, Gn_median))
+        if verbose:
+            iterator = tqdm(pool.imap_unordered(do_partial, itr, chunksize),
+                            desc='computing GEDs', file=sys.stdout)
+        else:
+            iterator = pool.imap_unordered(do_partial, itr, chunksize)
+        for i, dis_sum, pi_forward in iterator:
+            pi_forward_list[i] = pi_forward
+            dis_list[i] = dis_sum
+#            print('\n-------------------------------------------')
+#            print(i, j, idx_itr, dis)
+        pool.close()
+        pool.join()
+    else:
+        dis_list = []
+        pi_forward_list = []
+        for idx, G in tqdm(enumerate(Gn), desc='computing median distances',
+                           file=sys.stdout) if verbose else enumerate(Gn):
+            dis_sum = 0
+            pi_forward_list.append([])
+            for G_p in Gn_median:
+                dis_tmp, pi_tmp_forward, pi_tmp_backward = GED(G, G_p,
+                    **params_ged)
+                pi_forward_list[idx].append(pi_tmp_forward)
+                dis_sum += dis_tmp
+            dis_list.append(dis_sum)
     return dis_list, pi_forward_list
 
+def _compute_ged_median(params_ged, itr):
+#    print(itr)
+    dis_sum = 0
+    pi_forward = []
+    for G_p in G_gn_median:
+        dis_tmp, pi_tmp_forward, pi_tmp_backward = GED(G_gn[itr], G_p,
+            **params_ged)
+        pi_forward.append(pi_tmp_forward)
+        dis_sum += dis_tmp
+    return itr, dis_sum, pi_forward
 
 def get_nb_edit_operations(g1, g2, forward_map, backward_map):
     """Compute the number of each edit operations.
     """
@@ -22,20 +22,22 @@ def iam_upgraded(Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50,
                  epsilon=0.001, node_label='atom', edge_label='bond_type',
                  connected=False, removeNodes=True, allBestInit=False, allBestNodes=False,
                  allBestEdges=False, allBestOutput=False,
-                 params_ged={'ged_cost': 'CHEM_1', 'ged_method': 'IPFP', 'saveGXL': 'benoit'}):
+                 params_ged={'lib': 'gedlibpy', 'cost': 'CHEM_1', 'method': 'IPFP',
+                             'edit_cost_constant': [], 'stabilizer': 'min', 'repeat': 50}):
     """See my name, then you know what I do.
     """
 #    Gn_median = Gn_median[0:10]
 #    Gn_median = [nx.convert_node_labels_to_integers(g) for g in Gn_median]
-    if removeNodes:
-        node_ir = np.inf  # corresponding to the node remove and insertion.
-        label_r = 'thanksdanny'  # the label for node remove. # @todo: make this label unrepeatable.
+    node_ir = np.inf  # corresponding to the node remove and insertion.
+    label_r = 'thanksdanny'  # the label for node remove. # @todo: make this label unrepeatable.
     ds_attrs = get_dataset_attributes(Gn_median + Gn_candidate,
                                       attr_names=['edge_labeled', 'node_attr_dim', 'edge_attr_dim'],
                                       edge_label=edge_label)
+    node_label_set = get_node_labels(Gn_median, node_label)
+    edge_label_set = get_edge_labels(Gn_median, edge_label)
 
-    def generate_graph(G, pi_p_forward, label_set):
+    def generate_graph(G, pi_p_forward):
         G_new_list = [G.copy()]  # all "best" graphs generated in this iteration.
 #        nx.draw_networkx(G)
 #        import matplotlib.pyplot as plt
@@ -52,7 +54,7 @@ def iam_upgraded(Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50,
         for ndi, (nd, _) in enumerate(G.nodes(data=True)):
             h_i0_list = []
             label_list = []
-            for label in label_set:
+            for label in node_label_set:
                 h_i0 = 0
                 for idx, g in enumerate(Gn_median):
                     pi_i = pi_p_forward[idx][ndi]
@@ -62,7 +64,7 @@ def iam_upgraded(Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50,
                 label_list.append(label)
             # case when the node is to be removed.
             if removeNodes:
-                h_i0_remove = 0  # @todo: maybe this can be added to the label_set above.
+                h_i0_remove = 0  # @todo: maybe this can be added to the node_label_set above.
                 for idx, g in enumerate(Gn_median):
                     pi_i = pi_p_forward[idx][ndi]
                     if pi_i == node_ir:
@@ -91,11 +93,10 @@ def iam_upgraded(Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50,
                 G_new_list = [ggg.copy() for ggg in G_new_list_nd]
             else:
                 # choose one of the best randomly.
-                h_ij0_max = h_i0_list[idx_max[0]]
                 idx_rdm = random.randint(0, len(idx_max) - 1)
                 best_label = label_list[idx_max[idx_rdm]]
-                # check whether a_ij is 0 or 1.
+                h_i0_max = h_i0_list[idx_max[idx_rdm]]
 
                 g_new = G_new_list[0]
                 if best_label == label_r:
                     g_new.remove_node(nd)
@@ -134,8 +135,7 @@ def iam_upgraded(Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50,
 #            for nd1, nd2, _ in g_new.edges(data=True):
                 h_ij0_list = []
                 label_list = []
-                # @todo: compute edge label set before.
-                for label in get_edge_labels(Gn_median, edge_label):
+                for label in edge_label_set:
                     h_ij0 = 0
                     for idx, g in enumerate(Gn_median):
                         pi_i = pi_p_forward[idx][nd1i]
@@ -176,9 +176,9 @@ def iam_upgraded(Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50,
                             G_new_list_ed.append(g_tmp_copy)
                     g_tmp_list = [ggg.copy() for ggg in G_new_list_ed]
                 else:  # choose one of the best randomly.
-                    h_ij0_max = h_ij0_list[idx_max[0]]
                     idx_rdm = random.randint(0, len(idx_max) - 1)
                     best_label = label_list[idx_max[idx_rdm]]
+                    h_ij0_max = h_ij0_list[idx_max[idx_rdm]]
 
                     # check whether a_ij is 0 or 1.
                     sij_norm = 0
@@ -192,6 +192,7 @@ def iam_upgraded(Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50,
                         g_new.add_edge(nd1, nd2)
                         g_new.edges[nd1, nd2][edge_label] = best_label
                     else:
+#                    elif h_ij0_max < len(Gn_median) * c_er / c_es + sij_norm * (1 - (c_er + c_ei) / c_es):
                         if g_new.has_edge(nd1, nd2):
                             g_new.remove_edge(nd1, nd2)
                     g_tmp_list = [g_new]
@@ -221,8 +222,8 @@ def iam_upgraded(Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50,
                 if g_tmp.has_node(nd1) and g_tmp.has_node(nd2) \
                         and not g_tmp.has_edge(nd1, nd2):
                     g_tmp.add_edge(nd1, nd2)
-#                else:  # @todo: which to use?
-                elif sij_norm < len(Gn_median) * c_er / (c_er + c_ei):
+                else:  # @todo: which to use?
+#                elif sij_norm < len(Gn_median) * c_er / (c_er + c_ei):
                     if g_tmp.has_edge(nd1, nd2):
                         g_tmp.remove_edge(nd1, nd2)
                 # do not change anything when equal.
@@ -238,7 +239,7 @@ def iam_upgraded(Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50,
 #        # find the best graph generated in this iteration and update pi_p.
         # @todo: should we update all graphs generated or just the best ones?
         dis_list, pi_forward_list = ged_median(G_new_list, Gn_median,
-                                               **params_ged)
+                                               params_ged=params_ged)
 
         # @todo: should we remove the identical and connectivity check?
         # Don't know which is faster.
         if ds_attrs['node_attr_dim'] == 0 and ds_attrs['edge_attr_dim'] == 0:
@@ -283,15 +284,16 @@ def iam_upgraded(Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50,
 #        while itr < ite_max and (np.abs(old_sod - cur_sod) > epsilon or
 #                                 np.abs(old_sod - cur_sod) == 0):
         while itr < ite_max and np.abs(old_sod - cur_sod) > epsilon:
+#        while itr < ite_max:
 #        for itr in range(0, 5):  # the convergence condition?
             print('itr_iam is', itr)
             G_new_list = []
             pi_forward_new_list = []
             dis_new_list = []
             for idx, g in enumerate(G_list):
-                label_set = get_node_labels(Gn_median + [g], node_label)
+#                label_set = get_node_labels(Gn_median + [g], node_label)
                 G_tmp_list, pi_forward_tmp_list, dis_tmp_list = generate_graph(
-                    g, pi_forward_list[idx], label_set)
+                    g, pi_forward_list[idx])
                 G_new_list += G_tmp_list
                 pi_forward_new_list += pi_forward_tmp_list
                 dis_new_list += dis_tmp_list
@@ -325,7 +327,7 @@ def iam_upgraded(Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50,
         print('\nsods:', sod_list, '\n')
 
-        return G_list, pi_forward_list, dis_min
+        return G_list, pi_forward_list, dis_min, sod_list
 
     def remove_duplicates(Gn):
@@ -363,7 +365,8 @@ def iam_upgraded(Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50,
     # compute set-median.
     dis_min = np.inf
     dis_list, pi_forward_all = ged_median(Gn_candidate, Gn_median,
-                                          **params_ged)
+                                          params_ged=params_ged, parallel=True)
+    print('finish computing GEDs.')
 
     # find all smallest distances.
     if allBestInit:  # try all best init graphs.
         idx_min_list = range(len(dis_list))
@@ -371,19 +374,26 @@ def iam_upgraded(Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50,
     else:
         idx_min_list = np.argwhere(dis_list == np.min(dis_list)).flatten().tolist()
         dis_min = [dis_list[idx_min_list[0]]] * len(idx_min_list)
+        idx_min_rdm = random.randint(0, len(idx_min_list) - 1)
+        idx_min_list = [idx_min_list[idx_min_rdm]]
+    sod_set_median = np.min(dis_min)
 
     # phase 2: iteration.
     G_list = []
     dis_list = []
     pi_forward_list = []
+    G_set_median_list = []
+#    sod_list = []
     for idx_tmp, idx_min in enumerate(idx_min_list):
 #        print('idx_min is', idx_min)
         G = Gn_candidate[idx_min].copy()
+        G_set_median_list.append(G.copy())
         # list of edit operations.
         pi_p_forward = pi_forward_all[idx_min]
 #        pi_p_backward = pi_all_backward[idx_min]
-        Gi_list, pi_i_forward_list, dis_i_min = iteration_proc(G, pi_p_forward, dis_min[idx_tmp])
+        Gi_list, pi_i_forward_list, dis_i_min, sod_list = iteration_proc(G,
+            pi_p_forward, dis_min[idx_tmp])
         G_list += Gi_list
         dis_list += [dis_i_min] * len(Gi_list)
         pi_forward_list += pi_i_forward_list
@@ -409,9 +419,9 @@ def iam_upgraded(Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50,
 #        print(g.edges(data=True))
 
     # get the best median graphs
-    G_min_list, pi_forward_min_list, dis_min = best_median_graphs(
+    G_gen_median_list, pi_forward_min_list, sod_gen_median = best_median_graphs(
         G_list, pi_forward_list, dis_list)
-#    for g in G_min_list:
+#    for g in G_gen_median_list:
 #        nx.draw_networkx(g)
 #        plt.show()
 #        print(g.nodes(data=True))
@@ -419,10 +429,10 @@ def iam_upgraded(Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50,
     if not allBestOutput:
         # randomly choose one graph.
-        idx_rdm = random.randint(0, len(G_min_list) - 1)
-        G_min_list = [G_min_list[idx_rdm]]
+        idx_rdm = random.randint(0, len(G_gen_median_list) - 1)
+        G_gen_median_list = [G_gen_median_list[idx_rdm]]
 
-    return G_min_list, dis_min
+    return G_gen_median_list, sod_gen_median, sod_list, G_set_median_list, sod_set_median
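Since iam_upgraded now returns five values instead of two, every call site has to unpack accordingly; a sketch of the updated unpacking (arguments abbreviated):

    # before: G_min_list, dis_min = iam_upgraded(...)
    G_gen_median_list, sod_gen_median, sod_list, G_set_median_list, sod_set_median = \
        iam_upgraded(Gn_median, Gn_candidate, params_ged=params_ged)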
@@ -5,8 +5,8 @@ import numpy as np
 import networkx as nx
 import time
 
-import librariesImport
-import script
+from gedlibpy import librariesImport, gedlibpy
+#import script
 sys.path.insert(0, "/home/bgauzere/dev/optim-graphes/")
 import pygraph
 from pygraph.utils.graphfiles import loadDataset
@@ -27,8 +27,9 @@ def preimage_iam(Gn_init, Gn_median, alpha, idx_gi, Kmatrix, k, r_max,
                  params_iam={'c_ei': 1, 'c_er': 1, 'c_es': 1,
                              'ite_max': 50, 'epsilon': 0.001,
                              'removeNodes': True, 'connected': False},
-                 params_ged={'ged_cost': 'CHEM_1', 'ged_method': 'IPFP',
-                             'saveGXL': 'benoit'}):
+                 params_ged={'lib': 'gedlibpy', 'cost': 'CHEM_1', 'method': 'IPFP',
+                             'edit_cost_constant': [], 'stabilizer': 'min',
+                             'repeat': 50}):
     """This function constructs graph pre-image by the iterative pre-image
     framework in reference [1], algorithm 1, where the step of generating new
     graphs randomly is replaced by the IAM algorithm in reference [2].
@@ -91,12 +92,12 @@ def preimage_iam(Gn_init, Gn_median, alpha, idx_gi, Kmatrix, k, r_max,
            ghat_new_list = []
            for g_tmp in Gk:
                Gn_nearest_init = [g_tmp.copy()]
-                ghat_new_list_tmp, _ = iam_upgraded(Gn_nearest_median,
+                ghat_new_list_tmp, _, _, _, _ = iam_upgraded(Gn_nearest_median,
                     Gn_nearest_init, params_ged=params_ged, **params_iam)
                ghat_new_list += ghat_new_list_tmp
        else:  # only the best graph in D_k is used to initialize IAM.
            Gn_nearest_init = [g.copy() for g in Gk]
-            ghat_new_list, _ = iam_upgraded(Gn_nearest_median, Gn_nearest_init,
+            ghat_new_list, _, _, _, _ = iam_upgraded(Gn_nearest_median, Gn_nearest_init,
                 params_ged=params_ged, **params_iam)
 
 #        for g in g_tmp_list:
@@ -181,8 +182,9 @@ def preimage_iam_random_mix(Gn_init, Gn_median, alpha, idx_gi, Kmatrix, k, r_max
                             params_iam={'c_ei': 1, 'c_er': 1, 'c_es': 1,
                                         'ite_max': 50, 'epsilon': 0.001,
                                         'removeNodes': True, 'connected': False},
-                            params_ged={'ged_cost': 'CHEM_1', 'ged_method': 'IPFP',
-                                        'saveGXL': 'benoit'}):
+                            params_ged={'lib': 'gedlibpy', 'cost': 'CHEM_1',
+                                        'method': 'IPFP', 'edit_cost_constant': [],
+                                        'stabilizer': 'min', 'repeat': 50}):
     """This function constructs graph pre-image by the iterative pre-image
     framework in reference [1], algorithm 1, where new graphs are generated
     randomly and by the IAM algorithm in reference [2].
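The params_ged dictionary changes shape here: the old wrapper keys (ged_cost, ged_method, saveGXL) give way to keys named exactly like the parameters of GED, so the dict can be forwarded as GED(g1, g2, **params_ged). Side by side:

    # old format, interpreted by the previous ged_median wrapper:
    params_ged = {'ged_cost': 'CHEM_1', 'ged_method': 'IPFP', 'saveGXL': 'benoit'}

    # new format, splatted straight into GED(g1, g2, **params_ged):
    params_ged = {'lib': 'gedlibpy', 'cost': 'CHEM_1', 'method': 'IPFP',
                  'edit_cost_constant': [], 'stabilizer': 'min', 'repeat': 50}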
@@ -7,7 +7,10 @@ Created on Thu Oct 24 11:50:56 2019
 """
 from matplotlib import pyplot as plt
 import numpy as np
+from tqdm import tqdm
+import sys
+sys.path.insert(0, "../")
 from pygraph.utils.graphfiles import loadDataset
 from utils import remove_edges
 from fitDistance import fit_GED_to_kernel_distance
@@ -21,21 +24,22 @@ def test_anycosts():
     remove_edges(Gn)
     gkernel = 'marginalizedkernel'
     itr_max = 10
-    edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list = \
-        fit_GED_to_kernel_distance(Gn, gkernel, itr_max)
+    edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \
+        nb_cost_mat_list, coef_dk = fit_GED_to_kernel_distance(Gn, gkernel, itr_max)
     total_time = np.sum(time_list)
     print('\nedit_costs:', edit_costs)
     print('\nresidual_list:', residual_list)
     print('\nedit_cost_list:', edit_cost_list)
     print('\ndistance matrix in kernel space:', dis_k_mat)
     print('\nged matrix:', ged_mat)
-    print('total time:', total_time)
+    print('\ntotal time:', total_time)
+    print('\nnb_cost_mat:', nb_cost_mat_list[-1])
     np.savez('results/fit_distance.any_costs.gm', edit_costs=edit_costs,
              residual_list=residual_list, edit_cost_list=edit_cost_list,
              dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list,
-             total_time=total_time)
+             total_time=total_time, nb_cost_mat_list=nb_cost_mat_list)
 
-    # normalized distance matrices.
+#    # normalized distance matrices.
 #    gmfile = np.load('results/fit_distance.any_costs.gm.npz')
 #    edit_costs = gmfile['edit_costs']
 #    residual_list = gmfile['residual_list']
@@ -43,72 +47,256 @@ def test_anycosts():
 #    dis_k_mat = gmfile['dis_k_mat']
 #    ged_mat = gmfile['ged_mat']
 #    total_time = gmfile['total_time']
+##    nb_cost_mat_list = gmfile['nb_cost_mat_list']
 
     norm_dis_k_mat = normalize_distance_matrix(dis_k_mat)
     plt.imshow(norm_dis_k_mat)
     plt.colorbar()
     plt.savefig('results/norm_dis_k_mat.any_costs' + '.eps', format='eps', dpi=300)
+#    plt.savefig('results/norm_dis_k_mat.any_costs' + '.jpg', format='jpg')
+#    plt.savefig('results/norm_dis_k_mat.any_costs' + '.png', format='png')
 #    plt.show()
     plt.clf()
 
     norm_ged_mat = normalize_distance_matrix(ged_mat)
     plt.imshow(norm_ged_mat)
     plt.colorbar()
     plt.savefig('results/norm_ged_mat.any_costs' + '.eps', format='eps', dpi=300)
+#    plt.savefig('results/norm_ged_mat.any_costs' + '.jpg', format='jpg')
+#    plt.savefig('results/norm_ged_mat.any_costs' + '.png', format='png')
+#    plt.show()
+    plt.clf()
+
+    norm_diff = norm_ged_mat - norm_dis_k_mat
+    plt.imshow(norm_diff)
+    plt.colorbar()
+    plt.savefig('results/diff_mat_norm_ged_dis_k.any_costs' + '.eps', format='eps', dpi=300)
+#    plt.savefig('results/diff_mat_norm_ged_dis_k.any_costs' + '.png', format='png')
 #    plt.show()
     plt.clf()
+#    draw_count_bar(norm_diff)
 
 
 def test_cs_leq_ci_plus_cr():
     """c_vs <= c_vi + c_vr, c_es <= c_ei + c_er
     """
-    ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
-          'extra_params': {}}  # node/edge symb
-    Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
+    ds = {'name': 'monoterpenoides',
+          'dataset': '../datasets/monoterpenoides/dataset_10+.ds'}  # node/edge symb
+    Gn, y_all = loadDataset(ds['dataset'])
 #    Gn = Gn[0:10]
-    remove_edges(Gn)
-    gkernel = 'marginalizedkernel'
+    gkernel = 'untilhpathkernel'
+    node_label = 'atom'
+    edge_label = 'bond_type'
     itr_max = 10
-    edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list = \
-        fit_GED_to_kernel_distance(Gn, gkernel, itr_max)
+    edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \
+        nb_cost_mat_list, coef_dk = fit_GED_to_kernel_distance(Gn, node_label, edge_label,
+                                                               gkernel, itr_max,
+                                                               fitkernel='gaussian')
     total_time = np.sum(time_list)
     print('\nedit_costs:', edit_costs)
     print('\nresidual_list:', residual_list)
     print('\nedit_cost_list:', edit_cost_list)
     print('\ndistance matrix in kernel space:', dis_k_mat)
     print('\nged matrix:', ged_mat)
-    print('total time:', total_time)
-    np.savez('results/fit_distance.cs_leq_ci_plus_cr.gm', edit_costs=edit_costs,
+    print('\ntotal time:', total_time)
+    print('\nnb_cost_mat:', nb_cost_mat_list[-1])
+    np.savez('results/fit_distance.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel.gm',
+             edit_costs=edit_costs,
              residual_list=residual_list, edit_cost_list=edit_cost_list,
             dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list,
-             total_time=total_time)
+             total_time=total_time, nb_cost_mat_list=nb_cost_mat_list,
+             coef_dk=coef_dk)
+#    ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
+#          'extra_params': {}}  # node/edge symb
+#    Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
+##    Gn = Gn[0:10]
+##    remove_edges(Gn)
+#    gkernel = 'untilhpathkernel'
+#    node_label = 'atom'
+#    edge_label = 'bond_type'
+#    itr_max = 10
+#    edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \
+#        nb_cost_mat_list, coef_dk = fit_GED_to_kernel_distance(Gn, node_label, edge_label,
+#                                                               gkernel, itr_max)
+#    total_time = np.sum(time_list)
+#    print('\nedit_costs:', edit_costs)
+#    print('\nresidual_list:', residual_list)
+#    print('\nedit_cost_list:', edit_cost_list)
+#    print('\ndistance matrix in kernel space:', dis_k_mat)
+#    print('\nged matrix:', ged_mat)
+#    print('\ntotal time:', total_time)
+#    print('\nnb_cost_mat:', nb_cost_mat_list[-1])
+#    np.savez('results/fit_distance.cs_leq_ci_plus_cr.cost_leq_1en2.mutag.elabeled.uhpkernel.gm',
+#             edit_costs=edit_costs,
+#             residual_list=residual_list, edit_cost_list=edit_cost_list,
+#             dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list,
+#             total_time=total_time, nb_cost_mat_list=nb_cost_mat_list, coef_dk=coef_dk)
 
+#    # normalized distance matrices.
+#    gmfile = np.load('results/fit_distance.cs_leq_ci_plus_cr.cost_leq_1en2.monot.elabeled.uhpkernel.gm.npz')
+#    edit_costs = gmfile['edit_costs']
+#    residual_list = gmfile['residual_list']
+#    edit_cost_list = gmfile['edit_cost_list']
+#    dis_k_mat = gmfile['dis_k_mat']
+#    ged_mat = gmfile['ged_mat']
+#    total_time = gmfile['total_time']
+#    nb_cost_mat_list = gmfile['nb_cost_mat_list']
+#    coef_dk = gmfile['coef_dk']
+    nb_consistent, nb_inconsistent, ratio_consistent = pairwise_substitution_consistence(dis_k_mat, ged_mat)
+    print(nb_consistent, nb_inconsistent, ratio_consistent)
+
+#    dis_k_sub = pairwise_substitution(dis_k_mat)
+#    ged_sub = pairwise_substitution(ged_mat)
+#    np.savez('results/sub_dis_mat.cs_leq_ci_plus_cr.cost_leq_1en2.gm',
+#             dis_k_sub=dis_k_sub, ged_sub=ged_sub)
+
+    norm_dis_k_mat = normalize_distance_matrix(dis_k_mat)
+    plt.imshow(norm_dis_k_mat)
+    plt.colorbar()
+    plt.savefig('results/norm_dis_k_mat.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel'
+                + '.eps', format='eps', dpi=300)
+    plt.savefig('results/norm_dis_k_mat.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel'
+                + '.png', format='png')
+#    plt.show()
+    plt.clf()
+
+    norm_ged_mat = normalize_distance_matrix(ged_mat)
+    plt.imshow(norm_ged_mat)
+    plt.colorbar()
+    plt.savefig('results/norm_ged_mat.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel'
+                + '.eps', format='eps', dpi=300)
+    plt.savefig('results/norm_ged_mat.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel'
+                + '.png', format='png')
+#    plt.show()
+    plt.clf()
+
+    norm_diff = norm_ged_mat - norm_dis_k_mat
+    plt.imshow(norm_diff)
+    plt.colorbar()
+    plt.savefig('results/diff_mat_norm_ged_dis_k.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel'
+                + '.eps', format='eps', dpi=300)
+    plt.savefig('results/diff_mat_norm_ged_dis_k.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel'
+                + '.png', format='png')
+#    plt.show()
+    plt.clf()
+#    draw_count_bar(norm_diff)
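On persistence: np.savez appends .npz to the given name, which is why the results are written to a path ending in '.gm' but read back, in the commented reload blocks, from the same path with '.gm.npz'; the loaded object is then indexed by the keyword names used at save time. A minimal round trip:

    import numpy as np

    np.savez('results_demo.gm', edit_costs=np.array([0.1, 0.2]),
             residual_list=np.array([3.0, 2.5]))
    gmfile = np.load('results_demo.gm.npz')
    print(gmfile['edit_costs'], gmfile['residual_list'])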
+def test_unfitted():
+    """unfitted.
+    """
+    from fitDistance import compute_geds
+    from utils import kernel_distance_matrix
+    ds = {'name': 'monoterpenoides',
+          'dataset': '../datasets/monoterpenoides/dataset_10+.ds'}  # node/edge symb
+    Gn, y_all = loadDataset(ds['dataset'])
+#    Gn = Gn[0:10]
+    gkernel = 'untilhpathkernel'
+    node_label = 'atom'
+    edge_label = 'bond_type'
+
+#    ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
+#          'extra_params': {}}  # node/edge symb
+#    Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
+##    Gn = Gn[0:10]
+##    remove_edges(Gn)
+#    gkernel = 'marginalizedkernel'
+
+    dis_k_mat, _, _, _ = kernel_distance_matrix(Gn, node_label, edge_label, gkernel=gkernel)
+    ged_all, ged_mat, n_edit_operations = compute_geds(Gn, [3, 3, 1, 3, 3, 1],
+                                                       [0, 1, 2, 3, 4, 5], parallel=True)
+    print('\ndistance matrix in kernel space:', dis_k_mat)
+    print('\nged matrix:', ged_mat)
+#    np.savez('results/fit_distance.cs_leq_ci_plus_cr.cost_leq_1en2.gm', edit_costs=edit_costs,
+#             residual_list=residual_list, edit_cost_list=edit_cost_list,
+#             dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list,
+#             total_time=total_time, nb_cost_mat_list=nb_cost_mat_list)
 
     # normalized distance matrices.
-#    gmfile = np.load('results/fit_distance.cs_leq_ci_plus_cr.gm.npz')
+#    gmfile = np.load('results/fit_distance.cs_leq_ci_plus_cr.cost_leq_1en3.gm.npz')
 #    edit_costs = gmfile['edit_costs']
 #    residual_list = gmfile['residual_list']
 #    edit_cost_list = gmfile['edit_cost_list']
 #    dis_k_mat = gmfile['dis_k_mat']
 #    ged_mat = gmfile['ged_mat']
 #    total_time = gmfile['total_time']
+#    nb_cost_mat_list = gmfile['nb_cost_mat_list']
+
+    nb_consistent, nb_inconsistent, ratio_consistent = pairwise_substitution_consistence(dis_k_mat, ged_mat)
+    print(nb_consistent, nb_inconsistent, ratio_consistent)
+
     norm_dis_k_mat = normalize_distance_matrix(dis_k_mat)
     plt.imshow(norm_dis_k_mat)
     plt.colorbar()
-    plt.savefig('results/norm_dis_k_mat.cs_leq_ci_plus_cr' + '.eps', format='eps', dpi=300)
-#    plt.savefig('results/norm_dis_k_mat.cs_leq_ci_plus_cr' + '.jpg', format='jpg')
+    plt.savefig('results/norm_dis_k_mat.unfitted.MUTAG' + '.eps', format='eps', dpi=300)
+    plt.savefig('results/norm_dis_k_mat.unfitted.MUTAG' + '.png', format='png')
 #    plt.show()
     plt.clf()
 
     norm_ged_mat = normalize_distance_matrix(ged_mat)
     plt.imshow(norm_ged_mat)
     plt.colorbar()
-    plt.savefig('results/norm_ged_mat.cs_leq_ci_plus_cr' + '.eps', format='eps', dpi=300)
-#    plt.savefig('results/norm_ged_mat.cs_leq_ci_plus_cr' + '.jpg', format='jpg')
+    plt.savefig('results/norm_ged_mat.unfitted.MUTAG' + '.eps', format='eps', dpi=300)
+    plt.savefig('results/norm_ged_mat.unfitted.MUTAG' + '.png', format='png')
+#    plt.show()
+    plt.clf()
+
+    norm_diff = norm_ged_mat - norm_dis_k_mat
+    plt.imshow(norm_diff)
+    plt.colorbar()
+    plt.savefig('results/diff_mat_norm_ged_dis_k.unfitted.MUTAG' + '.eps', format='eps', dpi=300)
+    plt.savefig('results/diff_mat_norm_ged_dis_k.unfitted.MUTAG' + '.png', format='png')
 #    plt.show()
     plt.clf()
+    draw_count_bar(norm_diff)
+def pairwise_substitution_consistence(mat1, mat2):
+    """
+    """
+    nb_consistent = 0
+    nb_inconsistent = 0
+    # the matrices are considered symmetric; take the same (upper) triangle of
+    # both so that entries at the same vector index refer to the same pair.
+    upper_tri1 = mat1[np.triu_indices_from(mat1)]
+    upper_tri2 = mat2[np.triu_indices_from(mat2)]
+    for i in tqdm(range(len(upper_tri1)), desc='computing consistence', file=sys.stdout):
+        for j in range(i, len(upper_tri1)):
+            if np.sign(upper_tri1[i] - upper_tri1[j]) == np.sign(upper_tri2[i] - upper_tri2[j]):
+                nb_consistent += 1
+            else:
+                nb_inconsistent += 1
+    return nb_consistent, nb_inconsistent, nb_consistent / (nb_consistent + nb_inconsistent)
+
+
+def pairwise_substitution(mat):
+    # the matrix is considered symmetric.
+    upper_tri = mat[np.triu_indices_from(mat)]
+    sub_list = []
+    for i in tqdm(range(len(upper_tri)), desc='computing', file=sys.stdout):
+        for j in range(i, len(upper_tri)):
+            sub_list.append(upper_tri[i] - upper_tri[j])
+    return sub_list
+
+
+def draw_count_bar(norm_diff):
+    import pandas
+    from collections import Counter, OrderedDict
+    norm_diff_cnt = norm_diff.flatten()
+    norm_diff_cnt = norm_diff_cnt * 10
+    norm_diff_cnt = np.floor(norm_diff_cnt)
+    norm_diff_cnt = Counter(norm_diff_cnt)
+    norm_diff_cnt = OrderedDict(sorted(norm_diff_cnt.items()))
+    df = pandas.DataFrame.from_dict(norm_diff_cnt, orient='index')
+    df.plot(kind='bar')
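pairwise_substitution_consistence counts, over all index pairs of the vectorized upper triangles, how often the two matrices order their entries the same way, giving a sign-agreement score between kernel distances and GEDs, where 1.0 means perfectly consistent rankings. A tiny usage sketch on hand-made symmetric matrices:

    mat1 = np.array([[0.0, 1.0, 2.0],
                     [1.0, 0.0, 3.0],
                     [2.0, 3.0, 0.0]])
    mat2 = np.array([[0.0, 2.0, 5.0],
                     [2.0, 0.0, 9.0],
                     [5.0, 9.0, 0.0]])  # entries ranked exactly like mat1
    nb_c, nb_i, ratio = pairwise_substitution_consistence(mat1, mat2)
    print(nb_c, nb_i, ratio)  # fully consistent orderings give ratio == 1.0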
 if __name__ == '__main__':
-    test_anycosts()
-    test_cs_leq_ci_plus_cr()
+#    test_anycosts()
+#    test_cs_leq_ci_plus_cr()
+    test_unfitted()
+
+#    x = np.array([[1,2,3],[4,5,6],[7,8,9]])
+#    xx = pairwise_substitution(x)
@@ -17,9 +17,363 @@ import random
 import sys
 sys.path.insert(0, "../")
 from pygraph.utils.graphfiles import loadDataset
+#from pygraph.utils.logger2file import *
 from iam import iam_upgraded
-from utils import remove_edges, compute_kernel, get_same_item_indices
-from ged import ged_median
+from utils import remove_edges, compute_kernel, get_same_item_indices, dis_gstar
+#from ged import ged_median
| def test_iam_monoterpenoides(): | |||||
| ds = {'name': 'monoterpenoides', | |||||
| 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb | |||||
| Gn, y_all = loadDataset(ds['dataset']) | |||||
| # Gn = Gn[0:50] | |||||
| gkernel = 'untilhpathkernel' | |||||
| node_label = 'atom' | |||||
| edge_label = 'bond_type' | |||||
| # parameters for GED function from the IAM paper. | |||||
| # fitted edit costs (Gaussian). | |||||
| c_vi = 0.03620133402089074 | |||||
| c_vr = 0.0417574590207099 | |||||
| c_vs = 0.009992282328587499 | |||||
| c_ei = 0.08293120042342755 | |||||
| c_er = 0.09512220476358019 | |||||
| c_es = 0.09222529696841467 | |||||
| # # fitted edit costs (linear combinations). | |||||
| # c_vi = 0.1749684054238749 | |||||
| # c_vr = 0.0734054228711457 | |||||
| # c_vs = 0.05017781726016715 | |||||
| # c_ei = 0.1869431164806936 | |||||
| # c_er = 0.32055856948274 | |||||
| # c_es = 0.2569469379247611 | |||||
| # # unfitted edit costs. | |||||
| # c_vi = 3 | |||||
| # c_vr = 3 | |||||
| # c_vs = 1 | |||||
| # c_ei = 3 | |||||
| # c_er = 3 | |||||
| # c_es = 1 | |||||
| ite_max_iam = 50 | |||||
| epsilon_iam = 0.001 | |||||
| removeNodes = False | |||||
| connected_iam = False | |||||
| # parameters for IAM function | |||||
| # ged_cost = 'CONSTANT' | |||||
| ged_cost = 'CONSTANT' | |||||
| ged_method = 'IPFP' | |||||
| edit_cost_constant = [c_vi, c_vr, c_vs, c_ei, c_er, c_es] | |||||
| # edit_cost_constant = [] | |||||
| ged_stabilizer = 'min' | |||||
| ged_repeat = 50 | |||||
| params_ged = {'lib': 'gedlibpy', 'cost': ged_cost, 'method': ged_method, | |||||
| 'edit_cost_constant': edit_cost_constant, | |||||
| 'stabilizer': ged_stabilizer, 'repeat': ged_repeat} | |||||
| # classify graphs according to letters. | |||||
| time_list = [] | |||||
| dis_ks_min_list = [] | |||||
| dis_ks_set_median_list = [] | |||||
| sod_gs_list = [] | |||||
| g_best = [] | |||||
| sod_set_median_list = [] | |||||
| sod_list_list = [] | |||||
| idx_dict = get_same_item_indices(y_all) | |||||
| for y_class in idx_dict: | |||||
| print('\n-------------------------------------------------------') | |||||
| print('class of y:', y_class) | |||||
| Gn_class = [Gn[i].copy() for i in idx_dict[y_class]] | |||||
| time_list.append([]) | |||||
| dis_ks_min_list.append([]) | |||||
| dis_ks_set_median_list.append([]) | |||||
| sod_gs_list.append([]) | |||||
| g_best.append([]) | |||||
| sod_set_median_list.append([]) | |||||
| for repeat in range(50): | |||||
| idx_rdm = random.sample(range(len(Gn_class)), 10) | |||||
| print('graphs chosen:', idx_rdm) | |||||
| Gn_median = [Gn_class[idx].copy() for idx in idx_rdm] | |||||
| Gn_candidate = [g.copy() for g in Gn_median] | |||||
| alpha_range = [1 / len(Gn_median)] * len(Gn_median) | |||||
| time0 = time.time() | |||||
| G_gen_median_list, sod_gen_median, sod_list, G_set_median_list, sod_set_median \ | |||||
| = iam_upgraded(Gn_median, | |||||
| Gn_candidate, c_ei=c_ei, c_er=c_er, c_es=c_es, ite_max=ite_max_iam, | |||||
| epsilon=epsilon_iam, connected=connected_iam, removeNodes=removeNodes, | |||||
| params_ged=params_ged) | |||||
| time_total = time.time() - time0 | |||||
| print('\ntime: ', time_total) | |||||
| time_list[-1].append(time_total) | |||||
| g_best[-1].append(G_gen_median_list[0]) | |||||
| sod_set_median_list[-1].append(sod_set_median) | |||||
| print('\nsmallest sod of the set median:', sod_set_median) | |||||
| sod_gs_list[-1].append(sod_gen_median) | |||||
| print('\nsmallest sod in graph space:', sod_gen_median) | |||||
| sod_list_list.append(sod_list) | |||||
| # show the best graph and save it to file. | |||||
| print('one of the possible corresponding pre-images is') | |||||
| nx.draw(G_gen_median_list[0], labels=nx.get_node_attributes(G_gen_median_list[0], 'atom'), | |||||
| with_labels=True) | |||||
| # plt.show() | |||||
| # plt.savefig('results/iam/mutag_median.fit_costs2.001.nb' + str(nb_median) + | |||||
| # plt.savefig('results/iam/paper_compare/monoter_y' + str(y_class) + | |||||
| # '_repeat' + str(repeat) + '_' + str(time.time()) + | |||||
| # '.png', format="PNG") | |||||
| plt.clf() | |||||
| # print(G_gen_median_list[0].nodes(data=True)) | |||||
| # print(G_gen_median_list[0].edges(data=True)) | |||||
| # compute distance between \psi and the set median graph. | |||||
| knew_set_median = compute_kernel(G_set_median_list + Gn_median, | |||||
| gkernel, node_label, edge_label, False) | |||||
| dhat_new_set_median_list = [] | |||||
| for idx, g_tmp in enumerate(G_set_median_list): | |||||
| # @todo: the term3 below could use the one at the beginning of the function. | |||||
| dhat_new_set_median_list.append(dis_gstar(idx, range(len(G_set_median_list), | |||||
| len(G_set_median_list) + len(Gn_median) + 1), | |||||
| alpha_range, knew_set_median, withterm3=False)) | |||||
| print('\ndistance in kernel space of set median: ', dhat_new_set_median_list[0]) | |||||
| dis_ks_set_median_list[-1].append(dhat_new_set_median_list[0]) | |||||
| # compute distance between \psi and the new generated graphs. | |||||
| knew = compute_kernel(G_gen_median_list + Gn_median, gkernel, node_label, | |||||
| edge_label, False) | |||||
| dhat_new_list = [] | |||||
| for idx, g_tmp in enumerate(G_gen_median_list): | |||||
| # @todo: the term3 below could use the one at the beginning of the function. | |||||
| dhat_new_list.append(dis_gstar(idx, range(len(G_gen_median_list), | |||||
| len(G_gen_median_list) + len(Gn_median) + 1), | |||||
| alpha_range, knew, withterm3=False)) | |||||
| print('\nsmallest distance in kernel space: ', dhat_new_list[0]) | |||||
| dis_ks_min_list[-1].append(dhat_new_list[0]) | |||||
| print('\nsods of the set median for this class:', sod_set_median_list[-1]) | |||||
| print('\nsods in graph space for this class:', sod_gs_list[-1]) | |||||
| print('\ndistance in kernel space of set median for this class:', | |||||
| dis_ks_set_median_list[-1]) | |||||
| print('\nsmallest distances in kernel space for this class:', | |||||
| dis_ks_min_list[-1]) | |||||
| print('\ntimes for this class:', time_list[-1]) | |||||
| sod_set_median_list[-1] = np.mean(sod_set_median_list[-1]) | |||||
| sod_gs_list[-1] = np.mean(sod_gs_list[-1]) | |||||
| dis_ks_set_median_list[-1] = np.mean(dis_ks_set_median_list[-1]) | |||||
| dis_ks_min_list[-1] = np.mean(dis_ks_min_list[-1]) | |||||
| time_list[-1] = np.mean(time_list[-1]) | |||||
| print() | |||||
| print('\nmean sods of the set median for each class:', sod_set_median_list) | |||||
| print('\nmean sods in graph space for each class:', sod_gs_list) | |||||
| print('\ndistances in kernel space of set median for each class:', | |||||
| dis_ks_set_median_list) | |||||
| print('\nmean smallest distances in kernel space for each class:', | |||||
| dis_ks_min_list) | |||||
| print('\nmean times for each class:', time_list) | |||||
| print('\nmean sods of the set median of all:', np.mean(sod_set_median_list)) | |||||
| print('\nmean sods in graph space of all:', np.mean(sod_gs_list)) | |||||
| print('\nmean distances in kernel space of set median of all:', | |||||
| np.mean(dis_ks_set_median_list)) | |||||
| print('\nmean smallest distances in kernel space of all:', | |||||
| np.mean(dis_ks_min_list)) | |||||
| print('\nmean times of all:', np.mean(time_list)) | |||||
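# count, over all collected sod lists, how often the sod decreased from the
# first IAM iteration to the last (improved), increased (worsened), or
# stayed the same.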
| nb_better_sods = 0 | |||||
| nb_worse_sods = 0 | |||||
| nb_same_sods = 0 | |||||
| for sods in sod_list_list: | |||||
| if sods[0] > sods[-1]: | |||||
| nb_better_sods += 1 | |||||
| elif sods[0] < sods[-1]: | |||||
| nb_worse_sods += 1 | |||||
| else: | |||||
| nb_same_sods += 1 | |||||
print('\nOut of', str(len(sod_list_list)), 'sod lists,', str(nb_better_sods),
      'improved,', str(nb_worse_sods), 'worsened, and', str(nb_same_sods),
      'stayed unchanged;', str(nb_better_sods / len(sod_list_list)),
      'of the sods improved.')
| def test_iam_mutag(): | |||||
| ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt', | |||||
| 'extra_params': {}} # node/edge symb | |||||
| Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params']) | |||||
| # Gn = Gn[0:50] | |||||
| gkernel = 'untilhpathkernel' | |||||
| node_label = 'atom' | |||||
| edge_label = 'bond_type' | |||||
| # parameters for GED function from the IAM paper. | |||||
| # fitted edit costs. | |||||
| c_vi = 0.03523843108436513 | |||||
| c_vr = 0.03347339739350128 | |||||
| c_vs = 0.06871290673612238 | |||||
| c_ei = 0.08591999846720685 | |||||
| c_er = 0.07962086440894103 | |||||
| c_es = 0.08596855855478233 | |||||
| # unfitted edit costs. | |||||
| # c_vi = 3 | |||||
| # c_vr = 3 | |||||
| # c_vs = 1 | |||||
| # c_ei = 3 | |||||
| # c_er = 3 | |||||
| # c_es = 1 | |||||
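# parameters for IAM function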
| ite_max_iam = 50 | |||||
| epsilon_iam = 0.001 | |||||
| removeNodes = False | |||||
| connected_iam = False | |||||
# parameters for GED function
ged_cost = 'CONSTANT'
| ged_method = 'IPFP' | |||||
| edit_cost_constant = [c_vi, c_vr, c_vs, c_ei, c_er, c_es] | |||||
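# cost order: vertex insertion, removal, substitution, then edge insertion,
# removal, substitution (as the variable names suggest).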
| # edit_cost_constant = [] | |||||
| ged_stabilizer = 'min' | |||||
| ged_repeat = 50 | |||||
| params_ged = {'lib': 'gedlibpy', 'cost': ged_cost, 'method': ged_method, | |||||
| 'edit_cost_constant': edit_cost_constant, | |||||
| 'stabilizer': ged_stabilizer, 'repeat': ged_repeat} | |||||
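# 'stabilizer': 'min' with 'repeat': 50 presumably reruns the randomized IPFP
# matching 50 times and keeps the smallest edit distance found.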
| # classify graphs according to letters. | |||||
| time_list = [] | |||||
| dis_ks_min_list = [] | |||||
| dis_ks_set_median_list = [] | |||||
| sod_gs_list = [] | |||||
| g_best = [] | |||||
| sod_set_median_list = [] | |||||
| sod_list_list = [] | |||||
| idx_dict = get_same_item_indices(y_all) | |||||
| for y_class in idx_dict: | |||||
| print('\n-------------------------------------------------------') | |||||
| print('class of y:', y_class) | |||||
| Gn_class = [Gn[i].copy() for i in idx_dict[y_class]] | |||||
| time_list.append([]) | |||||
| dis_ks_min_list.append([]) | |||||
| dis_ks_set_median_list.append([]) | |||||
| sod_gs_list.append([]) | |||||
| g_best.append([]) | |||||
| sod_set_median_list.append([]) | |||||
| for repeat in range(50): | |||||
| idx_rdm = random.sample(range(len(Gn_class)), 10) | |||||
| print('graphs chosen:', idx_rdm) | |||||
| Gn_median = [Gn_class[idx].copy() for idx in idx_rdm] | |||||
| Gn_candidate = [g.copy() for g in Gn_median] | |||||
| alpha_range = [1 / len(Gn_median)] * len(Gn_median) | |||||
| time0 = time.time() | |||||
| G_gen_median_list, sod_gen_median, sod_list, G_set_median_list, sod_set_median \ | |||||
| = iam_upgraded(Gn_median, | |||||
| Gn_candidate, c_ei=c_ei, c_er=c_er, c_es=c_es, ite_max=ite_max_iam, | |||||
| epsilon=epsilon_iam, connected=connected_iam, removeNodes=removeNodes, | |||||
| params_ged=params_ged) | |||||
| time_total = time.time() - time0 | |||||
| print('\ntime: ', time_total) | |||||
| time_list[-1].append(time_total) | |||||
| g_best[-1].append(G_gen_median_list[0]) | |||||
| sod_set_median_list[-1].append(sod_set_median) | |||||
| print('\nsmallest sod of the set median:', sod_set_median) | |||||
| sod_gs_list[-1].append(sod_gen_median) | |||||
| print('\nsmallest sod in graph space:', sod_gen_median) | |||||
| sod_list_list.append(sod_list) | |||||
# show the best graph; optionally save it to file.
| print('one of the possible corresponding pre-images is') | |||||
| nx.draw(G_gen_median_list[0], labels=nx.get_node_attributes(G_gen_median_list[0], 'atom'), | |||||
| with_labels=True) | |||||
| # plt.show() | |||||
| # plt.savefig('results/iam/mutag_median.fit_costs2.001.nb' + str(nb_median) + | |||||
| # plt.savefig('results/iam/paper_compare/mutag_y' + str(y_class) + | |||||
| # '_repeat' + str(repeat) + '_' + str(time.time()) + | |||||
| # '.png', format="PNG") | |||||
| plt.clf() | |||||
| # print(G_gen_median_list[0].nodes(data=True)) | |||||
| # print(G_gen_median_list[0].edges(data=True)) | |||||
| # compute distance between \psi and the set median graph. | |||||
| knew_set_median = compute_kernel(G_set_median_list + Gn_median, | |||||
| gkernel, node_label, edge_label, False) | |||||
| dhat_new_set_median_list = [] | |||||
| for idx, g_tmp in enumerate(G_set_median_list): | |||||
| # @todo: the term3 below could use the one at the beginning of the function. | |||||
| dhat_new_set_median_list.append(dis_gstar(idx, range(len(G_set_median_list), | |||||
| len(G_set_median_list) + len(Gn_median) + 1), | |||||
| alpha_range, knew_set_median, withterm3=False)) | |||||
| print('\ndistance in kernel space of set median: ', dhat_new_set_median_list[0]) | |||||
| dis_ks_set_median_list[-1].append(dhat_new_set_median_list[0]) | |||||
| # compute distance between \psi and the new generated graphs. | |||||
| knew = compute_kernel(G_gen_median_list + Gn_median, gkernel, node_label, | |||||
| edge_label, False) | |||||
| dhat_new_list = [] | |||||
| for idx, g_tmp in enumerate(G_gen_median_list): | |||||
| # @todo: the term3 below could use the one at the beginning of the function. | |||||
| dhat_new_list.append(dis_gstar(idx, range(len(G_gen_median_list), | |||||
| len(G_gen_median_list) + len(Gn_median) + 1), | |||||
| alpha_range, knew, withterm3=False)) | |||||
| print('\nsmallest distance in kernel space: ', dhat_new_list[0]) | |||||
| dis_ks_min_list[-1].append(dhat_new_list[0]) | |||||
| print('\nsods of the set median for this class:', sod_set_median_list[-1]) | |||||
| print('\nsods in graph space for this class:', sod_gs_list[-1]) | |||||
| print('\ndistance in kernel space of set median for this class:', | |||||
| dis_ks_set_median_list[-1]) | |||||
| print('\nsmallest distances in kernel space for this class:', | |||||
| dis_ks_min_list[-1]) | |||||
| print('\ntimes for this class:', time_list[-1]) | |||||
| sod_set_median_list[-1] = np.mean(sod_set_median_list[-1]) | |||||
| sod_gs_list[-1] = np.mean(sod_gs_list[-1]) | |||||
| dis_ks_set_median_list[-1] = np.mean(dis_ks_set_median_list[-1]) | |||||
| dis_ks_min_list[-1] = np.mean(dis_ks_min_list[-1]) | |||||
| time_list[-1] = np.mean(time_list[-1]) | |||||
| print() | |||||
| print('\nmean sods of the set median for each class:', sod_set_median_list) | |||||
| print('\nmean sods in graph space for each class:', sod_gs_list) | |||||
| print('\ndistances in kernel space of set median for each class:', | |||||
| dis_ks_set_median_list) | |||||
| print('\nmean smallest distances in kernel space for each class:', | |||||
| dis_ks_min_list) | |||||
| print('\nmean times for each class:', time_list) | |||||
| print('\nmean sods of the set median of all:', np.mean(sod_set_median_list)) | |||||
| print('\nmean sods in graph space of all:', np.mean(sod_gs_list)) | |||||
| print('\nmean distances in kernel space of set median of all:', | |||||
| np.mean(dis_ks_set_median_list)) | |||||
| print('\nmean smallest distances in kernel space of all:', | |||||
| np.mean(dis_ks_min_list)) | |||||
| print('\nmean times of all:', np.mean(time_list)) | |||||
| nb_better_sods = 0 | |||||
| nb_worse_sods = 0 | |||||
| nb_same_sods = 0 | |||||
| for sods in sod_list_list: | |||||
| if sods[0] > sods[-1]: | |||||
| nb_better_sods += 1 | |||||
| elif sods[0] < sods[-1]: | |||||
| nb_worse_sods += 1 | |||||
| else: | |||||
| nb_same_sods += 1 | |||||
print('\nOut of', str(len(sod_list_list)), 'sod lists,', str(nb_better_sods),
      'improved,', str(nb_worse_sods), 'worsened, and', str(nb_same_sods),
      'stayed unchanged;', str(nb_better_sods / len(sod_list_list)),
      'of the sods improved.')
| ############################################################################### | ############################################################################### | ||||
| # tests on different numbers of median-sets. | # tests on different numbers of median-sets. | ||||
| @@ -33,46 +387,352 @@ def test_iam_median_nb(): | |||||
| remove_edges(Gn) | remove_edges(Gn) | ||||
| gkernel = 'marginalizedkernel' | gkernel = 'marginalizedkernel' | ||||
# lmbda = 0.03 # termination probability
# r_max = 10 # iteration limit for pre-image.
# alpha_range = np.linspace(0.5, 0.5, 1)
# k = 5 # k nearest neighbors
# epsilon = 1e-6
# InitIAMWithAllDk = True
lmbda = 0.03 # termination probability
| # # parameters for GED function | |||||
| # c_vi = 0.037 | |||||
| # c_vr = 0.038 | |||||
| # c_vs = 0.075 | |||||
| # c_ei = 0.001 | |||||
| # c_er = 0.001 | |||||
| # c_es = 0.0 | |||||
| # ite_max_iam = 50 | |||||
| # epsilon_iam = 0.001 | |||||
| # removeNodes = False | |||||
| # connected_iam = False | |||||
| # # parameters for IAM function | |||||
| # ged_cost = 'CONSTANT' | |||||
| # ged_method = 'IPFP' | |||||
| # edit_cost_constant = [c_vi, c_vr, c_vs, c_ei, c_er, c_es] | |||||
| # ged_stabilizer = 'min' | |||||
| # ged_repeat = 50 | |||||
| # params_ged = {'lib': 'gedlibpy', 'cost': ged_cost, 'method': ged_method, | |||||
| # 'edit_cost_constant': edit_cost_constant, | |||||
| # 'stabilizer': ged_stabilizer, 'repeat': ged_repeat} | |||||
| # parameters for GED function | # parameters for GED function | ||||
| ged_cost='CHEM_1' | |||||
| ged_method='IPFP' | |||||
| saveGXL='gedlib' | |||||
| # parameters for IAM function | |||||
| c_ei=1 | |||||
| c_er=1 | |||||
| c_es=1 | |||||
| c_vi = 4 | |||||
| c_vr = 4 | |||||
| c_vs = 2 | |||||
| c_ei = 1 | |||||
| c_er = 1 | |||||
| c_es = 1 | |||||
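# parameters for IAM function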
| ite_max_iam = 50 | ite_max_iam = 50 | ||||
| epsilon_iam = 0.001 | epsilon_iam = 0.001 | ||||
| removeNodes = False | removeNodes = False | ||||
| connected_iam = False | connected_iam = False | ||||
# number of graphs; we want to compute the median of these graphs.
| nb_median_range = [2, 3, 4, 5, 10, 20, 30, 40, 50, 100] | |||||
# parameters for GED function
| ged_cost = 'CHEM_1' | |||||
| ged_method = 'IPFP' | |||||
| edit_cost_constant = [] | |||||
| ged_stabilizer = 'min' | |||||
| ged_repeat = 50 | |||||
| params_ged = {'lib': 'gedlibpy', 'cost': ged_cost, 'method': ged_method, | |||||
| 'edit_cost_constant': edit_cost_constant, | |||||
| 'stabilizer': ged_stabilizer, 'repeat': ged_repeat} | |||||
| # find out all the graphs classified to positive group 1. | # find out all the graphs classified to positive group 1. | ||||
| idx_dict = get_same_item_indices(y_all) | idx_dict = get_same_item_indices(y_all) | ||||
| Gn = [Gn[i] for i in idx_dict[1]] | Gn = [Gn[i] for i in idx_dict[1]] | ||||
# number of graphs; we want to compute the median of these graphs.
| # nb_median_range = [2, 3, 4, 5, 10, 20, 30, 40, 50, 100] | |||||
| nb_median_range = [len(Gn)] | |||||
| # # compute Gram matrix. | # # compute Gram matrix. | ||||
| # time0 = time.time() | # time0 = time.time() | ||||
| # km = compute_kernel(Gn, gkernel, True) | # km = compute_kernel(Gn, gkernel, True) | ||||
| # time_km = time.time() - time0 | # time_km = time.time() - time0 | ||||
| # # write Gram matrix to file. | # # write Gram matrix to file. | ||||
| # np.savez('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm', gm=km, gmtime=time_km) | # np.savez('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm', gm=km, gmtime=time_km) | ||||
| time_list = [] | |||||
| dis_ks_min_list = [] | |||||
| sod_gs_list = [] | |||||
| # sod_gs_min_list = [] | |||||
| # nb_updated_list = [] | |||||
| # nb_updated_k_list = [] | |||||
| g_best = [] | |||||
| for nb_median in nb_median_range: | |||||
| print('\n-------------------------------------------------------') | |||||
| print('number of median graphs =', nb_median) | |||||
| random.seed(1) | |||||
| idx_rdm = random.sample(range(len(Gn)), nb_median) | |||||
| print('graphs chosen:', idx_rdm) | |||||
| Gn_median = [Gn[idx].copy() for idx in idx_rdm] | |||||
| Gn_candidate = [g.copy() for g in Gn] | |||||
| # for g in Gn_median: | |||||
| # nx.draw(g, labels=nx.get_node_attributes(g, 'atom'), with_labels=True) | |||||
| ## plt.savefig("results/preimage_mix/mutag.png", format="PNG") | |||||
| # plt.show() | |||||
| # plt.clf() | |||||
| ################################################################### | |||||
| # gmfile = np.load('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm.npz') | |||||
| # km_tmp = gmfile['gm'] | |||||
| # time_km = gmfile['gmtime'] | |||||
| # # modify mixed gram matrix. | |||||
| # km = np.zeros((len(Gn) + nb_median, len(Gn) + nb_median)) | |||||
| # for i in range(len(Gn)): | |||||
| # for j in range(i, len(Gn)): | |||||
| # km[i, j] = km_tmp[i, j] | |||||
| # km[j, i] = km[i, j] | |||||
| # for i in range(len(Gn)): | |||||
| # for j, idx in enumerate(idx_rdm): | |||||
| # km[i, len(Gn) + j] = km[i, idx] | |||||
| # km[len(Gn) + j, i] = km[i, idx] | |||||
| # for i, idx1 in enumerate(idx_rdm): | |||||
| # for j, idx2 in enumerate(idx_rdm): | |||||
| # km[len(Gn) + i, len(Gn) + j] = km[idx1, idx2] | |||||
| ################################################################### | |||||
| alpha_range = [1 / nb_median] * nb_median | |||||
| time0 = time.time() | |||||
| ghat_new_list, sod_min = iam_upgraded(Gn_median, Gn_candidate, | |||||
| c_ei=c_ei, c_er=c_er, c_es=c_es, ite_max=ite_max_iam, | |||||
| epsilon=epsilon_iam, connected=connected_iam, removeNodes=removeNodes, | |||||
| params_ged=params_ged) | |||||
| time_total = time.time() - time0 | |||||
| print('\ntime: ', time_total) | |||||
| time_list.append(time_total) | |||||
| # compute distance between \psi and the new generated graphs. | |||||
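# withterm3=False drops the candidate-independent term from the squared
# distance, which leaves the ranking of the candidate graphs unchanged.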
# pass the label names explicitly to match the refactored compute_kernel
# signature; 'atom'/None mirror the previous hard-coded defaults.
knew = compute_kernel(ghat_new_list + Gn_median, gkernel, 'atom', None, False)
| dhat_new_list = [] | |||||
| for idx, g_tmp in enumerate(ghat_new_list): | |||||
| # @todo: the term3 below could use the one at the beginning of the function. | |||||
| dhat_new_list.append(dis_gstar(idx, range(len(ghat_new_list), | |||||
| len(ghat_new_list) + len(Gn_median) + 1), | |||||
| alpha_range, knew, withterm3=False)) | |||||
| print('\nsmallest distance in kernel space: ', dhat_new_list[0]) | |||||
| dis_ks_min_list.append(dhat_new_list[0]) | |||||
| g_best.append(ghat_new_list[0]) | |||||
| # show the best graph and save it to file. | |||||
| # print('the shortest distance is', dhat) | |||||
| print('one of the possible corresponding pre-images is') | |||||
| nx.draw(ghat_new_list[0], labels=nx.get_node_attributes(ghat_new_list[0], 'atom'), | |||||
| with_labels=True) | |||||
| plt.show() | |||||
| # plt.savefig('results/iam/mutag_median.fit_costs2.001.nb' + str(nb_median) + | |||||
| plt.savefig('results/iam/mutag_median_unfit2.nb' + str(nb_median) + | |||||
| '.png', format="PNG") | |||||
| plt.clf() | |||||
| # print(ghat_list[0].nodes(data=True)) | |||||
| # print(ghat_list[0].edges(data=True)) | |||||
| sod_gs_list.append(sod_min) | |||||
| # sod_gs_min_list.append(np.min(sod_min)) | |||||
| print('\nsmallest sod in graph space: ', sod_min) | |||||
| print('\nsods in graph space: ', sod_gs_list) | |||||
| # print('\nsmallest sod in graph space for each set of median graphs: ', sod_gs_min_list) | |||||
| print('\nsmallest distance in kernel space for each set of median graphs: ', | |||||
| dis_ks_min_list) | |||||
| # print('\nnumber of updates of the best graph for each set of median graphs by IAM: ', | |||||
| # nb_updated_list) | |||||
| # print('\nnumber of updates of k nearest graphs for each set of median graphs by IAM: ', | |||||
| # nb_updated_k_list) | |||||
| print('\ntimes:', time_list) | |||||
| def test_iam_letter_h(): | |||||
| from median import draw_Letter_graph | |||||
| ds = {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt', | |||||
| 'extra_params': {}} # node nsymb | |||||
| # ds = {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt', | |||||
| # 'extra_params': {}} # node nsymb | |||||
| # Gn = Gn[0:50] | |||||
| Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params']) | |||||
| gkernel = 'structuralspkernel' | |||||
| # parameters for GED function from the IAM paper. | |||||
| c_vi = 3 | |||||
| c_vr = 3 | |||||
| c_vs = 1 | |||||
| c_ei = 3 | |||||
| c_er = 3 | |||||
| c_es = 1 | |||||
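# parameters for IAM function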
| ite_max_iam = 50 | |||||
| epsilon_iam = 0.001 | |||||
| removeNodes = False | |||||
| connected_iam = False | |||||
# parameters for GED function
| # ged_cost = 'CONSTANT' | |||||
| ged_cost = 'LETTER' | |||||
| ged_method = 'IPFP' | |||||
| # edit_cost_constant = [c_vi, c_vr, c_vs, c_ei, c_er, c_es] | |||||
| edit_cost_constant = [] | |||||
| ged_stabilizer = 'min' | |||||
| ged_repeat = 50 | |||||
| params_ged = {'lib': 'gedlibpy', 'cost': ged_cost, 'method': ged_method, | |||||
| 'edit_cost_constant': edit_cost_constant, | |||||
| 'stabilizer': ged_stabilizer, 'repeat': ged_repeat} | |||||
| # classify graphs according to letters. | |||||
| time_list = [] | |||||
| dis_ks_min_list = [] | |||||
| sod_gs_list = [] | |||||
| g_best = [] | |||||
| sod_set_median_list = [] | |||||
| idx_dict = get_same_item_indices(y_all) | |||||
| for letter in idx_dict: | |||||
| print('\n-------------------------------------------------------') | |||||
| print('letter', letter) | |||||
| Gn_let = [Gn[i].copy() for i in idx_dict[letter]] | |||||
| time_list.append([]) | |||||
| dis_ks_min_list.append([]) | |||||
| sod_gs_list.append([]) | |||||
| g_best.append([]) | |||||
| sod_set_median_list.append([]) | |||||
| for repeat in range(50): | |||||
| idx_rdm = random.sample(range(len(Gn_let)), 50) | |||||
| print('graphs chosen:', idx_rdm) | |||||
| Gn_median = [Gn_let[idx].copy() for idx in idx_rdm] | |||||
| Gn_candidate = [g.copy() for g in Gn_median] | |||||
| alpha_range = [1 / len(Gn_median)] * len(Gn_median) | |||||
| time0 = time.time() | |||||
| ghat_new_list, sod_min, sod_set_median = iam_upgraded(Gn_median, | |||||
| Gn_candidate, c_ei=c_ei, c_er=c_er, c_es=c_es, ite_max=ite_max_iam, | |||||
| epsilon=epsilon_iam, connected=connected_iam, removeNodes=removeNodes, | |||||
| params_ged=params_ged) | |||||
| time_total = time.time() - time0 | |||||
| print('\ntime: ', time_total) | |||||
| time_list[-1].append(time_total) | |||||
| g_best[-1].append(ghat_new_list[0]) | |||||
| sod_set_median_list[-1].append(sod_set_median) | |||||
| print('\nsmallest sod of the set median:', sod_set_median) | |||||
| sod_gs_list[-1].append(sod_min) | |||||
| print('\nsmallest sod in graph space:', sod_min) | |||||
| # show the best graph and save it to file. | |||||
| print('one of the possible corresponding pre-images is') | |||||
| draw_Letter_graph(ghat_new_list[0], savepath='results/iam/paper_compare/') | |||||
| # compute distance between \psi and the new generated graphs. | |||||
# pass label names to match the refactored compute_kernel signature;
# 'atom'/None mirror the previous hard-coded defaults.
knew = compute_kernel(ghat_new_list + Gn_median, gkernel, 'atom', None, False)
| dhat_new_list = [] | |||||
| for idx, g_tmp in enumerate(ghat_new_list): | |||||
| # @todo: the term3 below could use the one at the beginning of the function. | |||||
| dhat_new_list.append(dis_gstar(idx, range(len(ghat_new_list), | |||||
| len(ghat_new_list) + len(Gn_median) + 1), | |||||
| alpha_range, knew, withterm3=False)) | |||||
| print('\nsmallest distance in kernel space: ', dhat_new_list[0]) | |||||
| dis_ks_min_list[-1].append(dhat_new_list[0]) | |||||
| print('\nsods of the set median for this letter:', sod_set_median_list[-1]) | |||||
| print('\nsods in graph space for this letter:', sod_gs_list[-1]) | |||||
| print('\nsmallest distances in kernel space for this letter:', | |||||
| dis_ks_min_list[-1]) | |||||
| print('\ntimes for this letter:', time_list[-1]) | |||||
| sod_set_median_list[-1] = np.mean(sod_set_median_list[-1]) | |||||
| sod_gs_list[-1] = np.mean(sod_gs_list[-1]) | |||||
| dis_ks_min_list[-1] = np.mean(dis_ks_min_list[-1]) | |||||
| time_list[-1] = np.mean(time_list[-1]) | |||||
| print('\nmean sods of the set median for each letter:', sod_set_median_list) | |||||
| print('\nmean sods in graph space for each letter:', sod_gs_list) | |||||
| print('\nmean smallest distances in kernel space for each letter:', | |||||
| dis_ks_min_list) | |||||
| print('\nmean times for each letter:', time_list) | |||||
| print('\nmean sods of the set median of all:', np.mean(sod_set_median_list)) | |||||
| print('\nmean sods in graph space of all:', np.mean(sod_gs_list)) | |||||
| print('\nmean smallest distances in kernel space of all:', | |||||
| np.mean(dis_ks_min_list)) | |||||
| print('\nmean times of all:', np.mean(time_list)) | |||||
| def test_iam_fitdistance(): | |||||
| ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt', | |||||
| 'extra_params': {}} # node/edge symb | |||||
| Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params']) | |||||
| # Gn = Gn[0:50] | |||||
| # remove_edges(Gn) | |||||
| gkernel = 'marginalizedkernel' | |||||
| node_label = 'atom' | |||||
| edge_label = 'bond_type' | |||||
# lmbda = 0.03 # termination probability
| # # parameters for GED function | |||||
| # c_vi = 0.037 | |||||
| # c_vr = 0.038 | |||||
| # c_vs = 0.075 | |||||
| # c_ei = 0.001 | |||||
| # c_er = 0.001 | |||||
| # c_es = 0.0 | |||||
| # ite_max_iam = 50 | |||||
| # epsilon_iam = 0.001 | |||||
| # removeNodes = False | |||||
| # connected_iam = False | |||||
| # # parameters for IAM function | |||||
| # ged_cost = 'CONSTANT' | |||||
| # ged_method = 'IPFP' | |||||
| # edit_cost_constant = [c_vi, c_vr, c_vs, c_ei, c_er, c_es] | |||||
| # ged_stabilizer = 'min' | |||||
| # ged_repeat = 50 | |||||
| # params_ged = {'lib': 'gedlibpy', 'cost': ged_cost, 'method': ged_method, | |||||
| # 'edit_cost_constant': edit_cost_constant, | |||||
| # 'stabilizer': ged_stabilizer, 'repeat': ged_repeat} | |||||
| # parameters for GED function | |||||
| c_vi = 4 | |||||
| c_vr = 4 | |||||
| c_vs = 2 | |||||
| c_ei = 1 | |||||
| c_er = 1 | |||||
| c_es = 1 | |||||
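# parameters for IAM function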
| ite_max_iam = 50 | |||||
| epsilon_iam = 0.001 | |||||
| removeNodes = False | |||||
| connected_iam = False | |||||
# parameters for GED function
| ged_cost = 'CHEM_1' | |||||
| ged_method = 'IPFP' | |||||
| edit_cost_constant = [] | |||||
| ged_stabilizer = 'min' | |||||
| ged_repeat = 50 | |||||
| params_ged = {'lib': 'gedlibpy', 'cost': ged_cost, 'method': ged_method, | |||||
| 'edit_cost_constant': edit_cost_constant, | |||||
| 'stabilizer': ged_stabilizer, 'repeat': ged_repeat} | |||||
| # find out all the graphs classified to positive group 1. | |||||
| idx_dict = get_same_item_indices(y_all) | |||||
| Gn = [Gn[i] for i in idx_dict[1]] | |||||
# number of graphs; we want to compute the median of these graphs.
| # nb_median_range = [2, 3, 4, 5, 10, 20, 30, 40, 50, 100] | |||||
| nb_median_range = [10] | |||||
| # # compute Gram matrix. | |||||
| # time0 = time.time() | |||||
| # km = compute_kernel(Gn, gkernel, True) | |||||
| # time_km = time.time() - time0 | |||||
| # # write Gram matrix to file. | |||||
| # np.savez('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm', gm=km, gmtime=time_km) | |||||
| time_list = [] | time_list = [] | ||||
| dis_ks_min_list = [] | dis_ks_min_list = [] | ||||
| dis_ks_gen_median_list = [] | |||||
| sod_gs_list = [] | sod_gs_list = [] | ||||
| sod_gs_min_list = [] | |||||
| nb_updated_list = [] | |||||
| nb_updated_k_list = [] | |||||
| # sod_gs_min_list = [] | |||||
| # nb_updated_list = [] | |||||
| # nb_updated_k_list = [] | |||||
| g_best = [] | g_best = [] | ||||
| for nb_median in nb_median_range: | for nb_median in nb_median_range: | ||||
| print('\n-------------------------------------------------------') | print('\n-------------------------------------------------------') | ||||
| @@ -90,72 +750,80 @@ def test_iam_median_nb(): | |||||
| # plt.clf() | # plt.clf() | ||||
| ################################################################### | ################################################################### | ||||
| gmfile = np.load('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm.npz') | |||||
| km_tmp = gmfile['gm'] | |||||
| time_km = gmfile['gmtime'] | |||||
| # modify mixed gram matrix. | |||||
| km = np.zeros((len(Gn) + nb_median, len(Gn) + nb_median)) | |||||
| for i in range(len(Gn)): | |||||
| for j in range(i, len(Gn)): | |||||
| km[i, j] = km_tmp[i, j] | |||||
| km[j, i] = km[i, j] | |||||
| for i in range(len(Gn)): | |||||
| for j, idx in enumerate(idx_rdm): | |||||
| km[i, len(Gn) + j] = km[i, idx] | |||||
| km[len(Gn) + j, i] = km[i, idx] | |||||
| for i, idx1 in enumerate(idx_rdm): | |||||
| for j, idx2 in enumerate(idx_rdm): | |||||
| km[len(Gn) + i, len(Gn) + j] = km[idx1, idx2] | |||||
| # gmfile = np.load('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm.npz') | |||||
| # km_tmp = gmfile['gm'] | |||||
| # time_km = gmfile['gmtime'] | |||||
| # # modify mixed gram matrix. | |||||
| # km = np.zeros((len(Gn) + nb_median, len(Gn) + nb_median)) | |||||
| # for i in range(len(Gn)): | |||||
| # for j in range(i, len(Gn)): | |||||
| # km[i, j] = km_tmp[i, j] | |||||
| # km[j, i] = km[i, j] | |||||
| # for i in range(len(Gn)): | |||||
| # for j, idx in enumerate(idx_rdm): | |||||
| # km[i, len(Gn) + j] = km[i, idx] | |||||
| # km[len(Gn) + j, i] = km[i, idx] | |||||
| # for i, idx1 in enumerate(idx_rdm): | |||||
| # for j, idx2 in enumerate(idx_rdm): | |||||
| # km[len(Gn) + i, len(Gn) + j] = km[idx1, idx2] | |||||
| ################################################################### | ################################################################### | ||||
| alpha_range = [1 / nb_median] * nb_median | alpha_range = [1 / nb_median] * nb_median | ||||
| time0 = time.time() | time0 = time.time() | ||||
| ghat_new_list, dis_min = iam_upgraded(Gn_median, Gn_candidate, | |||||
| c_ei=c_ei, c_er=c_er, c_es=c_es, ite_max=ite_max_iam, | |||||
| epsilon=epsilon_iam, removeNodes=removeNodes, | |||||
| connected=connected_iam, | |||||
| params_ged={'ged_cost': ged_cost, 'ged_method': ged_method, | |||||
| 'saveGXL': saveGXL}) | |||||
| G_gen_median_list, sod_gen_median, sod_list, G_set_median_list, sod_set_median \ | |||||
| = iam_upgraded(Gn_median, Gn_candidate, | |||||
| c_ei=c_ei, c_er=c_er, c_es=c_es, ite_max=ite_max_iam, | |||||
| epsilon=epsilon_iam, connected=connected_iam, removeNodes=removeNodes, | |||||
| params_ged=params_ged) | |||||
| time_total = time.time() - time0 | time_total = time.time() - time0 | ||||
| print('\ntime: ', time_total) | print('\ntime: ', time_total) | ||||
| time_list.append(time_total) | time_list.append(time_total) | ||||
| print('\nsmallest distance in kernel space: ', dhat) | |||||
| dis_ks_min_list.append(dhat) | |||||
| g_best.append(ghat_list) | |||||
| print('\nnumber of updates of the best graph: ', nb_updated) | |||||
| nb_updated_list.append(nb_updated) | |||||
| print('\nnumber of updates of k nearest graphs: ', nb_updated_k) | |||||
| nb_updated_k_list.append(nb_updated_k) | |||||
| # compute distance between \psi and the new generated graphs. | |||||
| knew = compute_kernel(G_gen_median_list + Gn_median, gkernel, node_label, | |||||
| edge_label, False) | |||||
| dhat_new_list = [] | |||||
| for idx, g_tmp in enumerate(G_gen_median_list): | |||||
| # @todo: the term3 below could use the one at the beginning of the function. | |||||
| dhat_new_list.append(dis_gstar(idx, range(len(G_gen_median_list), | |||||
| len(G_gen_median_list) + len(Gn_median) + 1), | |||||
| alpha_range, knew, withterm3=False)) | |||||
| print('\nsmallest distance in kernel space: ', dhat_new_list[0]) | |||||
| dis_ks_min_list.append(dhat_new_list[0]) | |||||
| g_best.append(G_gen_median_list[0]) | |||||
| # show the best graph and save it to file. | # show the best graph and save it to file. | ||||
| print('the shortest distance is', dhat) | |||||
| # print('the shortest distance is', dhat) | |||||
| print('one of the possible corresponding pre-images is') | print('one of the possible corresponding pre-images is') | ||||
| nx.draw(ghat_list[0], labels=nx.get_node_attributes(ghat_list[0], 'atom'), | |||||
| nx.draw(G_gen_median_list[0], labels=nx.get_node_attributes(G_gen_median_list[0], 'atom'), | |||||
| with_labels=True) | with_labels=True) | ||||
| plt.show() | plt.show() | ||||
| plt.savefig('results/preimage_iam/mutag_median_nb' + str(nb_median) + | |||||
| '.png', format="PNG") | |||||
| # plt.savefig('results/iam/mutag_median.fit_costs2.001.nb' + str(nb_median) + | |||||
| # plt.savefig('results/iam/mutag_median_unfit2.nb' + str(nb_median) + | |||||
| # '.png', format="PNG") | |||||
| plt.clf() | plt.clf() | ||||
| # print(ghat_list[0].nodes(data=True)) | # print(ghat_list[0].nodes(data=True)) | ||||
| # print(ghat_list[0].edges(data=True)) | # print(ghat_list[0].edges(data=True)) | ||||
| # compute the corresponding sod in graph space. | |||||
| sod_tmp, _ = ged_median([ghat_list[0]], Gn_median, ged_cost=ged_cost, | |||||
| ged_method=ged_method, saveGXL=saveGXL) | |||||
| sod_gs_list.append(sod_tmp) | |||||
| sod_gs_min_list.append(np.min(sod_tmp)) | |||||
| print('\nsmallest sod in graph space: ', np.min(sod_tmp)) | |||||
| sod_gs_list.append(sod_gen_median) | |||||
| # sod_gs_min_list.append(np.min(sod_gen_median)) | |||||
| print('\nsmallest sod in graph space: ', sod_gen_median) | |||||
| print('\nsmallest sod of set median in graph space: ', sod_set_median) | |||||
| print('\nsods in graph space: ', sod_gs_list) | print('\nsods in graph space: ', sod_gs_list) | ||||
| print('\nsmallest sod in graph space for each set of median graphs: ', sod_gs_min_list) | |||||
| # print('\nsmallest sod in graph space for each set of median graphs: ', sod_gs_min_list) | |||||
| print('\nsmallest distance in kernel space for each set of median graphs: ', | print('\nsmallest distance in kernel space for each set of median graphs: ', | ||||
| dis_ks_min_list) | dis_ks_min_list) | ||||
| print('\nnumber of updates of the best graph for each set of median graphs by IAM: ', | |||||
| nb_updated_list) | |||||
| print('\nnumber of updates of k nearest graphs for each set of median graphs by IAM: ', | |||||
| nb_updated_k_list) | |||||
| # print('\nnumber of updates of the best graph for each set of median graphs by IAM: ', | |||||
| # nb_updated_list) | |||||
| # print('\nnumber of updates of k nearest graphs for each set of median graphs by IAM: ', | |||||
| # nb_updated_k_list) | |||||
| print('\ntimes:', time_list) | print('\ntimes:', time_list) | ||||
| ############################################################################### | ############################################################################### | ||||
| @@ -164,4 +832,11 @@ def test_iam_median_nb(): | |||||
| if __name__ == '__main__': | if __name__ == '__main__': | ||||
| ############################################################################### | ############################################################################### | ||||
| # tests on different numbers of median-sets. | # tests on different numbers of median-sets. | ||||
| test_iam_median_nb() | |||||
| # test_iam_median_nb() | |||||
| # test_iam_letter_h() | |||||
| test_iam_monoterpenoides() | |||||
| # test_iam_mutag() | |||||
| # test_iam_fitdistance() | |||||
| # print("test log") | |||||
| @@ -192,26 +192,42 @@ def test_preimage_iam_median_nb(): | |||||
| gkernel = 'marginalizedkernel' | gkernel = 'marginalizedkernel' | ||||
lmbda = 0.03 # termination probability
| r_max = 10 # iteration limit for pre-image. | |||||
| r_max = 3 # iteration limit for pre-image. | |||||
| # alpha_range = np.linspace(0.5, 0.5, 1) | # alpha_range = np.linspace(0.5, 0.5, 1) | ||||
| k = 5 # k nearest neighbors | k = 5 # k nearest neighbors | ||||
| epsilon = 1e-6 | epsilon = 1e-6 | ||||
| InitIAMWithAllDk = True | InitIAMWithAllDk = True | ||||
| # parameters for GED function | |||||
| ged_cost='CHEM_1' | |||||
| ged_method='IPFP' | |||||
| saveGXL='gedlib' | |||||
| # parameters for IAM function | # parameters for IAM function | ||||
| c_ei=1 | |||||
| c_er=1 | |||||
| c_es=1 | |||||
| # c_vi = 0.037 | |||||
| # c_vr = 0.038 | |||||
| # c_vs = 0.075 | |||||
| # c_ei = 0.001 | |||||
| # c_er = 0.001 | |||||
| # c_es = 0.0 | |||||
| c_vi = 4 | |||||
| c_vr = 4 | |||||
| c_vs = 2 | |||||
| c_ei = 1 | |||||
| c_er = 1 | |||||
| c_es = 1 | |||||
| ite_max_iam = 50 | ite_max_iam = 50 | ||||
| epsilon_iam = 0.001 | epsilon_iam = 0.001 | ||||
| removeNodes = True | removeNodes = True | ||||
| connected_iam = False | connected_iam = False | ||||
| # parameters for GED function | |||||
| # ged_cost='CHEM_1' | |||||
| ged_cost = 'CONSTANT' | |||||
| ged_method = 'IPFP' | |||||
| edit_cost_constant = [c_vi, c_vr, c_vs, c_ei, c_er, c_es] | |||||
| ged_stabilizer = 'min' | |||||
| ged_repeat = 50 | |||||
| params_ged = {'lib': 'gedlibpy', 'cost': ged_cost, 'method': ged_method, | |||||
| 'edit_cost_constant': edit_cost_constant, | |||||
| 'stabilizer': ged_stabilizer, 'repeat': ged_repeat} | |||||
# number of graphs; we want to compute the median of these graphs.
| nb_median_range = [2, 3, 4, 5, 10, 20, 30, 40, 50, 100] | |||||
| # nb_median_range = [2, 3, 4, 5, 10, 20, 30, 40, 50, 100] | |||||
| nb_median_range = [2] | |||||
| # find out all the graphs classified to positive group 1. | # find out all the graphs classified to positive group 1. | ||||
| idx_dict = get_same_item_indices(y_all) | idx_dict = get_same_item_indices(y_all) | ||||
| @@ -274,8 +290,7 @@ def test_preimage_iam_median_nb(): | |||||
| params_iam={'c_ei': c_ei, 'c_er': c_er, 'c_es': c_es, | params_iam={'c_ei': c_ei, 'c_er': c_er, 'c_es': c_es, | ||||
| 'ite_max': ite_max_iam, 'epsilon': epsilon_iam, | 'ite_max': ite_max_iam, 'epsilon': epsilon_iam, | ||||
| 'removeNodes': removeNodes, 'connected': connected_iam}, | 'removeNodes': removeNodes, 'connected': connected_iam}, | ||||
| params_ged={'ged_cost': ged_cost, 'ged_method': ged_method, | |||||
| 'saveGXL': saveGXL}) | |||||
| params_ged=params_ged) | |||||
| time_total = time.time() - time0 + time_km | time_total = time.time() - time0 + time_km | ||||
| print('\ntime: ', time_total) | print('\ntime: ', time_total) | ||||
| @@ -293,16 +308,15 @@ def test_preimage_iam_median_nb(): | |||||
| print('one of the possible corresponding pre-images is') | print('one of the possible corresponding pre-images is') | ||||
| nx.draw(ghat_list[0], labels=nx.get_node_attributes(ghat_list[0], 'atom'), | nx.draw(ghat_list[0], labels=nx.get_node_attributes(ghat_list[0], 'atom'), | ||||
| with_labels=True) | with_labels=True) | ||||
| # plt.show() | |||||
| plt.savefig('results/preimage_iam/mutag_median_nb' + str(nb_median) + | |||||
| '.png', format="PNG") | |||||
| plt.show() | |||||
| # plt.savefig('results/preimage_iam/mutag_median_cs.001_nb' + str(nb_median) + | |||||
| # '.png', format="PNG") | |||||
| plt.clf() | plt.clf() | ||||
| # print(ghat_list[0].nodes(data=True)) | # print(ghat_list[0].nodes(data=True)) | ||||
| # print(ghat_list[0].edges(data=True)) | # print(ghat_list[0].edges(data=True)) | ||||
| # compute the corresponding sod in graph space. | # compute the corresponding sod in graph space. | ||||
| sod_tmp, _ = ged_median([ghat_list[0]], Gn_median, ged_cost=ged_cost, | |||||
| ged_method=ged_method, saveGXL=saveGXL) | |||||
| sod_tmp, _ = ged_median([ghat_list[0]], Gn_median, params_ged=params_ged) | |||||
| sod_gs_list.append(sod_tmp) | sod_gs_list.append(sod_tmp) | ||||
| sod_gs_min_list.append(np.min(sod_tmp)) | sod_gs_min_list.append(np.min(sod_tmp)) | ||||
| print('\nsmallest sod in graph space: ', np.min(sod_tmp)) | print('\nsmallest sod in graph space: ', np.min(sod_tmp)) | ||||
| @@ -39,13 +39,13 @@ def dis_gstar(idx_g, idx_gi, alpha, Kmatrix, term3=0, withterm3=True): | |||||
| return np.sqrt(term1 - term2 + term3) | return np.sqrt(term1 - term2 + term3) | ||||
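# For \psi = sum_i alpha_i * phi(g_i), the squared distance to a graph g is
#   d(g, \psi)^2 = k(g, g) - 2 * sum_i alpha_i * k(g, g_i)
#                + sum_{i,j} alpha_i * alpha_j * k(g_i, g_j),
# to which term1, term2 and term3 above presumably correspond.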
| def compute_kernel(Gn, graph_kernel, verbose): | |||||
| def compute_kernel(Gn, graph_kernel, node_label, edge_label, verbose): | |||||
| if graph_kernel == 'marginalizedkernel': | if graph_kernel == 'marginalizedkernel': | ||||
| Kmatrix, _ = marginalizedkernel(Gn, node_label='atom', edge_label=None, | |||||
| Kmatrix, _ = marginalizedkernel(Gn, node_label=node_label, edge_label=edge_label, | |||||
| p_quit=0.03, n_iteration=10, remove_totters=False, | p_quit=0.03, n_iteration=10, remove_totters=False, | ||||
| n_jobs=multiprocessing.cpu_count(), verbose=verbose) | n_jobs=multiprocessing.cpu_count(), verbose=verbose) | ||||
| elif graph_kernel == 'untilhpathkernel': | elif graph_kernel == 'untilhpathkernel': | ||||
| Kmatrix, _ = untilhpathkernel(Gn, node_label='atom', edge_label=None, | |||||
| Kmatrix, _ = untilhpathkernel(Gn, node_label=node_label, edge_label=edge_label, | |||||
| depth=10, k_func='MinMax', compute_method='trie', | depth=10, k_func='MinMax', compute_method='trie', | ||||
| n_jobs=multiprocessing.cpu_count(), verbose=verbose) | n_jobs=multiprocessing.cpu_count(), verbose=verbose) | ||||
| elif graph_kernel == 'spkernel': | elif graph_kernel == 'spkernel': | ||||
| @@ -77,10 +77,10 @@ def gram2distances(Kmatrix): | |||||
| return dmatrix | return dmatrix | ||||
| def kernel_distance_matrix(Gn, Kmatrix=None, gkernel=None): | |||||
| def kernel_distance_matrix(Gn, node_label, edge_label, Kmatrix=None, gkernel=None): | |||||
| dis_mat = np.empty((len(Gn), len(Gn))) | dis_mat = np.empty((len(Gn), len(Gn))) | ||||
| if Kmatrix == None: | if Kmatrix == None: | ||||
| Kmatrix = compute_kernel(Gn, gkernel, True) | |||||
| Kmatrix = compute_kernel(Gn, gkernel, node_label, edge_label, True) | |||||
| for i in range(len(Gn)): | for i in range(len(Gn)): | ||||
| for j in range(i, len(Gn)): | for j in range(i, len(Gn)): | ||||
| dis = Kmatrix[i, i] + Kmatrix[j, j] - 2 * Kmatrix[i, j] | dis = Kmatrix[i, i] + Kmatrix[j, j] - 2 * Kmatrix[i, j] | ||||
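# squared distance in the kernel's feature space:
# ||phi(g_i) - phi(g_j)||^2 = k(g_i, g_i) + k(g_j, g_j) - 2 * k(g_i, g_j)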
| @@ -1,9 +1,9 @@ | |||||
| """ Utilities function to manage graph files | """ Utilities function to manage graph files | ||||
| """ | """ | ||||
| from os.path import dirname, splitext | |||||
| def loadCT(filename): | def loadCT(filename): | ||||
| """load data from .ct file. | |||||
| """load data from a Chemical Table (.ct) file. | |||||
| Notes | Notes | ||||
| ------ | ------ | ||||
| @@ -13,8 +13,11 @@ def loadCT(filename): | |||||
| 0.0000 0.0000 0.0000 C <- each line describes a node (x,y,z + label) | 0.0000 0.0000 0.0000 C <- each line describes a node (x,y,z + label) | ||||
| 0.0000 0.0000 0.0000 C | 0.0000 0.0000 0.0000 C | ||||
| 0.0000 0.0000 0.0000 O | 0.0000 0.0000 0.0000 O | ||||
| 1 3 1 1 <- each line describes an edge : to, from,?, label | |||||
| 1 3 1 1 <- each line describes an edge : to, from, bond type, bond stereo | |||||
| 2 3 1 1 | 2 3 1 1 | ||||
See https://www.daylight.com/meetings/mug05/Kappler/ctfile.pdf
for a detailed format description.
| """ | """ | ||||
| import networkx as nx | import networkx as nx | ||||
| from os.path import basename | from os.path import basename | ||||
| @@ -35,22 +38,15 @@ def loadCT(filename): | |||||
| for i in range(0, nb_nodes): | for i in range(0, nb_nodes): | ||||
| tmp = content[i + 2].split(" ") | tmp = content[i + 2].split(" ") | ||||
| tmp = [x for x in tmp if x != ''] | tmp = [x for x in tmp if x != ''] | ||||
| g.add_node(i, atom=tmp[3], label=tmp[3]) | |||||
| g.add_node(i, atom=tmp[3].strip(), | |||||
| label=[item.strip() for item in tmp[3:]], | |||||
| attributes=[item.strip() for item in tmp[0:3]]) | |||||
| for i in range(0, nb_edges): | for i in range(0, nb_edges): | ||||
| tmp = content[i + g.number_of_nodes() + 2].split(" ") | tmp = content[i + g.number_of_nodes() + 2].split(" ") | ||||
| tmp = [x for x in tmp if x != ''] | tmp = [x for x in tmp if x != ''] | ||||
| g.add_edge( | |||||
| int(tmp[0]) - 1, | |||||
| int(tmp[1]) - 1, | |||||
| bond_type=tmp[3].strip(), | |||||
| label=tmp[3].strip()) | |||||
| # for i in range(0, nb_edges): | |||||
| # tmp = content[i + g.number_of_nodes() + 2] | |||||
| # tmp = [tmp[i:i+3] for i in range(0, len(tmp), 3)] | |||||
| # g.add_edge(int(tmp[0]) - 1, int(tmp[1]) - 1, | |||||
| # bond_type=tmp[3].strip(), label=tmp[3].strip()) | |||||
| g.add_edge(int(tmp[0]) - 1, int(tmp[1]) - 1, | |||||
| bond_type=tmp[2].strip(), | |||||
| label=[item.strip() for item in tmp[2:]]) | |||||
| return g | return g | ||||
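# A minimal usage sketch (the path is illustrative, not a file from this repo):
#   g = loadCT('../datasets/Alkane/dataset/molecule1.ct')
#   print(g.nodes(data=True))  # each node has 'atom', 'label' and xyz 'attributes'
#   print(g.edges(data=True))  # each edge has 'bond_type' and 'label'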
| @@ -71,6 +67,7 @@ def loadGXL(filename): | |||||
| labels[attr.attrib['name']] = attr[0].text | labels[attr.attrib['name']] = attr[0].text | ||||
| if 'chem' in labels: | if 'chem' in labels: | ||||
| labels['label'] = labels['chem'] | labels['label'] = labels['chem'] | ||||
| labels['atom'] = labels['chem'] | |||||
| g.add_node(index, **labels) | g.add_node(index, **labels) | ||||
| index += 1 | index += 1 | ||||
| @@ -80,6 +77,7 @@ def loadGXL(filename): | |||||
| labels[attr.attrib['name']] = attr[0].text | labels[attr.attrib['name']] = attr[0].text | ||||
| if 'valence' in labels: | if 'valence' in labels: | ||||
| labels['label'] = labels['valence'] | labels['label'] = labels['valence'] | ||||
| labels['bond_type'] = labels['valence'] | |||||
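# aliasing 'chem' -> 'atom' and 'valence' -> 'bond_type' presumably lets GXL
# graphs expose the same node/edge label names as the other loaders expect.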
| g.add_edge(dic[edge.attrib['from']], dic[edge.attrib['to']], **labels) | g.add_edge(dic[edge.attrib['from']], dic[edge.attrib['to']], **labels) | ||||
| return g | return g | ||||
| @@ -392,7 +390,7 @@ def loadDataset(filename, filename_y=None, extra_params=None): | |||||
| Notes | Notes | ||||
| ----- | ----- | ||||
This function supports the following graph dataset formats:
| 'ds': load data from .ct file. See comments of function loadCT for a example. | |||||
'ds': load data from a .ds file. See comments of function loadFromDS for an example.
| 'cxl': load data from Graph eXchange Language file (.cxl file). See | 'cxl': load data from Graph eXchange Language file (.cxl file). See | ||||
| http://www.gupro.de/GXL/Introduction/background.html, 2019 for detail. | http://www.gupro.de/GXL/Introduction/background.html, 2019 for detail. | ||||
| 'sdf': load data from structured data file (.sdf file). See | 'sdf': load data from structured data file (.sdf file). See | ||||
| @@ -406,45 +404,24 @@ def loadDataset(filename, filename_y=None, extra_params=None): | |||||
2019 for details. Note that here filename is the name of any .txt file in
the dataset directory.
| """ | """ | ||||
| from os.path import dirname, splitext | |||||
| dirname_dataset = dirname(filename) | |||||
| extension = splitext(filename)[1][1:] | extension = splitext(filename)[1][1:] | ||||
| data = [] | |||||
| y = [] | |||||
| if extension == "ds": | if extension == "ds": | ||||
| content = open(filename).read().splitlines() | |||||
| if filename_y is None or filename_y == '': | |||||
| for i in range(0, len(content)): | |||||
| tmp = content[i].split(' ') | |||||
| # remove the '#'s in file names | |||||
| data.append( | |||||
| loadCT(dirname_dataset + '/' + tmp[0].replace('#', '', 1))) | |||||
| y.append(float(tmp[1])) | |||||
| else: # y in a seperate file | |||||
| for i in range(0, len(content)): | |||||
| tmp = content[i] | |||||
| # remove the '#'s in file names | |||||
| data.append( | |||||
| loadCT(dirname_dataset + '/' + tmp.replace('#', '', 1))) | |||||
| content_y = open(filename_y).read().splitlines() | |||||
| # assume entries in filename and filename_y have the same order. | |||||
| for item in content_y: | |||||
| tmp = item.split(' ') | |||||
| # assume the 3rd entry in a line is y (for Alkane dataset) | |||||
| y.append(float(tmp[2])) | |||||
| data, y = loadFromDS(filename, filename_y) | |||||
| elif extension == "cxl": | elif extension == "cxl": | ||||
| import xml.etree.ElementTree as ET | import xml.etree.ElementTree as ET | ||||
| dirname_dataset = dirname(filename) | |||||
| tree = ET.parse(filename) | tree = ET.parse(filename) | ||||
| root = tree.getroot() | root = tree.getroot() | ||||
| data = [] | data = [] | ||||
| y = [] | y = [] | ||||
| for graph in root.iter('print'): | |||||
| for graph in root.iter('graph'): | |||||
| mol_filename = graph.attrib['file'] | mol_filename = graph.attrib['file'] | ||||
| mol_class = graph.attrib['class'] | mol_class = graph.attrib['class'] | ||||
| data.append(loadGXL(dirname_dataset + '/' + mol_filename)) | data.append(loadGXL(dirname_dataset + '/' + mol_filename)) | ||||
| y.append(mol_class) | y.append(mol_class) | ||||
| elif extension == 'xml': | |||||
| data, y = loadFromXML(filename, extra_params) | |||||
| elif extension == "sdf": | elif extension == "sdf": | ||||
| import numpy as np | import numpy as np | ||||
| from tqdm import tqdm | from tqdm import tqdm | ||||
| @@ -471,6 +448,7 @@ def loadDataset(filename, filename_y=None, extra_params=None): | |||||
| elif extension == "mat": | elif extension == "mat": | ||||
| data, y = loadMAT(filename, extra_params) | data, y = loadMAT(filename, extra_params) | ||||
| elif extension == 'txt': | elif extension == 'txt': | ||||
| dirname_dataset = dirname(filename) | |||||
| data, y = loadTXT(dirname_dataset) | data, y = loadTXT(dirname_dataset) | ||||
| # print(len(y)) | # print(len(y)) | ||||
| # print(y) | # print(y) | ||||
| @@ -485,6 +463,75 @@ def loadDataset(filename, filename_y=None, extra_params=None): | |||||
| return data, y | return data, y | ||||
| def loadFromXML(filename, extra_params): | |||||
| import xml.etree.ElementTree as ET | |||||
| dirname_dataset = dirname(filename) | |||||
| tree = ET.parse(filename) | |||||
| root = tree.getroot() | |||||
| data = [] | |||||
| y = [] | |||||
| for graph in root.iter('print'): | |||||
| mol_filename = graph.attrib['file'] | |||||
| mol_class = graph.attrib['class'] | |||||
| data.append(loadGXL(dirname_dataset + '/' + mol_filename)) | |||||
| y.append(mol_class) | |||||
| return data, y | |||||
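# The XML index is expected to contain entries like
#   <print file="molecule1.gxl" class="1"/>
# (hence root.iter('print') above); file paths are resolved relative to the
# directory of `filename`.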
| def loadFromDS(filename, filename_y): | |||||
| """Load data from .ds file. | |||||
| Possible graph formats include: | |||||
'.ct': see function loadCT for details.
'.gxl': see function loadGXL for details.
Note that the graph format is detected automatically from the extensions of
the graph files.
| """ | |||||
| dirname_dataset = dirname(filename) | |||||
| data = [] | |||||
| y = [] | |||||
| content = open(filename).read().splitlines() | |||||
| extension = splitext(content[0].split(' ')[0])[1][1:] | |||||
| if filename_y is None or filename_y == '': | |||||
| if extension == 'ct': | |||||
| for i in range(0, len(content)): | |||||
| tmp = content[i].split(' ') | |||||
| # remove the '#'s in file names | |||||
| data.append( | |||||
| loadCT(dirname_dataset + '/' + tmp[0].replace('#', '', 1))) | |||||
| y.append(float(tmp[1])) | |||||
| elif extension == 'gxl': | |||||
| for i in range(0, len(content)): | |||||
| tmp = content[i].split(' ') | |||||
| # remove the '#'s in file names | |||||
| data.append( | |||||
| loadGXL(dirname_dataset + '/' + tmp[0].replace('#', '', 1))) | |||||
| y.append(float(tmp[1])) | |||||
else: # y is given in a separate file
| if extension == 'ct': | |||||
| for i in range(0, len(content)): | |||||
| tmp = content[i] | |||||
| # remove the '#'s in file names | |||||
| data.append( | |||||
| loadCT(dirname_dataset + '/' + tmp.replace('#', '', 1))) | |||||
| elif extension == 'gxl': | |||||
| for i in range(0, len(content)): | |||||
| tmp = content[i] | |||||
| # remove the '#'s in file names | |||||
| data.append( | |||||
| loadGXL(dirname_dataset + '/' + tmp.replace('#', '', 1))) | |||||
| content_y = open(filename_y).read().splitlines() | |||||
| # assume entries in filename and filename_y have the same order. | |||||
| for item in content_y: | |||||
| tmp = item.split(' ') | |||||
| # assume the 3rd entry in a line is y (for Alkane dataset) | |||||
| y.append(float(tmp[2])) | |||||
| return data, y | |||||
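# Usage sketch: a '.ds' file lists one graph file per line, optionally followed
# by its target value, e.g.
#   molecule1.ct 1.5
#   molecule2.ct 2.3
# It can then be loaded with (illustrative path):
#   Gn, y = loadFromDS('../../datasets/MAO/dataset.ds', filename_y=None)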
| def saveDataset(Gn, y, gformat='gxl', group=None, filename='gfile', xparams=None): | def saveDataset(Gn, y, gformat='gxl', group=None, filename='gfile', xparams=None): | ||||
| """Save list of graphs. | """Save list of graphs. | ||||
| """ | """ | ||||
| @@ -509,7 +556,30 @@ def saveDataset(Gn, y, gformat='gxl', group=None, filename='gfile', xparams=None | |||||
| if __name__ == '__main__': | if __name__ == '__main__': | ||||
| ds = {'name': 'MUTAG', 'dataset': '../../datasets/MUTAG/MUTAG.mat', | |||||
| 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}} # node/edge symb | |||||
| Gn, y = loadDataset(ds['dataset'], extra_params=ds['extra_params']) | |||||
| saveDataset(Gn, y, group='xml', filename='temp/temp') | |||||
| # ### Load dataset from .ds file. | |||||
| # # .ct files. | |||||
| ds = {'name': 'Alkane', 'dataset': '../../datasets/Alkane/dataset.ds', | |||||
| 'dataset_y': '../../datasets/Alkane/dataset_boiling_point_names.txt'} | |||||
| Gn, y = loadDataset(ds['dataset'], filename_y=ds['dataset_y']) | |||||
| # ds = {'name': 'Acyclic', 'dataset': '../../datasets/acyclic/dataset_bps.ds'} # node symb | |||||
| # Gn, y = loadDataset(ds['dataset']) | |||||
| # ds = {'name': 'MAO', 'dataset': '../../datasets/MAO/dataset.ds'} # node/edge symb | |||||
| # Gn, y = loadDataset(ds['dataset']) | |||||
| # ds = {'name': 'PAH', 'dataset': '../../datasets/PAH/dataset.ds'} # unlabeled | |||||
| # Gn, y = loadDataset(ds['dataset']) | |||||
| print(Gn[1].nodes(data=True)) | |||||
| print(Gn[1].edges(data=True)) | |||||
| print(y[1]) | |||||
| # # .gxl file. | |||||
| # ds = {'name': 'monoterpenoides', | |||||
| # 'dataset': '../../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb | |||||
| # Gn, y = loadDataset(ds['dataset']) | |||||
| # print(Gn[1].nodes(data=True)) | |||||
| # print(Gn[1].edges(data=True)) | |||||
| # print(y[1]) | |||||
| # ds = {'name': 'MUTAG', 'dataset': '../../datasets/MUTAG/MUTAG.mat', | |||||
| # 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}} # node/edge symb | |||||
| # Gn, y = loadDataset(ds['dataset'], extra_params=ds['extra_params']) | |||||
| # saveDataset(Gn, y, group='xml', filename='temp/temp') | |||||