| @@ -28,6 +28,7 @@ dslist = [ | |||||
| # {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1_A.txt'}, # node symb | # {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1_A.txt'}, # node symb | ||||
| # {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109_A.txt'}, # node symb | # {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109_A.txt'}, # node symb | ||||
| # {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb | # {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb | ||||
| # {'name': 'monoterpenoides', 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'}, # node/edge symb | |||||
| # | # | ||||
| # {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'}, | # {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'}, | ||||
| # # node/edge symb | # # node/edge symb | ||||
| @@ -57,7 +58,7 @@ estimator = marginalizedkernel | |||||
| #param_grid_precomputed = {'p_quit': np.linspace(0.1, 0.3, 3), | #param_grid_precomputed = {'p_quit': np.linspace(0.1, 0.3, 3), | ||||
| # 'n_iteration': np.linspace(1, 1, 1), | # 'n_iteration': np.linspace(1, 1, 1), | ||||
| param_grid_precomputed = {'p_quit': np.linspace(0.1, 0.9, 9), | param_grid_precomputed = {'p_quit': np.linspace(0.1, 0.9, 9), | ||||
| 'n_iteration': np.linspace(5, 20, 4), | |||||
| 'n_iteration': np.linspace(1, 19, 7), | |||||
| 'remove_totters': [False]} | 'remove_totters': [False]} | ||||
| param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)}, | param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)}, | ||||
| {'alpha': np.logspace(-10, 10, num=41, base=10)}] | {'alpha': np.logspace(-10, 10, num=41, base=10)}] | ||||
| @@ -24,6 +24,9 @@ dslist = [ | |||||
| # {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1_A.txt'}, # node symb | # {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1_A.txt'}, # node symb | ||||
| # {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109_A.txt'}, # node symb | # {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109_A.txt'}, # node symb | ||||
| # {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb | # {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb | ||||
| # {'name': 'monoterpenoides', 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'}, # node/edge | |||||
| # {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt'}, | |||||
| # # node nsymb symb | |||||
| # | # | ||||
| # {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'}, | # {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'}, | ||||
| # # node/edge symb | # # node/edge symb | ||||
| @@ -30,6 +30,8 @@ dslist = [ | |||||
| # {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'}, | # {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'}, | ||||
| # # node symb/nsymb | # # node symb/nsymb | ||||
| # {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb | # {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb | ||||
| # {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt'}, | |||||
| # # node nsymb symb | |||||
| # | # | ||||
| # {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'}, | # {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'}, | ||||
| # # node/edge symb | # # node/edge symb | ||||
| @@ -26,6 +26,7 @@ dslist = [ | |||||
| {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109_A.txt'}, # node symb | {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109_A.txt'}, # node symb | ||||
| {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb | {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb | ||||
| {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'}, | {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'}, | ||||
| # {'name': 'monoterpenoides', 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'}, # node/edge symb | |||||
| # node symb/nsymb | # node symb/nsymb | ||||
| # {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb | # {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb | ||||
| # {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'}, | # {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'}, | ||||
| @@ -27,7 +27,8 @@ dslist = [ | |||||
| {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb | {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb | ||||
| {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1_A.txt'}, # node symb | {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1_A.txt'}, # node symb | ||||
| {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109_A.txt'}, # node symb | {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109_A.txt'}, # node symb | ||||
| {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb | |||||
| {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb | |||||
| # {'name': 'monoterpenoides', 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'}, # node/edge symb | |||||
| # | # | ||||
| # {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'}, | # {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'}, | ||||
| # # node/edge symb | # # node/edge symb | ||||
| @@ -54,11 +55,11 @@ dslist = [ | |||||
| # {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',}, | # {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',}, | ||||
| ] | ] | ||||
| estimator = untilhpathkernel | estimator = untilhpathkernel | ||||
| param_grid_precomputed = {'depth': np.linspace(3, 10, 8), # [2], | |||||
| 'k_func': [None]} # ['MinMax', 'tanimoto'], | |||||
| #param_grid_precomputed = {'depth': np.linspace(1, 10, 10), # [2], | |||||
| # 'k_func': ['MinMax'], # ['MinMax', 'tanimoto'], | |||||
| # 'compute_method': ['trie']} # ['MinMax']} | |||||
| #param_grid_precomputed = {'depth': np.linspace(3, 10, 8), # [2], | |||||
| # 'k_func': [None]} # ['MinMax', 'tanimoto'], | |||||
| param_grid_precomputed = {'depth': np.linspace(1, 10, 10), # [2], | |||||
| 'k_func': ['MinMax', 'tanimoto'], # ['MinMax'], # | |||||
| 'compute_method': ['trie']} # ['MinMax']} | |||||
| param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)}, | param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)}, | ||||
| {'alpha': np.logspace(-10, 10, num=41, base=10)}] | {'alpha': np.logspace(-10, 10, num=41, base=10)}] | ||||
| @@ -30,6 +30,8 @@ dslist = [ | |||||
| {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1_A.txt'}, # node symb | {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1_A.txt'}, # node symb | ||||
| {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109_A.txt'}, # node symb | {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109_A.txt'}, # node symb | ||||
| {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb | {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb | ||||
| # {'name': 'monoterpenoides', 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'}, # node/edge symb | |||||
| # | # | ||||
| # {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'}, | # {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'}, | ||||
| # # node/edge symb | # # node/edge symb | ||||
| @@ -7,7 +7,7 @@ Created on Wed Oct 16 14:20:06 2019 | |||||
| """ | """ | ||||
| import numpy as np | import numpy as np | ||||
| from tqdm import tqdm | from tqdm import tqdm | ||||
| from itertools import combinations_with_replacement | |||||
| from itertools import combinations_with_replacement, combinations | |||||
| import multiprocessing | import multiprocessing | ||||
| from multiprocessing import Pool | from multiprocessing import Pool | ||||
| from functools import partial | from functools import partial | ||||
| @@ -22,110 +22,88 @@ import sys | |||||
| from ged import GED, get_nb_edit_operations | from ged import GED, get_nb_edit_operations | ||||
| from utils import kernel_distance_matrix | from utils import kernel_distance_matrix | ||||
def fit_GED_to_kernel_distance(Gn, node_label, edge_label, gkernel, itr_max, k=4,
                               params_ged={'lib': 'gedlibpy', 'cost': 'CONSTANT',
                                           'method': 'IPFP', 'stabilizer': None},
                               init_costs=[3, 3, 1, 3, 3, 1],
                               parallel=True):
    """Tune constant edit costs so that GEDs fit the kernel-induced distances.

    Starting from ``init_costs``, the function alternates between
    (1) computing pairwise GEDs and per-pair edit-operation counts with the
    current costs and (2) re-estimating the costs from those counts with
    ``update_costs``, for ``itr_max`` rounds.

    Parameters
    ----------
    Gn : list
        Graphs; every unordered pair ``i < j`` is used.
    node_label, edge_label, gkernel
        Forwarded unchanged to ``kernel_distance_matrix``.
    itr_max : int
        Number of cost-update rounds. With 0 the initial costs are returned.
    k : int
        Currently unused; kept for interface compatibility.
    params_ged : dict
        Keyword arguments for ``GED`` (through ``compute_geds``). A private
        copy is made, so neither the default nor the caller's dict is
        modified.
    init_costs : sequence
        Initial edit costs; presumably ordered c_vi, c_vr, c_vs, c_ei, c_er,
        c_es (or a subset) -- TODO confirm against ``update_costs``.
    parallel : bool
        Forwarded to ``compute_geds``.

    Returns
    -------
    tuple
        ``(edit_costs_new, residual_list, edit_cost_list, dis_k_mat, ged_mat,
        time_list, nb_cost_mat_list)``; the ``*_list`` values hold one entry
        for the initial state plus one per iteration.

    Raises
    ------
    ValueError
        If an updated edit cost is below ``-1e-9``.
    """
    # Work on private copies: the defaults are shared mutable objects, and
    # 'edit_cost_constant' is written into params_ged below.
    params_ged = dict(params_ged)
    init_costs = list(init_costs)

    def _residual(ged_values):
        # Euclidean norm of the difference between GEDs and kernel distances.
        return np.sqrt(np.sum(np.square(np.array(ged_values) - dis_k_vec)))

    # Distances in feature space, flattened over the strict upper triangle
    # (i < j, row-major) to match the pair order produced by compute_geds.
    dis_k_mat, _, _, _ = kernel_distance_matrix(Gn, node_label, edge_label,
                                                gkernel=gkernel)
    n_graphs = len(dis_k_mat)
    dis_k_vec = np.array([dis_k_mat[i, j]
                          for i in range(n_graphs)
                          for j in range(i + 1, n_graphs)])

    # Initial GEDs with the starting costs.
    print('\ninitial:')
    time0 = time.time()
    params_ged['edit_cost_constant'] = init_costs
    ged_vec_init, ged_mat, n_edit_operations = compute_geds(Gn, params_ged,
                                                            parallel=parallel)
    residual_list = [_residual(ged_vec_init)]
    time_list = [time.time() - time0]
    edit_cost_list = [init_costs]
    nb_cost_mat = np.array(n_edit_operations)
    nb_cost_mat_list = [nb_cost_mat]
    print('edit_costs:', init_costs)
    print('residual_list:', residual_list)

    # Defined up front so that itr_max == 0 returns the initial costs instead
    # of failing on an unbound name.
    edit_costs_new = init_costs

    for itr in range(itr_max):
        print('\niteration', itr)
        time0 = time.time()

        # "Fit" GEDs to the feature-space distances by re-estimating the edit
        # costs from the current edit-operation counts.
        edit_costs_new, _ = update_costs(nb_cost_mat, dis_k_vec)
        for idx, cost in enumerate(edit_costs_new):
            if cost < 0:
                # Tolerate tiny negative values from the solver; anything
                # clearly negative is an error.
                if cost > -1e-9:
                    edit_costs_new[idx] = 0
                else:
                    raise ValueError('The edit cost is negative.')

        # Recompute GEDs and edit-operation counts with the new costs.
        params_ged['edit_cost_constant'] = edit_costs_new
        ged_vec, ged_mat, n_edit_operations = compute_geds(Gn, params_ged,
                                                           parallel=parallel)
        residual_list.append(_residual(ged_vec))
        time_list.append(time.time() - time0)
        edit_cost_list.append(edit_costs_new)
        nb_cost_mat = np.array(n_edit_operations)
        nb_cost_mat_list.append(nb_cost_mat)
        print('edit_costs:', edit_costs_new)
        print('residual_list:', residual_list)

    return edit_costs_new, residual_list, edit_cost_list, dis_k_mat, ged_mat, \
        time_list, nb_cost_mat_list
| def compute_geds(Gn, edit_cost_constant, idx_nonzeros, parallel=False): | |||||
| def compute_geds(Gn, params_ged, parallel=False): | |||||
| ged_mat = np.zeros((len(Gn), len(Gn))) | ged_mat = np.zeros((len(Gn), len(Gn))) | ||||
| if parallel: | if parallel: | ||||
| # print('parallel') | # print('parallel') | ||||
| len_itr = int(len(Gn) * (len(Gn) + 1) / 2) | |||||
| ged_all = [0 for i in range(len_itr)] | |||||
| n_edit_operations = [[0 for i in range(len_itr)] for j in | |||||
| range(len(idx_nonzeros))] | |||||
| itr = combinations_with_replacement(range(0, len(Gn)), 2) | |||||
| # len_itr = int(len(Gn) * (len(Gn) + 1) / 2) | |||||
| len_itr = int(len(Gn) * (len(Gn) - 1) / 2) | |||||
| ged_vec = [0 for i in range(len_itr)] | |||||
| n_edit_operations = [0 for i in range(len_itr)] | |||||
| # itr = combinations_with_replacement(range(0, len(Gn)), 2) | |||||
| itr = combinations(range(0, len(Gn)), 2) | |||||
| n_jobs = multiprocessing.cpu_count() | n_jobs = multiprocessing.cpu_count() | ||||
| if len_itr < 100 * n_jobs: | if len_itr < 100 * n_jobs: | ||||
| chunksize = int(len_itr / n_jobs) + 1 | chunksize = int(len_itr / n_jobs) + 1 | ||||
| @@ -134,68 +112,52 @@ def compute_geds(Gn, edit_cost_constant, idx_nonzeros, parallel=False): | |||||
| def init_worker(gn_toshare): | def init_worker(gn_toshare): | ||||
| global G_gn | global G_gn | ||||
| G_gn = gn_toshare | G_gn = gn_toshare | ||||
| do_partial = partial(_wrapper_compute_ged_parallel, edit_cost_constant, | |||||
| idx_nonzeros) | |||||
| do_partial = partial(_wrapper_compute_ged_parallel, params_ged) | |||||
| pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(Gn,)) | pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(Gn,)) | ||||
| iterator = tqdm(pool.imap_unordered(do_partial, itr, chunksize), | iterator = tqdm(pool.imap_unordered(do_partial, itr, chunksize), | ||||
| desc='computing GEDs', file=sys.stdout) | desc='computing GEDs', file=sys.stdout) | ||||
| # iterator = pool.imap_unordered(do_partial, itr, chunksize) | # iterator = pool.imap_unordered(do_partial, itr, chunksize) | ||||
| for i, j, dis, n_eo_tmp in iterator: | for i, j, dis, n_eo_tmp in iterator: | ||||
| idx_itr = int(len(Gn) * i + j - i * (i + 1) / 2) | |||||
| ged_all[idx_itr] = dis | |||||
| idx_itr = int(len(Gn) * i + j - (i + 1) * (i + 2) / 2) | |||||
| ged_vec[idx_itr] = dis | |||||
| ged_mat[i][j] = dis | ged_mat[i][j] = dis | ||||
| ged_mat[j][i] = dis | ged_mat[j][i] = dis | ||||
| for idx, item in enumerate(idx_nonzeros): | |||||
| n_edit_operations[idx][idx_itr] = n_eo_tmp[item] | |||||
| n_edit_operations[idx_itr] = n_eo_tmp | |||||
| # print('\n-------------------------------------------') | # print('\n-------------------------------------------') | ||||
| # print(i, j, idx_itr, dis) | # print(i, j, idx_itr, dis) | ||||
| pool.close() | pool.close() | ||||
| pool.join() | pool.join() | ||||
| else: | else: | ||||
| ged_all = [] | |||||
| n_edit_operations = [[] for i in range(len(idx_nonzeros))] | |||||
| ged_vec = [] | |||||
| n_edit_operations = [] | |||||
| for i in tqdm(range(len(Gn)), desc='computing GEDs', file=sys.stdout): | for i in tqdm(range(len(Gn)), desc='computing GEDs', file=sys.stdout): | ||||
| # for i in range(len(Gn)): | # for i in range(len(Gn)): | ||||
| for j in range(i, len(Gn)): | |||||
| # time0 = time.time() | |||||
| dis, pi_forward, pi_backward = GED(Gn[i], Gn[j], lib='gedlibpy', | |||||
| cost='CONSTANT', method='IPFP', | |||||
| edit_cost_constant=edit_cost_constant, stabilizer='min', | |||||
| repeat=50) | |||||
| # time1 = time.time() - time0 | |||||
| # time0 = time.time() | |||||
| ged_all.append(dis) | |||||
| for j in range(i + 1, len(Gn)): | |||||
| dis, pi_forward, pi_backward = GED(Gn[i], Gn[j], **params_ged) | |||||
| ged_vec.append(dis) | |||||
| ged_mat[i][j] = dis | ged_mat[i][j] = dis | ||||
| ged_mat[j][i] = dis | ged_mat[j][i] = dis | ||||
| n_eo_tmp = get_nb_edit_operations(Gn[i], Gn[j], pi_forward, pi_backward) | n_eo_tmp = get_nb_edit_operations(Gn[i], Gn[j], pi_forward, pi_backward) | ||||
| for idx, item in enumerate(idx_nonzeros): | |||||
| n_edit_operations[idx].append(n_eo_tmp[item]) | |||||
| # time2 = time.time() - time0 | |||||
| # print(time1, time2, time1 / time2) | |||||
| n_edit_operations.append(n_eo_tmp) | |||||
| return ged_all, ged_mat, n_edit_operations | |||||
| return ged_vec, ged_mat, n_edit_operations | |||||
def _wrapper_compute_ged_parallel(params_ged, itr):
    """Pool worker: compute the GED for one index pair of the shared graphs.

    ``params_ged`` is bound with ``functools.partial`` in ``compute_geds``;
    ``itr`` is one ``(i, j)`` pair from the pair iterator. The graphs are read
    from the module-level global ``G_gn``, which the pool initializer in
    ``compute_geds`` sets in each worker process.

    Returns ``(i, j, dis, n_eo_tmp)`` so the caller can place the result even
    though ``imap_unordered`` yields results in arbitrary order.
    """
    i, j = itr[0], itr[1]
    dis, n_eo_tmp = _compute_ged_parallel(G_gn[i], G_gn[j], params_ged)
    return i, j, dis, n_eo_tmp
def _compute_ged_parallel(g1, g2, params_ged):
    """Compute the GED between two graphs and the edit operations it implies.

    ``params_ged`` is expanded as keyword arguments to ``GED``. The forward
    and backward node maps returned by ``GED`` are passed to
    ``get_nb_edit_operations``; its result is returned as-is (presumably the
    count of each edit-operation type -- confirm in ``ged``).

    Returns ``(dis, n_eo_tmp)``.
    """
    dis, pi_forward, pi_backward = GED(g1, g2, **params_ged)
    n_eo_tmp = get_nb_edit_operations(g1, g2, pi_forward, pi_backward)
    return dis, n_eo_tmp
| def compute_better_costs(nb_cost_mat, dis_k_vec): | |||||
| def update_costs(nb_cost_mat, dis_k_vec): | |||||
| # # method 1: simple least square method. | # # method 1: simple least square method. | ||||
| # edit_costs_new, residual, _, _ = np.linalg.lstsq(nb_cost_mat, dis_k_vec, | # edit_costs_new, residual, _, _ = np.linalg.lstsq(nb_cost_mat, dis_k_vec, | ||||
| # rcond=None) | # rcond=None) | ||||
| @@ -203,7 +165,7 @@ def compute_better_costs(nb_cost_mat, dis_k_vec): | |||||
| # # method 2: least square method with x_i >= 0. | # # method 2: least square method with x_i >= 0. | ||||
| # edit_costs_new, residual = optimize.nnls(nb_cost_mat, dis_k_vec) | # edit_costs_new, residual = optimize.nnls(nb_cost_mat, dis_k_vec) | ||||
| # method 3: solve as a quadratic program with constraints: x_i >= 0, sum(x) = 1. | |||||
| # method 3: solve as a quadratic program with constraints. | |||||
| # P = np.dot(nb_cost_mat.T, nb_cost_mat) | # P = np.dot(nb_cost_mat.T, nb_cost_mat) | ||||
| # q_T = -2 * np.dot(dis_k_vec.T, nb_cost_mat) | # q_T = -2 * np.dot(dis_k_vec.T, nb_cost_mat) | ||||
| # G = -1 * np.identity(nb_cost_mat.shape[1]) | # G = -1 * np.identity(nb_cost_mat.shape[1]) | ||||
| @@ -221,7 +183,7 @@ def compute_better_costs(nb_cost_mat, dis_k_vec): | |||||
| # h = np.array([0 for i in range(nb_cost_mat.shape[1])]) | # h = np.array([0 for i in range(nb_cost_mat.shape[1])]) | ||||
| x = cp.Variable(nb_cost_mat.shape[1]) | x = cp.Variable(nb_cost_mat.shape[1]) | ||||
| cost = cp.sum_squares(nb_cost_mat * x - dis_k_vec) | cost = cp.sum_squares(nb_cost_mat * x - dis_k_vec) | ||||
| constraints = [x >= [0.01 for i in range(nb_cost_mat.shape[1])], | |||||
| constraints = [x >= [0.0001 for i in range(nb_cost_mat.shape[1])], | |||||
| # np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0] | # np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0] | ||||
| np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T@x >= 0.0, | np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T@x >= 0.0, | ||||
| np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0] | np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0] | ||||
| @@ -13,29 +13,30 @@ import multiprocessing | |||||
| from multiprocessing import Pool | from multiprocessing import Pool | ||||
| from functools import partial | from functools import partial | ||||
| from gedlibpy import librariesImport, gedlibpy | |||||
| from gedlibpy_linlin import librariesImport, gedlibpy | |||||
| def GED(g1, g2, lib='gedlibpy', cost='CHEM_1', method='IPFP', | def GED(g1, g2, lib='gedlibpy', cost='CHEM_1', method='IPFP', | ||||
| edit_cost_constant=[], stabilizer='min', repeat=50): | |||||
| edit_cost_constant=[], algo_options='', stabilizer='min', repeat=50): | |||||
| """ | """ | ||||
| Compute GED for 2 graphs. | Compute GED for 2 graphs. | ||||
| """ | """ | ||||
| if lib == 'gedlibpy': | |||||
| def convertGraph(G): | |||||
| """Convert a graph to the proper NetworkX format that can be | |||||
| recognized by library gedlibpy. | |||||
| """ | |||||
| G_new = nx.Graph() | |||||
| for nd, attrs in G.nodes(data=True): | |||||
| G_new.add_node(str(nd), chem=attrs['atom']) | |||||
| def convertGraph(G): | |||||
| """Convert a graph to the proper NetworkX format that can be | |||||
| recognized by library gedlibpy. | |||||
| """ | |||||
| G_new = nx.Graph() | |||||
| for nd, attrs in G.nodes(data=True): | |||||
| G_new.add_node(str(nd), chem=attrs['atom']) | |||||
| # G_new.add_node(str(nd), x=str(attrs['attributes'][0]), | # G_new.add_node(str(nd), x=str(attrs['attributes'][0]), | ||||
| # y=str(attrs['attributes'][1])) | # y=str(attrs['attributes'][1])) | ||||
| for nd1, nd2, attrs in G.edges(data=True): | |||||
| G_new.add_edge(str(nd1), str(nd2), valence=attrs['bond_type']) | |||||
| # G_new.add_edge(str(nd1), str(nd2)) | |||||
| return G_new | |||||
| for nd1, nd2, attrs in G.edges(data=True): | |||||
| # G_new.add_edge(str(nd1), str(nd2), valence=attrs['bond_type']) | |||||
| G_new.add_edge(str(nd1), str(nd2)) | |||||
| return G_new | |||||
| if lib == 'gedlibpy': | |||||
| gedlibpy.restart_env() | gedlibpy.restart_env() | ||||
| gedlibpy.add_nx_graph(convertGraph(g1), "") | gedlibpy.add_nx_graph(convertGraph(g1), "") | ||||
| gedlibpy.add_nx_graph(convertGraph(g2), "") | gedlibpy.add_nx_graph(convertGraph(g2), "") | ||||
| @@ -43,12 +44,12 @@ def GED(g1, g2, lib='gedlibpy', cost='CHEM_1', method='IPFP', | |||||
| listID = gedlibpy.get_all_graph_ids() | listID = gedlibpy.get_all_graph_ids() | ||||
| gedlibpy.set_edit_cost(cost, edit_cost_constant=edit_cost_constant) | gedlibpy.set_edit_cost(cost, edit_cost_constant=edit_cost_constant) | ||||
| gedlibpy.init() | gedlibpy.init() | ||||
| gedlibpy.set_method(method, "") | |||||
| gedlibpy.set_method(method, algo_options) | |||||
| gedlibpy.init_method() | gedlibpy.init_method() | ||||
| g = listID[0] | g = listID[0] | ||||
| h = listID[1] | h = listID[1] | ||||
| if stabilizer == None: | |||||
| if stabilizer is None: | |||||
| gedlibpy.run_method(g, h) | gedlibpy.run_method(g, h) | ||||
| pi_forward = gedlibpy.get_forward_map(g, h) | pi_forward = gedlibpy.get_forward_map(g, h) | ||||
| pi_backward = gedlibpy.get_backward_map(g, h) | pi_backward = gedlibpy.get_backward_map(g, h) | ||||
| @@ -107,13 +108,57 @@ def GED(g1, g2, lib='gedlibpy', cost='CHEM_1', method='IPFP', | |||||
| dis = upper | dis = upper | ||||
| # make the map label correct (label remove map as np.inf) | |||||
| nodes1 = [n for n in g1.nodes()] | |||||
| nodes2 = [n for n in g2.nodes()] | |||||
| nb1 = nx.number_of_nodes(g1) | |||||
| nb2 = nx.number_of_nodes(g2) | |||||
| pi_forward = [nodes2[pi] if pi < nb2 else np.inf for pi in pi_forward] | |||||
| pi_backward = [nodes1[pi] if pi < nb1 else np.inf for pi in pi_backward] | |||||
| elif lib == 'gedlib-bash': | |||||
| import time | |||||
| import random | |||||
| import sys | |||||
| import os | |||||
| sys.path.insert(0, "../") | |||||
| from pygraph.utils.graphfiles import saveDataset | |||||
| tmp_dir = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/output/tmp_ged/' | |||||
| if not os.path.exists(tmp_dir): | |||||
| os.makedirs(tmp_dir) | |||||
| fn_collection = tmp_dir + 'collection.' + str(time.time()) + str(random.randint(0, 1e9)) | |||||
| xparams = {'method': 'gedlib', 'graph_dir': fn_collection} | |||||
| saveDataset([g1, g2], ['dummy', 'dummy'], gformat='gxl', group='xml', | |||||
| filename=fn_collection, xparams=xparams) | |||||
| command = 'GEDLIB_HOME=\'/media/ljia/DATA/research-repo/codes/others/gedlib/gedlib2\'\n' | |||||
| command += 'LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$GEDLIB_HOME/lib\n' | |||||
| command += 'export LD_LIBRARY_PATH\n' | |||||
| command += 'cd \'/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/bin\'\n' | |||||
| command += './ged_for_python_bash monoterpenoides ' + fn_collection \ | |||||
| + ' \'' + algo_options + '\' ' | |||||
| for ec in edit_cost_constant: | |||||
| command += str(ec) + ' ' | |||||
| # output = os.system(command) | |||||
| stream = os.popen(command) | |||||
| output = stream.readlines() | |||||
| # print(output) | |||||
| dis = float(output[0].strip()) | |||||
| runtime = float(output[1].strip()) | |||||
| size_forward = int(output[2].strip()) | |||||
| pi_forward = [int(item.strip()) for item in output[3:3+size_forward]] | |||||
| pi_backward = [int(item.strip()) for item in output[3+size_forward:]] | |||||
| # print(dis) | |||||
| # print(runtime) | |||||
| # print(size_forward) | |||||
| # print(pi_forward) | |||||
| # print(pi_backward) | |||||
| # make the map label correct (label remove map as np.inf) | |||||
| nodes1 = [n for n in g1.nodes()] | |||||
| nodes2 = [n for n in g2.nodes()] | |||||
| nb1 = nx.number_of_nodes(g1) | |||||
| nb2 = nx.number_of_nodes(g2) | |||||
| pi_forward = [nodes2[pi] if pi < nb2 else np.inf for pi in pi_forward] | |||||
| pi_backward = [nodes1[pi] if pi < nb1 else np.inf for pi in pi_backward] | |||||
| # print(pi_forward) | |||||
| return dis, pi_forward, pi_backward | return dis, pi_forward, pi_backward | ||||
| @@ -149,7 +194,7 @@ def GED_n(Gn, lib='gedlibpy', cost='CHEM_1', method='IPFP', | |||||
| g = listID[0] | g = listID[0] | ||||
| h = listID[1] | h = listID[1] | ||||
| if stabilizer == None: | |||||
| if stabilizer is None: | |||||
| gedlibpy.run_method(g, h) | gedlibpy.run_method(g, h) | ||||
| pi_forward = gedlibpy.get_forward_map(g, h) | pi_forward = gedlibpy.get_forward_map(g, h) | ||||
| pi_backward = gedlibpy.get_backward_map(g, h) | pi_backward = gedlibpy.get_backward_map(g, h) | ||||
| @@ -183,7 +228,8 @@ def GED_n(Gn, lib='gedlibpy', cost='CHEM_1', method='IPFP', | |||||
| def ged_median(Gn, Gn_median, verbose=False, params_ged={'lib': 'gedlibpy', | def ged_median(Gn, Gn_median, verbose=False, params_ged={'lib': 'gedlibpy', | ||||
| 'cost': 'CHEM_1', 'method': 'IPFP', 'edit_cost_constant': [], | 'cost': 'CHEM_1', 'method': 'IPFP', 'edit_cost_constant': [], | ||||
| 'stabilizer': 'min', 'repeat': 50}, parallel=False): | |||||
| 'algo_options': '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1', | |||||
| 'stabilizer': None}, parallel=False): | |||||
| if parallel: | if parallel: | ||||
| len_itr = int(len(Gn)) | len_itr = int(len(Gn)) | ||||
| pi_forward_list = [[] for i in range(len_itr)] | pi_forward_list = [[] for i in range(len_itr)] | ||||
| @@ -23,7 +23,8 @@ def iam_upgraded(Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50, | |||||
| connected=False, removeNodes=True, allBestInit=False, allBestNodes=False, | connected=False, removeNodes=True, allBestInit=False, allBestNodes=False, | ||||
| allBestEdges=False, allBestOutput=False, | allBestEdges=False, allBestOutput=False, | ||||
| params_ged={'lib': 'gedlibpy', 'cost': 'CHEM_1', 'method': 'IPFP', | params_ged={'lib': 'gedlibpy', 'cost': 'CHEM_1', 'method': 'IPFP', | ||||
| 'edit_cost_constant': [], 'stabilizer': 'min', 'repeat': 50}): | |||||
| 'edit_cost_constant': [], 'stabilizer': None, | |||||
| 'algo_options': '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1'}): | |||||
| """See my name, then you know what I do. | """See my name, then you know what I do. | ||||
| """ | """ | ||||
| # Gn_median = Gn_median[0:10] | # Gn_median = Gn_median[0:10] | ||||
| @@ -435,6 +436,62 @@ def iam_upgraded(Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50, | |||||
| return G_gen_median_list, sod_gen_median, sod_list, G_set_median_list, sod_set_median | return G_gen_median_list, sod_gen_median, sod_list, G_set_median_list, sod_set_median | ||||
| def iam_bash(Gn_names, edit_cost_constant, dataset='monoterpenoides', | |||||
| graph_dir='/media/ljia/DATA/research-repo/codes/Linlin/py-graph/datasets/monoterpenoides/'): | |||||
| """Compute the iam by c++ implementation (gedlib) through bash. | |||||
| """ | |||||
| import os | |||||
| import time | |||||
def createCollectionFile(Gn_names, y, filename):
    """Write a GraphCollection XML file listing graph files and their classes.

    Parameters
    ----------
    Gn_names : sequence of str
        Graph file names, written as the ``file`` attribute of each entry.
    y : sequence
        Class of each graph, indexed in parallel with ``Gn_names`` and
        converted with ``str``.
    filename : str
        Output path without extension; ``'.xml'`` is appended. Missing parent
        directories are created.

    Raises
    ------
    IndexError
        If ``y`` is shorter than ``Gn_names`` (raised before the file is
        opened, so no partial file is left behind).
    """
    from xml.sax.saxutils import escape

    def _attr(value):
        # Escape &, <, > and the double quote that delimits the attribute, so
        # unusual names cannot produce malformed XML.
        return escape(str(value), {'"': '&quot;'})

    # Create the parent directory if there is one. exist_ok avoids the race
    # between an existence check and the creation, and an empty dirname
    # (a bare file name) is skipped rather than passed to makedirs.
    dirname_ds = os.path.dirname(filename)
    if dirname_ds:
        os.makedirs(dirname_ds, exist_ok=True)

    lines = [
        '<?xml version="1.0"?>',
        '<!DOCTYPE GraphCollection SYSTEM "http://www.inf.unibz.it/~blumenthal/dtd/GraphCollection.dtd">',
        '<GraphCollection>',
    ]
    for idx, fname in enumerate(Gn_names):
        lines.append('\t<graph file="' + _attr(fname)
                     + '" class="' + _attr(y[idx]) + '"/>')
    lines.append('</GraphCollection>')

    # The with-block closes the file; no explicit close() is needed.
    with open(filename + '.xml', 'w') as fgroup:
        fgroup.write('\n'.join(lines))
| tmp_dir = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/output/tmp_ged/' | |||||
| fn_collection = tmp_dir + 'collection.' + str(time.time()) + str(random.randint(0, 1e9)) | |||||
| createCollectionFile(Gn_names, ['dummy'] * len(Gn_names), fn_collection) | |||||
| # graph_dir = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/generated_datsets/monoterpenoides/gxl' | |||||
| command = 'GEDLIB_HOME=\'/media/ljia/DATA/research-repo/codes/others/gedlib/gedlib2\'\n' | |||||
| command += 'LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$GEDLIB_HOME/lib\n' | |||||
| command += 'export LD_LIBRARY_PATH\n' | |||||
| command += 'cd \'/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/bin\'\n' | |||||
| command += './iam_for_python_bash ' + dataset + ' ' + fn_collection \ | |||||
| + ' \'' + graph_dir + '\' ' | |||||
| if edit_cost_constant is None: | |||||
| command += 'None' | |||||
| else: | |||||
| for ec in edit_cost_constant: | |||||
| command += str(ec) + ' ' | |||||
| # output = os.system(command) | |||||
| stream = os.popen(command) | |||||
| output = stream.readlines() | |||||
| # print(output) | |||||
| sod_sm = float(output[0].strip()) | |||||
| sod_gm= float(output[1].strip()) | |||||
| fname_sm = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/output/tmp_ged/set_median.gxl' | |||||
| fname_gm = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/output/tmp_ged/gen_median.gxl' | |||||
| return sod_sm, sod_gm, fname_sm, fname_gm | |||||
| ############################################################################### | ############################################################################### | ||||
| # Old implementations. | # Old implementations. | ||||
| @@ -16,6 +16,319 @@ from utils import remove_edges | |||||
| from fitDistance import fit_GED_to_kernel_distance | from fitDistance import fit_GED_to_kernel_distance | ||||
| from utils import normalize_distance_matrix | from utils import normalize_distance_matrix | ||||
def median_paper_clcpc_python_best():
    """c_vs <= c_vi + c_vr, c_es <= c_ei + c_er with ged computation done
    through the 'gedlibpy' library (40 initial solutions, 8 threads).

    For every class label in ``y_all`` and every repeat index, the matching
    pre-generated monoterpenoides collection is loaded, edit costs are fitted
    to the kernel distances with ``fit_GED_to_kernel_distance``, the results
    are stored in an ``.npz`` file, the fitted edit costs are appended as one
    line to a shared text file, and the pairwise substitution consistency
    between kernel distances and GEDs is printed.

    Takes no arguments and returns nothing; all results go to files under
    ``results/median_paper/`` and to standard output.
    """
    gkernel = 'untilhpathkernel'
    node_label = 'atom'
    edge_label = 'bond_type'
    itr_max = 6
    algo_options = '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1'
    params_ged = {'lib': 'gedlibpy', 'cost': 'CONSTANT', 'method': 'IPFP',
                  'algo_options': algo_options, 'stabilizer': None}

    y_all = ['3', '1', '4', '6', '7', '8', '9', '2']
    repeats = 50
    collection_path = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/generated_datsets/monoterpenoides/'
    graph_dir = collection_path + 'gxl/'

    fn_edit_costs_output = 'results/median_paper/edit_costs_output.python_init40.k10.txt'

    for y in y_all:
        for repeat in range(repeats):
            # Opened before the (long) fitting so that an unwritable output
            # location fails fast; the with-statement guarantees the handle
            # is released even if the fitting raises.
            with open(fn_edit_costs_output, 'a') as edit_costs_output_file:
                collection_file = collection_path + 'monoterpenoides_' + y + '_' + str(repeat) + '.xml'
                Gn, _ = loadDataset(collection_file, extra_params=graph_dir)
                edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \
                    nb_cost_mat_list = fit_GED_to_kernel_distance(
                        Gn, node_label, edge_label, gkernel, itr_max,
                        params_ged=params_ged, parallel=True)
                total_time = np.sum(time_list)

                # NOTE(review): the '..' before 'gm' looks like a typo but is
                # kept so existing result files keep their names.
                np.savez('results/median_paper/fit_distance.clcpc.python_init40.monot.elabeled.uhpkernel.y'
                         + y + '.repeat' + str(repeat) + '.k10..gm',
                         edit_costs=edit_costs,
                         residual_list=residual_list, edit_cost_list=edit_cost_list,
                         dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list,
                         total_time=total_time, nb_cost_mat_list=nb_cost_mat_list)

                # One line per run: costs separated (and followed) by a space.
                edit_costs_output_file.write(
                    ''.join(str(ec) + ' ' for ec in edit_costs) + '\n')

            nb_consistent, nb_inconsistent, ratio_consistent = \
                pairwise_substitution_consistence(dis_k_mat, ged_mat)
            print(nb_consistent, nb_inconsistent, ratio_consistent)

            # NOTE: heat maps of the normalized kernel-distance / GED matrices
            # (as drawn in test_cs_leq_ci_plus_cr_python_bash_cpp) are
            # intentionally not produced for these per-repeat runs.
def median_paper_clcpc_python_bash_cpp():
    """c_vs <= c_vi + c_vr, c_es <= c_ei + c_er with ged computation with
    python invoking the c++ code by bash command (with updated library).

    For every class label in ``y_all`` and every repeat index, the matching
    pre-generated monoterpenoides collection is loaded, edit costs are fitted
    to the kernel distances with ``fit_GED_to_kernel_distance`` (using the
    'gedlib-bash' backend, sequentially), the results are stored in an
    ``.npz`` file, the fitted edit costs are appended as one line to a shared
    text file, and the pairwise substitution consistency between kernel
    distances and GEDs is printed.

    Takes no arguments and returns nothing; all results go to files under
    ``results/median_paper/`` and to standard output.
    """
    gkernel = 'untilhpathkernel'
    node_label = 'atom'
    edge_label = 'bond_type'
    itr_max = 20
    algo_options = '--threads 6 --initial-solutions 10 --ratio-runs-from-initial-solutions .5'
    params_ged = {'lib': 'gedlib-bash', 'cost': 'CONSTANT', 'method': 'IPFP',
                  'algo_options': algo_options}

    y_all = ['3', '1', '4', '6', '7', '8', '9', '2']
    repeats = 50
    collection_path = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/generated_datsets/monoterpenoides/'
    graph_dir = collection_path + 'gxl/'

    fn_edit_costs_output = 'results/median_paper/edit_costs_output.txt'

    for y in y_all:
        for repeat in range(repeats):
            # Opened before the (long) fitting so that an unwritable output
            # location fails fast; the with-statement guarantees the handle
            # is released even if the fitting raises.
            with open(fn_edit_costs_output, 'a') as edit_costs_output_file:
                collection_file = collection_path + 'monoterpenoides_' + y + '_' + str(repeat) + '.xml'
                Gn, _ = loadDataset(collection_file, extra_params=graph_dir)
                edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \
                    nb_cost_mat_list, coef_dk = fit_GED_to_kernel_distance(
                        Gn, node_label, edge_label, gkernel, itr_max,
                        params_ged=params_ged, parallel=False)
                total_time = np.sum(time_list)

                np.savez('results/median_paper/fit_distance.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y'
                         + y + '.repeat' + str(repeat) + '.gm',
                         edit_costs=edit_costs,
                         residual_list=residual_list, edit_cost_list=edit_cost_list,
                         dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list,
                         total_time=total_time, nb_cost_mat_list=nb_cost_mat_list,
                         coef_dk=coef_dk)

                # One line per run: costs separated (and followed) by a space.
                edit_costs_output_file.write(
                    ''.join(str(ec) + ' ' for ec in edit_costs) + '\n')

            nb_consistent, nb_inconsistent, ratio_consistent = \
                pairwise_substitution_consistence(dis_k_mat, ged_mat)
            print(nb_consistent, nb_inconsistent, ratio_consistent)

            # NOTE: heat maps of the normalized kernel-distance / GED matrices
            # (as drawn in test_cs_leq_ci_plus_cr_python_bash_cpp) are
            # intentionally not produced for these per-repeat runs.
def test_cs_leq_ci_plus_cr_python_bash_cpp():
    """c_vs <= c_vi + c_vr, c_es <= c_ei + c_er with ged computation with
    python invoking the c++ code by bash command (with updated library).

    Runs the edit-cost fitting once on the whole monoterpenoides dataset,
    prints and saves the results, prints the pairwise substitution
    consistency, and writes heat maps (eps and png) of the normalized
    kernel-distance matrix, the normalized GED matrix and their difference.
    """
    def _save_heatmap(matrix, stem):
        # Draw ``matrix`` with a colour bar, export it as <stem>.eps and
        # <stem>.png, then clear the current figure for the next plot.
        plt.imshow(matrix)
        plt.colorbar()
        plt.savefig(stem + '.eps', format='eps', dpi=300)
        plt.savefig(stem + '.png', format='png')
        plt.clf()

    ds = {'name': 'monoterpenoides',
          'dataset': '../datasets/monoterpenoides/dataset_10+.ds'}  # node/edge symb
    Gn, y_all = loadDataset(ds['dataset'])

    gkernel = 'untilhpathkernel'
    node_label = 'atom'
    edge_label = 'bond_type'
    itr_max = 10
    algo_options = '--threads 6 --initial-solutions 10 --ratio-runs-from-initial-solutions .5'
    params_ged = {'lib': 'gedlib-bash', 'cost': 'CONSTANT', 'method': 'IPFP',
                  'algo_options': algo_options}

    fitted = fit_GED_to_kernel_distance(Gn, node_label, edge_label,
                                        gkernel, itr_max, params_ged=params_ged,
                                        parallel=False)
    (edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list,
     nb_cost_mat_list, coef_dk) = fitted
    total_time = np.sum(time_list)

    # Report every fitted quantity, in the same order as they are saved below.
    print('\nedit_costs:', edit_costs)
    print('\nresidual_list:', residual_list)
    print('\nedit_cost_list:', edit_cost_list)
    print('\ndistance matrix in kernel space:', dis_k_mat)
    print('\nged matrix:', ged_mat)
    print('\ntotal time:', total_time)
    print('\nnb_cost_mat:', nb_cost_mat_list[-1])
    np.savez('results/fit_distance.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel.gm',
             edit_costs=edit_costs,
             residual_list=residual_list, edit_cost_list=edit_cost_list,
             dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list,
             total_time=total_time, nb_cost_mat_list=nb_cost_mat_list,
             coef_dk=coef_dk)

    consistency = pairwise_substitution_consistence(dis_k_mat, ged_mat)
    nb_consistent, nb_inconsistent, ratio_consistent = consistency
    print(nb_consistent, nb_inconsistent, ratio_consistent)

    # Heat maps: kernel distances, GEDs, then their difference.
    suffix = '.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel'

    norm_dis_k_mat = normalize_distance_matrix(dis_k_mat)
    _save_heatmap(norm_dis_k_mat, 'results/norm_dis_k_mat' + suffix)

    norm_ged_mat = normalize_distance_matrix(ged_mat)
    _save_heatmap(norm_ged_mat, 'results/norm_ged_mat' + suffix)

    norm_diff = norm_ged_mat - norm_dis_k_mat
    _save_heatmap(norm_diff, 'results/diff_mat_norm_ged_dis_k' + suffix)
| def test_anycosts(): | def test_anycosts(): | ||||
| ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt', | ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt', | ||||
| 'extra_params': {}} # node/edge symb | 'extra_params': {}} # node/edge symb | ||||
| @@ -295,8 +608,12 @@ def draw_count_bar(norm_diff): | |||||
| if __name__ == '__main__': | if __name__ == '__main__': | ||||
| # test_anycosts() | # test_anycosts() | ||||
| test_cs_leq_ci_plus_cr() | |||||
| # test_cs_leq_ci_plus_cr() | |||||
| # test_unfitted() | # test_unfitted() | ||||
| # test_cs_leq_ci_plus_cr_python_bash_cpp() | |||||
| # median_paper_clcpc_python_bash_cpp() | |||||
| median_paper_clcpc_python_best() | |||||
| # x = np.array([[1,2,3],[4,5,6],[7,8,9]]) | # x = np.array([[1,2,3],[4,5,6],[7,8,9]]) | ||||
| # xx = pairwise_substitution(x) | # xx = pairwise_substitution(x) | ||||
| @@ -22,6 +22,130 @@ from iam import iam_upgraded | |||||
| from utils import remove_edges, compute_kernel, get_same_item_indices, dis_gstar | from utils import remove_edges, compute_kernel, get_same_item_indices, dis_gstar | ||||
| #from ged import ged_median | #from ged import ged_median | ||||
def test_iam_monoterpenoides_with_init40():
    """Run IAM on the generated monoterpenoides median sets, with GEDs
    computed by 'gedlibpy' using 40 initial solutions.

    For every class label in ``y_all`` and every repeat index, the matching
    collection is loaded as the median set (a copy of it serves as the
    candidate set), ``iam_upgraded`` is run with unfitted edit costs, and the
    running time plus the SODs of the set median and of the generalized
    median are recorded. Per-class values and means, then overall means, are
    printed. Takes no arguments and returns nothing.
    """
    gkernel = 'untilhpathkernel'  # noqa: F841 (kept for reference; not used by IAM itself)
    node_label = 'atom'
    edge_label = 'bond_type'

    # unfitted edit costs.
    c_vi = 3
    c_vr = 3
    c_vs = 1
    c_ei = 3
    c_er = 3
    c_es = 1
    ite_max_iam = 50
    epsilon_iam = 0.0001
    removeNodes = False
    connected_iam = False

    # parameters for IAM function
    ged_cost = 'CONSTANT'
    ged_method = 'IPFP'
    edit_cost_constant = [c_vi, c_vr, c_vs, c_ei, c_er, c_es]
    ged_stabilizer = None
    algo_options = '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1'
    params_ged = {'lib': 'gedlibpy', 'cost': ged_cost, 'method': ged_method,
                  'edit_cost_constant': edit_cost_constant,
                  'algo_options': algo_options,
                  'stabilizer': ged_stabilizer}

    collection_path = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/generated_datsets/monoterpenoides/'
    graph_dir = collection_path + 'gxl/'
    y_all = ['3', '1', '4', '6', '7', '8', '9', '2']
    repeats = 50

    # Per-class means, one entry per label in y_all.
    time_list = []
    sod_gs_list = []
    sod_set_median_list = []
    for y in y_all:
        print('\n-------------------------------------------------------')
        print('class of y:', y)

        # Raw per-repeat values of the current class.
        times_y = []
        sods_gen_median_y = []
        sods_set_median_y = []

        for repeat in range(repeats):
            # load median set.
            collection_file = collection_path + 'monoterpenoides_' + y + '_' + str(repeat) + '.xml'
            Gn_median, _ = loadDataset(collection_file, extra_params=graph_dir)
            Gn_candidate = [g.copy() for g in Gn_median]

            time0 = time.time()
            G_gen_median_list, sod_gen_median, sod_list, G_set_median_list, sod_set_median \
                = iam_upgraded(Gn_median,
                               Gn_candidate, c_ei=c_ei, c_er=c_er, c_es=c_es, ite_max=ite_max_iam,
                               epsilon=epsilon_iam, node_label=node_label, edge_label=edge_label,
                               connected=connected_iam, removeNodes=removeNodes,
                               params_ged=params_ged)
            time_total = time.time() - time0

            print('\ntime: ', time_total)
            times_y.append(time_total)
            sods_set_median_y.append(sod_set_median)
            print('\nsmallest sod of the set median:', sod_set_median)
            sods_gen_median_y.append(sod_gen_median)
            print('\nsmallest sod in graph space:', sod_gen_median)

        print('\nsods of the set median for this class:', sods_set_median_y)
        print('\nsods in graph space for this class:', sods_gen_median_y)
        print('\ntimes for this class:', times_y)

        sod_set_median_list.append(np.mean(sods_set_median_y))
        sod_gs_list.append(np.mean(sods_gen_median_y))
        time_list.append(np.mean(times_y))

    print()
    print('\nmean sods of the set median for each class:', sod_set_median_list)
    print('\nmean sods in graph space for each class:', sod_gs_list)
    print('\nmean times for each class:', time_list)

    print('\nmean sods of the set median of all:', np.mean(sod_set_median_list))
    print('\nmean sods in graph space of all:', np.mean(sod_gs_list))
    print('\nmean times of all:', np.mean(time_list))
| def test_iam_monoterpenoides(): | def test_iam_monoterpenoides(): | ||||
| ds = {'name': 'monoterpenoides', | ds = {'name': 'monoterpenoides', | ||||
| 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb | 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb | ||||
| @@ -834,9 +958,10 @@ if __name__ == '__main__': | |||||
| # tests on different numbers of median-sets. | # tests on different numbers of median-sets. | ||||
| # test_iam_median_nb() | # test_iam_median_nb() | ||||
| # test_iam_letter_h() | # test_iam_letter_h() | ||||
| test_iam_monoterpenoides() | |||||
| # test_iam_monoterpenoides() | |||||
| # test_iam_mutag() | # test_iam_mutag() | ||||
| # test_iam_fitdistance() | # test_iam_fitdistance() | ||||
| # print("test log") | # print("test log") | ||||
| test_iam_monoterpenoides_with_init40() | |||||
| @@ -17,8 +17,10 @@ from pygraph.kernels.marginalizedKernel import marginalizedkernel | |||||
| from pygraph.kernels.untilHPathKernel import untilhpathkernel | from pygraph.kernels.untilHPathKernel import untilhpathkernel | ||||
| from pygraph.kernels.spKernel import spkernel | from pygraph.kernels.spKernel import spkernel | ||||
| import functools | import functools | ||||
| from pygraph.utils.kernels import deltakernel, gaussiankernel, kernelproduct | |||||
| from pygraph.utils.kernels import deltakernel, gaussiankernel, kernelproduct, polynomialkernel | |||||
| from pygraph.kernels.structuralspKernel import structuralspkernel | from pygraph.kernels.structuralspKernel import structuralspkernel | ||||
| from pygraph.kernels.treeletKernel import treeletkernel | |||||
| from pygraph.kernels.weisfeilerLehmanKernel import weisfeilerlehmankernel | |||||
| def remove_edges(Gn): | def remove_edges(Gn): | ||||
| @@ -46,18 +48,29 @@ def compute_kernel(Gn, graph_kernel, node_label, edge_label, verbose): | |||||
| n_jobs=multiprocessing.cpu_count(), verbose=verbose) | n_jobs=multiprocessing.cpu_count(), verbose=verbose) | ||||
| elif graph_kernel == 'untilhpathkernel': | elif graph_kernel == 'untilhpathkernel': | ||||
| Kmatrix, _ = untilhpathkernel(Gn, node_label=node_label, edge_label=edge_label, | Kmatrix, _ = untilhpathkernel(Gn, node_label=node_label, edge_label=edge_label, | ||||
| depth=10, k_func='MinMax', compute_method='trie', | |||||
| depth=7, k_func='MinMax', compute_method='trie', | |||||
| n_jobs=multiprocessing.cpu_count(), verbose=verbose) | n_jobs=multiprocessing.cpu_count(), verbose=verbose) | ||||
| elif graph_kernel == 'spkernel': | elif graph_kernel == 'spkernel': | ||||
| mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) | mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) | ||||
| Kmatrix, _, _ = spkernel(Gn, node_label='atom', node_kernels= | |||||
| Kmatrix, _, _ = spkernel(Gn, node_label=node_label, node_kernels= | |||||
| {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}, | {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}, | ||||
| n_jobs=multiprocessing.cpu_count(), verbose=verbose) | n_jobs=multiprocessing.cpu_count(), verbose=verbose) | ||||
| elif graph_kernel == 'structuralspkernel': | elif graph_kernel == 'structuralspkernel': | ||||
| mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) | mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) | ||||
| Kmatrix, _ = structuralspkernel(Gn, node_label='atom', node_kernels= | |||||
| Kmatrix, _ = structuralspkernel(Gn, node_label=node_label, node_kernels= | |||||
| {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}, | {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}, | ||||
| n_jobs=multiprocessing.cpu_count(), verbose=verbose) | n_jobs=multiprocessing.cpu_count(), verbose=verbose) | ||||
| elif graph_kernel == 'treeletkernel': | |||||
| # pkernel = functools.partial(polynomialkernel, d=2, c=1e5) | |||||
| pkernel = functools.partial(gaussiankernel, gamma=1e-6) | |||||
| mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) | |||||
| Kmatrix, _ = treeletkernel(Gn, node_label=node_label, edge_label=edge_label, | |||||
| sub_kernel=pkernel, | |||||
| n_jobs=multiprocessing.cpu_count(), verbose=verbose) | |||||
| elif graph_kernel == 'weisfeilerlehmankernel': | |||||
| Kmatrix, _ = weisfeilerlehmankernel(Gn, node_label=node_label, edge_label=edge_label, | |||||
| height=4, base_kernel='subtree', | |||||
| n_jobs=multiprocessing.cpu_count(), verbose=verbose) | |||||
| # normalization | # normalization | ||||
| Kmatrix_diag = Kmatrix.diagonal().copy() | Kmatrix_diag = Kmatrix.diagonal().copy() | ||||
| @@ -79,7 +92,7 @@ def gram2distances(Kmatrix): | |||||
| def kernel_distance_matrix(Gn, node_label, edge_label, Kmatrix=None, gkernel=None): | def kernel_distance_matrix(Gn, node_label, edge_label, Kmatrix=None, gkernel=None): | ||||
| dis_mat = np.empty((len(Gn), len(Gn))) | dis_mat = np.empty((len(Gn), len(Gn))) | ||||
| if Kmatrix == None: | |||||
| if Kmatrix is None: | |||||
| Kmatrix = compute_kernel(Gn, gkernel, node_label, edge_label, True) | Kmatrix = compute_kernel(Gn, gkernel, node_label, edge_label, True) | ||||
| for i in range(len(Gn)): | for i in range(len(Gn)): | ||||
| for j in range(i, len(Gn)): | for j in range(i, len(Gn)): | ||||
| @@ -109,6 +122,21 @@ def get_same_item_indices(ls): | |||||
| return idx_dict | return idx_dict | ||||
def k_nearest_neighbors_to_median_in_kernel_space(Gn, Kmatrix=None, gkernel=None,
                                                  node_label=None, edge_label=None):
    """Compute the kernel-space distance between every graph of ``Gn`` and
    the median of ``Gn`` (uniformly weighted combination of all graphs).

    Parameters
    ----------
    Gn : list
        Graphs whose median is considered.
    Kmatrix : array-like, optional
        Precomputed Gram matrix. NOTE(review): assumed to be indexed in the
        same order as ``Gn`` (row/column i <-> ``Gn[i]``) - confirm against
        callers. Computed with ``compute_kernel`` when ``None``.
    gkernel, node_label, edge_label :
        Forwarded to ``compute_kernel`` when ``Kmatrix`` is ``None``.

    Returns
    -------
    list
        ``dis_gstar`` value for each graph, in the order of ``Gn``; empty
        when ``Gn`` is empty.
    """
    nb_graphs = len(Gn)
    if nb_graphs == 0:
        # Nothing to compare; also avoids dividing by zero below.
        return []

    # Every graph of Gn contributes to the median with the same weight.
    idx_gi = list(range(nb_graphs))
    alpha = [1 / nb_graphs] * nb_graphs
    if Kmatrix is None:
        Kmatrix = compute_kernel(Gn, gkernel, node_label, edge_label, True)

    # Third term of the distance: independent of the graph being compared,
    # so it is computed once and handed to dis_gstar.
    term3 = 0
    for i1, a1 in zip(idx_gi, alpha):
        for i2, a2 in zip(idx_gi, alpha):
            term3 += a1 * a2 * Kmatrix[i1, i2]

    dis_k_all = []  # distance between g_star and each graph.
    for ig in tqdm(idx_gi, desc='computing distances', file=sys.stdout):
        dis_k_all.append(dis_gstar(ig, idx_gi, alpha, Kmatrix, term3=term3))

    return dis_k_all
| def normalize_distance_matrix(D): | def normalize_distance_matrix(D): | ||||
| max_value = np.amax(D) | max_value = np.amax(D) | ||||
| min_value = np.amin(D) | min_value = np.amin(D) | ||||
| @@ -124,21 +124,21 @@ def saveGXL(graph, filename, method='benoit'): | |||||
| # reference: https://github.com/dbblumenthal/gedlib/blob/master/data/generate_molecules.py#L22 | # reference: https://github.com/dbblumenthal/gedlib/blob/master/data/generate_molecules.py#L22 | ||||
| # pass | # pass | ||||
| gxl_file = open(filename, 'w') | gxl_file = open(filename, 'w') | ||||
| gxl_file.write("<?xml version=\"1.0\"?>\n") | |||||
| gxl_file.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n") | |||||
| gxl_file.write("<!DOCTYPE gxl SYSTEM \"http://www.gupro.de/GXL/gxl-1.0.dtd\">\n") | gxl_file.write("<!DOCTYPE gxl SYSTEM \"http://www.gupro.de/GXL/gxl-1.0.dtd\">\n") | ||||
| gxl_file.write("<gxl>\n") | |||||
| gxl_file.write("<gxl xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n") | |||||
| gxl_file.write("<graph id=\"" + str(graph.graph['name']) + "\" edgeids=\"true\" edgemode=\"undirected\">\n") | gxl_file.write("<graph id=\"" + str(graph.graph['name']) + "\" edgeids=\"true\" edgemode=\"undirected\">\n") | ||||
| for v, attrs in graph.nodes(data=True): | for v, attrs in graph.nodes(data=True): | ||||
| gxl_file.write("<node id=\"_" + str(v) + "\">") | gxl_file.write("<node id=\"_" + str(v) + "\">") | ||||
| gxl_file.write("<attr name=\"" + "chem" + "\"><int>" + str(attrs['atom']) + "</int></attr>") | |||||
| gxl_file.write("<attr name=\"" + "chem" + "\"><int>" + str(attrs['chem']) + "</int></attr>") | |||||
| gxl_file.write("</node>\n") | gxl_file.write("</node>\n") | ||||
| for v1, v2, attrs in graph.edges(data=True): | for v1, v2, attrs in graph.edges(data=True): | ||||
| gxl_file.write("<edge from=\"_" + str(v1) + "\" to=\"_" + str(v2) + "\">") | gxl_file.write("<edge from=\"_" + str(v1) + "\" to=\"_" + str(v2) + "\">") | ||||
| # gxl_file.write("<attr name=\"valence\"><int>" + str(attrs['bond_type']) + "</int></attr>") | |||||
| gxl_file.write("<attr name=\"valence\"><int>" + "1" + "</int></attr>") | |||||
| gxl_file.write("<attr name=\"valence\"><int>" + str(attrs['valence']) + "</int></attr>") | |||||
| # gxl_file.write("<attr name=\"valence\"><int>" + "1" + "</int></attr>") | |||||
| gxl_file.write("</edge>\n") | gxl_file.write("</edge>\n") | ||||
| gxl_file.write("</graph>\n") | gxl_file.write("</graph>\n") | ||||
| gxl_file.write("</gxl>\n") | |||||
| gxl_file.write("</gxl>") | |||||
| gxl_file.close() | gxl_file.close() | ||||
| elif method == 'gedlib-letter': | elif method == 'gedlib-letter': | ||||
| # reference: https://github.com/dbblumenthal/gedlib/blob/master/data/generate_molecules.py#L22 | # reference: https://github.com/dbblumenthal/gedlib/blob/master/data/generate_molecules.py#L22 | ||||
| @@ -147,15 +147,15 @@ def saveGXL(graph, filename, method='benoit'): | |||||
| gxl_file.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n") | gxl_file.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n") | ||||
| gxl_file.write("<!DOCTYPE gxl SYSTEM \"http://www.gupro.de/GXL/gxl-1.0.dtd\">\n") | gxl_file.write("<!DOCTYPE gxl SYSTEM \"http://www.gupro.de/GXL/gxl-1.0.dtd\">\n") | ||||
| gxl_file.write("<gxl xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n") | gxl_file.write("<gxl xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n") | ||||
| gxl_file.write("<graph id=\"" + str(graph.graph['name']) + "\" edgeids=\"false\" edgemode=\"undirected\">") | |||||
| gxl_file.write("<graph id=\"" + str(graph.graph['name']) + "\" edgeids=\"false\" edgemode=\"undirected\">\n") | |||||
| for v, attrs in graph.nodes(data=True): | for v, attrs in graph.nodes(data=True): | ||||
| gxl_file.write("<node id=\"_" + str(v) + "\">") | gxl_file.write("<node id=\"_" + str(v) + "\">") | ||||
| gxl_file.write("<attr name=\"x\"><float>" + str(attrs['attributes'][0]) + "</float></attr>") | gxl_file.write("<attr name=\"x\"><float>" + str(attrs['attributes'][0]) + "</float></attr>") | ||||
| gxl_file.write("<attr name=\"y\"><float>" + str(attrs['attributes'][1]) + "</float></attr>") | gxl_file.write("<attr name=\"y\"><float>" + str(attrs['attributes'][1]) + "</float></attr>") | ||||
| gxl_file.write("</node>") | |||||
| gxl_file.write("</node>\n") | |||||
| for v1, v2, attrs in graph.edges(data=True): | for v1, v2, attrs in graph.edges(data=True): | ||||
| gxl_file.write("<edge from=\"_" + str(v1) + "\" to=\"_" + str(v2) + "\"/>") | |||||
| gxl_file.write("</graph>") | |||||
| gxl_file.write("<edge from=\"_" + str(v1) + "\" to=\"_" + str(v2) + "\"/>\n") | |||||
| gxl_file.write("</graph>\n") | |||||
| gxl_file.write("</gxl>") | gxl_file.write("</gxl>") | ||||
| gxl_file.close() | gxl_file.close() | ||||
| @@ -466,12 +466,15 @@ def loadDataset(filename, filename_y=None, extra_params=None): | |||||
| def loadFromXML(filename, extra_params): | def loadFromXML(filename, extra_params): | ||||
| import xml.etree.ElementTree as ET | import xml.etree.ElementTree as ET | ||||
| dirname_dataset = dirname(filename) | |||||
| if extra_params: | |||||
| dirname_dataset = extra_params | |||||
| else: | |||||
| dirname_dataset = dirname(filename) | |||||
| tree = ET.parse(filename) | tree = ET.parse(filename) | ||||
| root = tree.getroot() | root = tree.getroot() | ||||
| data = [] | data = [] | ||||
| y = [] | y = [] | ||||
| for graph in root.iter('print'): | |||||
| for graph in root.iter('graph'): | |||||
| mol_filename = graph.attrib['file'] | mol_filename = graph.attrib['file'] | ||||
| mol_class = graph.attrib['class'] | mol_class = graph.attrib['class'] | ||||
| data.append(loadGXL(dirname_dataset + '/' + mol_filename)) | data.append(loadGXL(dirname_dataset + '/' + mol_filename)) | ||||
| @@ -541,15 +544,22 @@ def saveDataset(Gn, y, gformat='gxl', group=None, filename='gfile', xparams=None | |||||
| dirname_ds += '/' | dirname_ds += '/' | ||||
| if not os.path.exists(dirname_ds) : | if not os.path.exists(dirname_ds) : | ||||
| os.makedirs(dirname_ds) | os.makedirs(dirname_ds) | ||||
| if 'graph_dir' in xparams: | |||||
| graph_dir = xparams['graph_dir'] + '/' | |||||
| if not os.path.exists(graph_dir): | |||||
| os.makedirs(graph_dir) | |||||
| else: | |||||
| graph_dir = dirname_ds | |||||
| if group == 'xml' and gformat == 'gxl': | if group == 'xml' and gformat == 'gxl': | ||||
| with open(filename + '.xml', 'w') as fgroup: | with open(filename + '.xml', 'w') as fgroup: | ||||
| fgroup.write("<?xml version=\"1.0\"?>") | fgroup.write("<?xml version=\"1.0\"?>") | ||||
| fgroup.write("\n<!DOCTYPE GraphCollection SYSTEM \"https://dbblumenthal.github.io/gedlib/GraphCollection_8dtd_source.html\">") | |||||
| fgroup.write("\n<!DOCTYPE GraphCollection SYSTEM \"http://www.inf.unibz.it/~blumenthal/dtd/GraphCollection.dtd\">") | |||||
| fgroup.write("\n<GraphCollection>") | fgroup.write("\n<GraphCollection>") | ||||
| for idx, g in enumerate(Gn): | for idx, g in enumerate(Gn): | ||||
| fname_tmp = "graph" + str(idx) + ".gxl" | fname_tmp = "graph" + str(idx) + ".gxl" | ||||
| saveGXL(g, dirname_ds + fname_tmp, method=xparams['method']) | |||||
| saveGXL(g, graph_dir + fname_tmp, method=xparams['method']) | |||||
| fgroup.write("\n\t<graph file=\"" + fname_tmp + "\" class=\"" + str(y[idx]) + "\"/>") | fgroup.write("\n\t<graph file=\"" + fname_tmp + "\" class=\"" + str(y[idx]) + "\"/>") | ||||
| fgroup.write("\n</GraphCollection>") | fgroup.write("\n</GraphCollection>") | ||||
| fgroup.close() | fgroup.close() | ||||
| @@ -558,18 +568,18 @@ def saveDataset(Gn, y, gformat='gxl', group=None, filename='gfile', xparams=None | |||||
| if __name__ == '__main__': | if __name__ == '__main__': | ||||
| # ### Load dataset from .ds file. | # ### Load dataset from .ds file. | ||||
| # # .ct files. | # # .ct files. | ||||
| ds = {'name': 'Alkane', 'dataset': '../../datasets/Alkane/dataset.ds', | |||||
| 'dataset_y': '../../datasets/Alkane/dataset_boiling_point_names.txt'} | |||||
| Gn, y = loadDataset(ds['dataset'], filename_y=ds['dataset_y']) | |||||
| # ds = {'name': 'Acyclic', 'dataset': '../../datasets/acyclic/dataset_bps.ds'} # node symb | |||||
| # Gn, y = loadDataset(ds['dataset']) | |||||
| # ds = {'name': 'MAO', 'dataset': '../../datasets/MAO/dataset.ds'} # node/edge symb | |||||
| # Gn, y = loadDataset(ds['dataset']) | |||||
| # ds = {'name': 'PAH', 'dataset': '../../datasets/PAH/dataset.ds'} # unlabeled | |||||
| # Gn, y = loadDataset(ds['dataset']) | |||||
| print(Gn[1].nodes(data=True)) | |||||
| print(Gn[1].edges(data=True)) | |||||
| print(y[1]) | |||||
| # ds = {'name': 'Alkane', 'dataset': '../../datasets/Alkane/dataset.ds', | |||||
| # 'dataset_y': '../../datasets/Alkane/dataset_boiling_point_names.txt'} | |||||
| # Gn, y = loadDataset(ds['dataset'], filename_y=ds['dataset_y']) | |||||
| ## ds = {'name': 'Acyclic', 'dataset': '../../datasets/acyclic/dataset_bps.ds'} # node symb | |||||
| ## Gn, y = loadDataset(ds['dataset']) | |||||
| ## ds = {'name': 'MAO', 'dataset': '../../datasets/MAO/dataset.ds'} # node/edge symb | |||||
| ## Gn, y = loadDataset(ds['dataset']) | |||||
| ## ds = {'name': 'PAH', 'dataset': '../../datasets/PAH/dataset.ds'} # unlabeled | |||||
| ## Gn, y = loadDataset(ds['dataset']) | |||||
| # print(Gn[1].nodes(data=True)) | |||||
| # print(Gn[1].edges(data=True)) | |||||
| # print(y[1]) | |||||
| # # .gxl file. | # # .gxl file. | ||||
| # ds = {'name': 'monoterpenoides', | # ds = {'name': 'monoterpenoides', | ||||
| @@ -579,6 +589,33 @@ if __name__ == '__main__': | |||||
| # print(Gn[1].edges(data=True)) | # print(Gn[1].edges(data=True)) | ||||
| # print(y[1]) | # print(y[1]) | ||||
| ### Convert graph from one format to another. | |||||
| # .gxl file. | |||||
| import networkx as nx | |||||
| ds = {'name': 'monoterpenoides', | |||||
| 'dataset': '../../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb | |||||
| Gn, y = loadDataset(ds['dataset']) | |||||
| y = [int(i) for i in y] | |||||
| print(Gn[1].nodes(data=True)) | |||||
| print(Gn[1].edges(data=True)) | |||||
| print(y[1]) | |||||
| # Convert a graph to the proper NetworkX format that can be recognized by library gedlib. | |||||
| Gn_new = [] | |||||
| for G in Gn: | |||||
| G_new = nx.Graph() | |||||
| for nd, attrs in G.nodes(data=True): | |||||
| G_new.add_node(str(nd), chem=attrs['atom']) | |||||
| for nd1, nd2, attrs in G.edges(data=True): | |||||
| G_new.add_edge(str(nd1), str(nd2), valence=attrs['bond_type']) | |||||
| # G_new.add_edge(str(nd1), str(nd2)) | |||||
| Gn_new.append(G_new) | |||||
| print(Gn_new[1].nodes(data=True)) | |||||
| print(Gn_new[1].edges(data=True)) | |||||
| print(Gn_new[1]) | |||||
| filename = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/generated_datsets/monoterpenoides/gxl/monoterpenoides' | |||||
| xparams = {'method': 'gedlib'} | |||||
| saveDataset(Gn, y, gformat='gxl', group='xml', filename=filename, xparams=xparams) | |||||
| # ds = {'name': 'MUTAG', 'dataset': '../../datasets/MUTAG/MUTAG.mat', | # ds = {'name': 'MUTAG', 'dataset': '../../datasets/MUTAG/MUTAG.mat', | ||||
| # 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}} # node/edge symb | # 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}} # node/edge symb | ||||
| # Gn, y = loadDataset(ds['dataset'], extra_params=ds['extra_params']) | # Gn, y = loadDataset(ds['dataset'], extra_params=ds['extra_params']) | ||||