| @@ -28,6 +28,7 @@ dslist = [ | |||
| # {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1_A.txt'}, # node symb | |||
| # {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109_A.txt'}, # node symb | |||
| # {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb | |||
| # {'name': 'monoterpenoides', 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'}, # node/edge symb | |||
| # | |||
| # {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'}, | |||
| # # node/edge symb | |||
| @@ -57,7 +58,7 @@ estimator = marginalizedkernel | |||
| #param_grid_precomputed = {'p_quit': np.linspace(0.1, 0.3, 3), | |||
| # 'n_iteration': np.linspace(1, 1, 1), | |||
| param_grid_precomputed = {'p_quit': np.linspace(0.1, 0.9, 9), | |||
| 'n_iteration': np.linspace(5, 20, 4), | |||
| 'n_iteration': np.linspace(1, 19, 7), | |||
| 'remove_totters': [False]} | |||
| param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)}, | |||
| {'alpha': np.logspace(-10, 10, num=41, base=10)}] | |||
| @@ -24,6 +24,9 @@ dslist = [ | |||
| # {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1_A.txt'}, # node symb | |||
| # {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109_A.txt'}, # node symb | |||
| # {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb | |||
| # {'name': 'monoterpenoides', 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'}, # node/edge | |||
| # {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt'}, | |||
| # # node nsymb symb | |||
| # | |||
| # {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'}, | |||
| # # node/edge symb | |||
| @@ -30,6 +30,8 @@ dslist = [ | |||
| # {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'}, | |||
| # # node symb/nsymb | |||
| # {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb | |||
| # {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt'}, | |||
| # # node nsymb symb | |||
| # | |||
| # {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'}, | |||
| # # node/edge symb | |||
| @@ -26,6 +26,7 @@ dslist = [ | |||
| {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109_A.txt'}, # node symb | |||
| {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb | |||
| {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'}, | |||
| # {'name': 'monoterpenoides', 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'}, # node/edge symb | |||
| # node symb/nsymb | |||
| # {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb | |||
| # {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'}, | |||
| @@ -27,7 +27,8 @@ dslist = [ | |||
| {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb | |||
| {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1_A.txt'}, # node symb | |||
| {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109_A.txt'}, # node symb | |||
| {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb | |||
| {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb | |||
| # {'name': 'monoterpenoides', 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'}, # node/edge symb | |||
| # | |||
| # {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'}, | |||
| # # node/edge symb | |||
| @@ -54,11 +55,11 @@ dslist = [ | |||
| # {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',}, | |||
| ] | |||
| estimator = untilhpathkernel | |||
| param_grid_precomputed = {'depth': np.linspace(3, 10, 8), # [2], | |||
| 'k_func': [None]} # ['MinMax', 'tanimoto'], | |||
| #param_grid_precomputed = {'depth': np.linspace(1, 10, 10), # [2], | |||
| # 'k_func': ['MinMax'], # ['MinMax', 'tanimoto'], | |||
| # 'compute_method': ['trie']} # ['MinMax']} | |||
| #param_grid_precomputed = {'depth': np.linspace(3, 10, 8), # [2], | |||
| # 'k_func': [None]} # ['MinMax', 'tanimoto'], | |||
| param_grid_precomputed = {'depth': np.linspace(1, 10, 10), # [2], | |||
| 'k_func': ['MinMax', 'tanimoto'], # ['MinMax'], # | |||
| 'compute_method': ['trie']} # ['MinMax']} | |||
| param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)}, | |||
| {'alpha': np.logspace(-10, 10, num=41, base=10)}] | |||
| @@ -30,6 +30,8 @@ dslist = [ | |||
| {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1_A.txt'}, # node symb | |||
| {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109_A.txt'}, # node symb | |||
| {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb | |||
| # {'name': 'monoterpenoides', 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'}, # node/edge symb | |||
| # | |||
| # {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'}, | |||
| # # node/edge symb | |||
| @@ -7,7 +7,7 @@ Created on Wed Oct 16 14:20:06 2019 | |||
| """ | |||
| import numpy as np | |||
| from tqdm import tqdm | |||
| from itertools import combinations_with_replacement | |||
| from itertools import combinations_with_replacement, combinations | |||
| import multiprocessing | |||
| from multiprocessing import Pool | |||
| from functools import partial | |||
| @@ -22,110 +22,88 @@ import sys | |||
| from ged import GED, get_nb_edit_operations | |||
| from utils import kernel_distance_matrix | |||
| def fit_GED_to_kernel_distance(Gn, node_label, edge_label, gkernel, itr_max, | |||
| fitkernel=None, gamma=1.0): | |||
| def fit_GED_to_kernel_distance(Gn, node_label, edge_label, gkernel, itr_max, k=4, | |||
| params_ged={'lib': 'gedlibpy', 'cost': 'CONSTANT', | |||
| 'method': 'IPFP', 'stabilizer': None}, | |||
| init_costs=[3, 3, 1, 3, 3, 1], | |||
| parallel=True): | |||
| # c_vi, c_vr, c_vs, c_ei, c_er, c_es or parts of them. | |||
| # random.seed(1) | |||
| cost_rdm = random.sample(range(1, 10), 6) | |||
| # edit_costs = cost_rdm + [0] | |||
| edit_costs = cost_rdm | |||
| # edit_costs = [i * 0.01 for i in cost_rdm] + [0] | |||
| # edit_costs = [0.2, 0.2, 0.2, 0.2, 0.2, 0] | |||
| # edit_costs = [0, 0, 0.9544, 0.026, 0.0196, 0] | |||
| # edit_costs = [0.008429912251810438, 0.025461055985319694, 0.2047320869225948, 0.004148727085832133, 0.0, 0] | |||
| idx_cost_nonzeros = [i for i, item in enumerate(edit_costs) if item != 0] | |||
| # cost_rdm = random.sample(range(1, 10), 6) | |||
| # init_costs = cost_rdm + [0] | |||
| # init_costs = cost_rdm | |||
| init_costs = [3, 3, 1, 3, 3, 1] | |||
| # init_costs = [i * 0.01 for i in cost_rdm] + [0] | |||
| # init_costs = [0.2, 0.2, 0.2, 0.2, 0.2, 0] | |||
| # init_costs = [0, 0, 0.9544, 0.026, 0.0196, 0] | |||
| # init_costs = [0.008429912251810438, 0.025461055985319694, 0.2047320869225948, 0.004148727085832133, 0.0, 0] | |||
| # idx_cost_nonzeros = [i for i, item in enumerate(edit_costs) if item != 0] | |||
| # compute distances in feature space. | |||
| coef_dk = 1 | |||
| dis_k_mat, _, _, _ = kernel_distance_matrix(Gn, node_label, edge_label, gkernel=gkernel) | |||
| dis_k_vec = [] | |||
| for i in range(len(dis_k_mat)): | |||
| for j in range(i, len(dis_k_mat)): | |||
| # for j in range(i, len(dis_k_mat)): | |||
| for j in range(i + 1, len(dis_k_mat)): | |||
| dis_k_vec.append(dis_k_mat[i, j]) | |||
| dis_k_vec = np.array(dis_k_vec) | |||
| if fitkernel == None: | |||
| dis_k_vec_ajusted = dis_k_vec | |||
| elif fitkernel == 'gaussian': | |||
| coef_dk = 1 / np.max(dis_k_vec) | |||
| idx_dk_nonzeros = np.where(dis_k_vec != 0)[0] | |||
| # remove 0's and constraint d_k between 0 and 1. | |||
| dis_k_vec = dis_k_vec[idx_dk_nonzeros] * coef_dk | |||
| dis_k_vec_ajusted = np.sqrt(-np.log(dis_k_vec) / gamma) | |||
| residual_list = [] | |||
| edit_cost_list = [] | |||
| time_list = [] | |||
| nb_cost_mat_list = [] | |||
| # init ged. | |||
| print('\ninitial:') | |||
| time0 = time.time() | |||
| params_ged['edit_cost_constant'] = init_costs | |||
| ged_vec_init, ged_mat, n_edit_operations = compute_geds(Gn, params_ged, | |||
| parallel=parallel) | |||
| residual_list = [np.sqrt(np.sum(np.square(np.array(ged_vec_init) - dis_k_vec)))] | |||
| time_list = [time.time() - time0] | |||
| edit_cost_list = [init_costs] | |||
| nb_cost_mat = np.array(n_edit_operations) | |||
| nb_cost_mat_list = [nb_cost_mat] | |||
| print('edit_costs:', init_costs) | |||
| print('residual_list:', residual_list) | |||
| for itr in range(itr_max): | |||
| print('\niteration', itr) | |||
| time0 = time.time() | |||
| # compute GEDs and numbers of edit operations. | |||
| edit_cost_constant = [i for i in edit_costs] | |||
| edit_cost_list.append(edit_cost_constant) | |||
| ged_all, ged_mat, n_edit_operations = compute_geds(Gn, edit_cost_constant, | |||
| idx_cost_nonzeros, parallel=True) | |||
| if fitkernel == None: | |||
| residual = np.sqrt(np.sum(np.square(np.array(ged_all) - dis_k_vec))) | |||
| elif fitkernel == 'gaussian': | |||
| ged_all = np.array(ged_all)[idx_dk_nonzeros] | |||
| residual = np.sqrt(np.sum(np.square( | |||
| np.exp(-gamma * ged_all ** 2) / coef_dk - dis_k_vec))) | |||
| residual_list.append(residual) | |||
| # "fit" geds to distances in feature space by tuning edit costs using the | |||
| # Least Squares Method. | |||
| nb_cost_mat = np.array(n_edit_operations).T | |||
| if fitkernel == 'gaussian': | |||
| nb_cost_mat = nb_cost_mat[idx_dk_nonzeros] | |||
| nb_cost_mat_list.append(nb_cost_mat) | |||
| edit_costs_new, residual = compute_better_costs(nb_cost_mat, dis_k_vec_ajusted) | |||
| print('pseudo residual:', residual) | |||
| edit_costs_new, residual = update_costs(nb_cost_mat, dis_k_vec) | |||
| for i in range(len(edit_costs_new)): | |||
| if edit_costs_new[i] < 0: | |||
| if edit_costs_new[i] > -1e-9: | |||
| edit_costs_new[i] = 0 | |||
| else: | |||
| raise ValueError('The edit cost is negative.') | |||
| for idx, item in enumerate(idx_cost_nonzeros): | |||
| edit_costs[item] = edit_costs_new[idx] | |||
| # for i in range(len(edit_costs_new)): | |||
| # if edit_costs_new[i] < 0: | |||
| # edit_costs_new[i] = 0 | |||
| # compute new GEDs and numbers of edit operations. | |||
| params_ged['edit_cost_constant'] = edit_costs_new | |||
| ged_vec, ged_mat, n_edit_operations = compute_geds(Gn, params_ged, | |||
| parallel=parallel) | |||
| residual_list.append(np.sqrt(np.sum(np.square(np.array(ged_vec) - dis_k_vec)))) | |||
| time_list.append(time.time() - time0) | |||
| print('edit_costs:', edit_costs) | |||
| edit_cost_list.append(edit_costs_new) | |||
| nb_cost_mat = np.array(n_edit_operations) | |||
| nb_cost_mat_list.append(nb_cost_mat) | |||
| print('edit_costs:', edit_costs_new) | |||
| print('residual_list:', residual_list) | |||
| print() | |||
| edit_cost_list.append(edit_costs) | |||
| ged_all, ged_mat, n_edit_operations = compute_geds(Gn, edit_costs, | |||
| idx_cost_nonzeros, parallel=True) | |||
| if fitkernel == 0: | |||
| residual = np.sqrt(np.sum(np.square(np.array(ged_all) - dis_k_vec))) | |||
| elif fitkernel == 'gaussian': | |||
| ged_all = np.array(ged_all)[idx_dk_nonzeros] | |||
| residual = np.sqrt(np.sum(np.square( | |||
| np.exp(-gamma * ged_all ** 2) / coef_dk - dis_k_vec))) | |||
| residual_list.append(residual) | |||
| nb_cost_mat_list.append(np.array(n_edit_operations).T) | |||
| return edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, \ | |||
| time_list, nb_cost_mat_list, coef_dk | |||
| return edit_costs_new, residual_list, edit_cost_list, dis_k_mat, ged_mat, \ | |||
| time_list, nb_cost_mat_list | |||
| def compute_geds(Gn, edit_cost_constant, idx_nonzeros, parallel=False): | |||
| def compute_geds(Gn, params_ged, parallel=False): | |||
| ged_mat = np.zeros((len(Gn), len(Gn))) | |||
| if parallel: | |||
| # print('parallel') | |||
| len_itr = int(len(Gn) * (len(Gn) + 1) / 2) | |||
| ged_all = [0 for i in range(len_itr)] | |||
| n_edit_operations = [[0 for i in range(len_itr)] for j in | |||
| range(len(idx_nonzeros))] | |||
| itr = combinations_with_replacement(range(0, len(Gn)), 2) | |||
| # len_itr = int(len(Gn) * (len(Gn) + 1) / 2) | |||
| len_itr = int(len(Gn) * (len(Gn) - 1) / 2) | |||
| ged_vec = [0 for i in range(len_itr)] | |||
| n_edit_operations = [0 for i in range(len_itr)] | |||
| # itr = combinations_with_replacement(range(0, len(Gn)), 2) | |||
| itr = combinations(range(0, len(Gn)), 2) | |||
| n_jobs = multiprocessing.cpu_count() | |||
| if len_itr < 100 * n_jobs: | |||
| chunksize = int(len_itr / n_jobs) + 1 | |||
| @@ -134,68 +112,52 @@ def compute_geds(Gn, edit_cost_constant, idx_nonzeros, parallel=False): | |||
| def init_worker(gn_toshare): | |||
| global G_gn | |||
| G_gn = gn_toshare | |||
| do_partial = partial(_wrapper_compute_ged_parallel, edit_cost_constant, | |||
| idx_nonzeros) | |||
| do_partial = partial(_wrapper_compute_ged_parallel, params_ged) | |||
| pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(Gn,)) | |||
| iterator = tqdm(pool.imap_unordered(do_partial, itr, chunksize), | |||
| desc='computing GEDs', file=sys.stdout) | |||
| # iterator = pool.imap_unordered(do_partial, itr, chunksize) | |||
| for i, j, dis, n_eo_tmp in iterator: | |||
| idx_itr = int(len(Gn) * i + j - i * (i + 1) / 2) | |||
| ged_all[idx_itr] = dis | |||
| idx_itr = int(len(Gn) * i + j - (i + 1) * (i + 2) / 2) | |||
| ged_vec[idx_itr] = dis | |||
| ged_mat[i][j] = dis | |||
| ged_mat[j][i] = dis | |||
| for idx, item in enumerate(idx_nonzeros): | |||
| n_edit_operations[idx][idx_itr] = n_eo_tmp[item] | |||
| n_edit_operations[idx_itr] = n_eo_tmp | |||
| # print('\n-------------------------------------------') | |||
| # print(i, j, idx_itr, dis) | |||
| pool.close() | |||
| pool.join() | |||
| else: | |||
| ged_all = [] | |||
| n_edit_operations = [[] for i in range(len(idx_nonzeros))] | |||
| ged_vec = [] | |||
| n_edit_operations = [] | |||
| for i in tqdm(range(len(Gn)), desc='computing GEDs', file=sys.stdout): | |||
| # for i in range(len(Gn)): | |||
| for j in range(i, len(Gn)): | |||
| # time0 = time.time() | |||
| dis, pi_forward, pi_backward = GED(Gn[i], Gn[j], lib='gedlibpy', | |||
| cost='CONSTANT', method='IPFP', | |||
| edit_cost_constant=edit_cost_constant, stabilizer='min', | |||
| repeat=50) | |||
| # time1 = time.time() - time0 | |||
| # time0 = time.time() | |||
| ged_all.append(dis) | |||
| for j in range(i + 1, len(Gn)): | |||
| dis, pi_forward, pi_backward = GED(Gn[i], Gn[j], **params_ged) | |||
| ged_vec.append(dis) | |||
| ged_mat[i][j] = dis | |||
| ged_mat[j][i] = dis | |||
| n_eo_tmp = get_nb_edit_operations(Gn[i], Gn[j], pi_forward, pi_backward) | |||
| for idx, item in enumerate(idx_nonzeros): | |||
| n_edit_operations[idx].append(n_eo_tmp[item]) | |||
| # time2 = time.time() - time0 | |||
| # print(time1, time2, time1 / time2) | |||
| n_edit_operations.append(n_eo_tmp) | |||
| return ged_all, ged_mat, n_edit_operations | |||
| return ged_vec, ged_mat, n_edit_operations | |||
| def _wrapper_compute_ged_parallel(edit_cost_constant, idx_nonzeros, itr): | |||
| def _wrapper_compute_ged_parallel(params_ged, itr): | |||
| i = itr[0] | |||
| j = itr[1] | |||
| dis, n_eo_tmp = _compute_ged_parallel(G_gn[i], G_gn[j], edit_cost_constant, | |||
| idx_nonzeros) | |||
| dis, n_eo_tmp = _compute_ged_parallel(G_gn[i], G_gn[j], params_ged) | |||
| return i, j, dis, n_eo_tmp | |||
| def _compute_ged_parallel(g1, g2, edit_cost_constant, idx_nonzeros): | |||
| dis, pi_forward, pi_backward = GED(g1, g2, lib='gedlibpy', | |||
| cost='CONSTANT', method='IPFP', | |||
| edit_cost_constant=edit_cost_constant, stabilizer='min', | |||
| repeat=50) | |||
| n_eo_tmp = get_nb_edit_operations(g1, g2, pi_forward, pi_backward) | |||
| def _compute_ged_parallel(g1, g2, params_ged): | |||
| dis, pi_forward, pi_backward = GED(g1, g2, **params_ged) | |||
| n_eo_tmp = get_nb_edit_operations(g1, g2, pi_forward, pi_backward) | |||
| return dis, n_eo_tmp | |||
| def compute_better_costs(nb_cost_mat, dis_k_vec): | |||
| def update_costs(nb_cost_mat, dis_k_vec): | |||
| # # method 1: simple least square method. | |||
| # edit_costs_new, residual, _, _ = np.linalg.lstsq(nb_cost_mat, dis_k_vec, | |||
| # rcond=None) | |||
| @@ -203,7 +165,7 @@ def compute_better_costs(nb_cost_mat, dis_k_vec): | |||
| # # method 2: least square method with x_i >= 0. | |||
| # edit_costs_new, residual = optimize.nnls(nb_cost_mat, dis_k_vec) | |||
| # method 3: solve as a quadratic program with constraints: x_i >= 0, sum(x) = 1. | |||
| # method 3: solve as a quadratic program with constraints. | |||
| # P = np.dot(nb_cost_mat.T, nb_cost_mat) | |||
| # q_T = -2 * np.dot(dis_k_vec.T, nb_cost_mat) | |||
| # G = -1 * np.identity(nb_cost_mat.shape[1]) | |||
| @@ -221,7 +183,7 @@ def compute_better_costs(nb_cost_mat, dis_k_vec): | |||
| # h = np.array([0 for i in range(nb_cost_mat.shape[1])]) | |||
| x = cp.Variable(nb_cost_mat.shape[1]) | |||
| cost = cp.sum_squares(nb_cost_mat * x - dis_k_vec) | |||
| constraints = [x >= [0.01 for i in range(nb_cost_mat.shape[1])], | |||
| constraints = [x >= [0.0001 for i in range(nb_cost_mat.shape[1])], | |||
| # np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0] | |||
| np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T@x >= 0.0, | |||
| np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0] | |||
| @@ -13,29 +13,30 @@ import multiprocessing | |||
| from multiprocessing import Pool | |||
| from functools import partial | |||
| from gedlibpy import librariesImport, gedlibpy | |||
| from gedlibpy_linlin import librariesImport, gedlibpy | |||
| def GED(g1, g2, lib='gedlibpy', cost='CHEM_1', method='IPFP', | |||
| edit_cost_constant=[], stabilizer='min', repeat=50): | |||
| edit_cost_constant=[], algo_options='', stabilizer='min', repeat=50): | |||
| """ | |||
| Compute GED for 2 graphs. | |||
| """ | |||
| if lib == 'gedlibpy': | |||
| def convertGraph(G): | |||
| """Convert a graph to the proper NetworkX format that can be | |||
| recognized by library gedlibpy. | |||
| """ | |||
| G_new = nx.Graph() | |||
| for nd, attrs in G.nodes(data=True): | |||
| G_new.add_node(str(nd), chem=attrs['atom']) | |||
| def convertGraph(G): | |||
| """Convert a graph to the proper NetworkX format that can be | |||
| recognized by library gedlibpy. | |||
| """ | |||
| G_new = nx.Graph() | |||
| for nd, attrs in G.nodes(data=True): | |||
| G_new.add_node(str(nd), chem=attrs['atom']) | |||
| # G_new.add_node(str(nd), x=str(attrs['attributes'][0]), | |||
| # y=str(attrs['attributes'][1])) | |||
| for nd1, nd2, attrs in G.edges(data=True): | |||
| G_new.add_edge(str(nd1), str(nd2), valence=attrs['bond_type']) | |||
| # G_new.add_edge(str(nd1), str(nd2)) | |||
| return G_new | |||
| for nd1, nd2, attrs in G.edges(data=True): | |||
| # G_new.add_edge(str(nd1), str(nd2), valence=attrs['bond_type']) | |||
| G_new.add_edge(str(nd1), str(nd2)) | |||
| return G_new | |||
| if lib == 'gedlibpy': | |||
| gedlibpy.restart_env() | |||
| gedlibpy.add_nx_graph(convertGraph(g1), "") | |||
| gedlibpy.add_nx_graph(convertGraph(g2), "") | |||
| @@ -43,12 +44,12 @@ def GED(g1, g2, lib='gedlibpy', cost='CHEM_1', method='IPFP', | |||
| listID = gedlibpy.get_all_graph_ids() | |||
| gedlibpy.set_edit_cost(cost, edit_cost_constant=edit_cost_constant) | |||
| gedlibpy.init() | |||
| gedlibpy.set_method(method, "") | |||
| gedlibpy.set_method(method, algo_options) | |||
| gedlibpy.init_method() | |||
| g = listID[0] | |||
| h = listID[1] | |||
| if stabilizer == None: | |||
| if stabilizer is None: | |||
| gedlibpy.run_method(g, h) | |||
| pi_forward = gedlibpy.get_forward_map(g, h) | |||
| pi_backward = gedlibpy.get_backward_map(g, h) | |||
| @@ -107,13 +108,57 @@ def GED(g1, g2, lib='gedlibpy', cost='CHEM_1', method='IPFP', | |||
| dis = upper | |||
| # make the map label correct (label remove map as np.inf) | |||
| nodes1 = [n for n in g1.nodes()] | |||
| nodes2 = [n for n in g2.nodes()] | |||
| nb1 = nx.number_of_nodes(g1) | |||
| nb2 = nx.number_of_nodes(g2) | |||
| pi_forward = [nodes2[pi] if pi < nb2 else np.inf for pi in pi_forward] | |||
| pi_backward = [nodes1[pi] if pi < nb1 else np.inf for pi in pi_backward] | |||
| elif lib == 'gedlib-bash': | |||
| import time | |||
| import random | |||
| import sys | |||
| import os | |||
| sys.path.insert(0, "../") | |||
| from pygraph.utils.graphfiles import saveDataset | |||
| tmp_dir = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/output/tmp_ged/' | |||
| if not os.path.exists(tmp_dir): | |||
| os.makedirs(tmp_dir) | |||
| fn_collection = tmp_dir + 'collection.' + str(time.time()) + str(random.randint(0, 1e9)) | |||
| xparams = {'method': 'gedlib', 'graph_dir': fn_collection} | |||
| saveDataset([g1, g2], ['dummy', 'dummy'], gformat='gxl', group='xml', | |||
| filename=fn_collection, xparams=xparams) | |||
| command = 'GEDLIB_HOME=\'/media/ljia/DATA/research-repo/codes/others/gedlib/gedlib2\'\n' | |||
| command += 'LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$GEDLIB_HOME/lib\n' | |||
| command += 'export LD_LIBRARY_PATH\n' | |||
| command += 'cd \'/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/bin\'\n' | |||
| command += './ged_for_python_bash monoterpenoides ' + fn_collection \ | |||
| + ' \'' + algo_options + '\' ' | |||
| for ec in edit_cost_constant: | |||
| command += str(ec) + ' ' | |||
| # output = os.system(command) | |||
| stream = os.popen(command) | |||
| output = stream.readlines() | |||
| # print(output) | |||
| dis = float(output[0].strip()) | |||
| runtime = float(output[1].strip()) | |||
| size_forward = int(output[2].strip()) | |||
| pi_forward = [int(item.strip()) for item in output[3:3+size_forward]] | |||
| pi_backward = [int(item.strip()) for item in output[3+size_forward:]] | |||
| # print(dis) | |||
| # print(runtime) | |||
| # print(size_forward) | |||
| # print(pi_forward) | |||
| # print(pi_backward) | |||
| # make the map label correct (label remove map as np.inf) | |||
| nodes1 = [n for n in g1.nodes()] | |||
| nodes2 = [n for n in g2.nodes()] | |||
| nb1 = nx.number_of_nodes(g1) | |||
| nb2 = nx.number_of_nodes(g2) | |||
| pi_forward = [nodes2[pi] if pi < nb2 else np.inf for pi in pi_forward] | |||
| pi_backward = [nodes1[pi] if pi < nb1 else np.inf for pi in pi_backward] | |||
| # print(pi_forward) | |||
| return dis, pi_forward, pi_backward | |||
| @@ -149,7 +194,7 @@ def GED_n(Gn, lib='gedlibpy', cost='CHEM_1', method='IPFP', | |||
| g = listID[0] | |||
| h = listID[1] | |||
| if stabilizer == None: | |||
| if stabilizer is None: | |||
| gedlibpy.run_method(g, h) | |||
| pi_forward = gedlibpy.get_forward_map(g, h) | |||
| pi_backward = gedlibpy.get_backward_map(g, h) | |||
| @@ -183,7 +228,8 @@ def GED_n(Gn, lib='gedlibpy', cost='CHEM_1', method='IPFP', | |||
| def ged_median(Gn, Gn_median, verbose=False, params_ged={'lib': 'gedlibpy', | |||
| 'cost': 'CHEM_1', 'method': 'IPFP', 'edit_cost_constant': [], | |||
| 'stabilizer': 'min', 'repeat': 50}, parallel=False): | |||
| 'algo_options': '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1', | |||
| 'stabilizer': None}, parallel=False): | |||
| if parallel: | |||
| len_itr = int(len(Gn)) | |||
| pi_forward_list = [[] for i in range(len_itr)] | |||
| @@ -23,7 +23,8 @@ def iam_upgraded(Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50, | |||
| connected=False, removeNodes=True, allBestInit=False, allBestNodes=False, | |||
| allBestEdges=False, allBestOutput=False, | |||
| params_ged={'lib': 'gedlibpy', 'cost': 'CHEM_1', 'method': 'IPFP', | |||
| 'edit_cost_constant': [], 'stabilizer': 'min', 'repeat': 50}): | |||
| 'edit_cost_constant': [], 'stabilizer': None, | |||
| 'algo_options': '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1'}): | |||
| """See my name, then you know what I do. | |||
| """ | |||
| # Gn_median = Gn_median[0:10] | |||
| @@ -435,6 +436,62 @@ def iam_upgraded(Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50, | |||
| return G_gen_median_list, sod_gen_median, sod_list, G_set_median_list, sod_set_median | |||
def iam_bash(Gn_names, edit_cost_constant, dataset='monoterpenoides',
             graph_dir='/media/ljia/DATA/research-repo/codes/Linlin/py-graph/datasets/monoterpenoides/'):
    """Compute the iam by c++ implementation (gedlib) through bash.

    A GraphCollection XML file listing ``Gn_names`` is written to a
    temporary directory, then the external ``iam_for_python_bash`` binary is
    run through a shell and the first two lines of its standard output are
    parsed as floats.

    Parameters
    ----------
    Gn_names : sequence of str
        Graph file names written into the collection file (one ``<graph>``
        element each, all with class ``'dummy'``).
    edit_cost_constant : iterable or None
        Edit costs appended to the command line, separated by spaces. When
        ``None`` the literal argument ``None`` is passed instead.
    dataset : str
        Dataset name passed as first argument to the binary.
    graph_dir : str
        Directory passed as third argument to the binary.

    Returns
    -------
    tuple
        ``(sod_sm, sod_gm, fname_sm, fname_gm)``: the two floats read from
        the binary's output followed by two fixed ``.gxl`` paths in the
        temporary directory.
        NOTE(review): presumably these are the sums of distances of the set
        median and of the generalized median, and the binary writes the two
        ``.gxl`` files -- confirm against the C++ side.

    Raises
    ------
    IndexError, ValueError
        If the binary prints fewer than two lines, or lines that are not
        floats (e.g. because the command failed).
    """
    import os
    import random
    import shlex
    import time

    def createCollectionFile(Gn_names, y, filename):
        """Create collection file ``filename + '.xml'`` listing the graphs.
        """
        dirname_ds = os.path.dirname(filename)
        if dirname_ds != '':
            os.makedirs(dirname_ds, exist_ok=True)
        # The context manager closes the file; no explicit close() needed.
        with open(filename + '.xml', 'w') as fgroup:
            fgroup.write("<?xml version=\"1.0\"?>")
            fgroup.write("\n<!DOCTYPE GraphCollection SYSTEM \"http://www.inf.unibz.it/~blumenthal/dtd/GraphCollection.dtd\">")
            fgroup.write("\n<GraphCollection>")
            for idx, fname in enumerate(Gn_names):
                fgroup.write("\n\t<graph file=\"" + fname + "\" class=\"" + str(y[idx]) + "\"/>")
            fgroup.write("\n</GraphCollection>")

    tmp_dir = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/output/tmp_ged/'
    # Timestamp plus a random integer keeps concurrent calls from clashing.
    # The bound must be an int: float arguments to randint() are rejected by
    # recent Python versions.
    fn_collection = tmp_dir + 'collection.' + str(time.time()) \
        + str(random.randint(0, 10 ** 9))
    createCollectionFile(Gn_names, ['dummy'] * len(Gn_names), fn_collection)
#    graph_dir = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/generated_datsets/monoterpenoides/gxl'

    if edit_cost_constant is None:
        cost_args = 'None'
    else:
        cost_args = ' '.join(str(ec) for ec in edit_cost_constant)

    command = 'GEDLIB_HOME=\'/media/ljia/DATA/research-repo/codes/others/gedlib/gedlib2\'\n'
    command += 'LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$GEDLIB_HOME/lib\n'
    command += 'export LD_LIBRARY_PATH\n'
    command += 'cd \'/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/bin\'\n'
    # Quote every path-like argument so spaces or quotes cannot break the
    # command line.
    command += ' '.join(['./iam_for_python_bash', shlex.quote(dataset),
                         shlex.quote(fn_collection), shlex.quote(graph_dir),
                         cost_args])

    # Close the pipe deterministically once all output has been read.
    with os.popen(command) as stream:
        output = stream.readlines()

    sod_sm = float(output[0].strip())
    sod_gm = float(output[1].strip())

    fname_sm = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/output/tmp_ged/set_median.gxl'
    fname_gm = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/output/tmp_ged/gen_median.gxl'

    return sod_sm, sod_gm, fname_sm, fname_gm
| ############################################################################### | |||
| # Old implementations. | |||
| @@ -16,6 +16,319 @@ from utils import remove_edges | |||
| from fitDistance import fit_GED_to_kernel_distance | |||
| from utils import normalize_distance_matrix | |||
def median_paper_clcpc_python_best():
    """c_vs <= c_vi + c_vr, c_es <= c_ei + c_er with ged computation done
    through the ``gedlibpy`` Python binding (IPFP, 40 initial solutions).

    For every class label in ``y_all`` and every repeat, the matching
    monoterpenoides collection is loaded, edit costs are fitted to the
    distances of the ``untilhpathkernel`` with
    ``fit_GED_to_kernel_distance``, the results are saved with ``np.savez``
    and the fitted costs are appended as one line to a text file. The
    pairwise consistency between kernel distances and GEDs is then printed.
    """
    import os

#    ds = {'name': 'monoterpenoides',
#          'dataset': '../datasets/monoterpenoides/dataset_10+.ds'}  # node/edge symb
#    _, y_all = loadDataset(ds['dataset'])
    gkernel = 'untilhpathkernel'
    node_label = 'atom'
    edge_label = 'bond_type'
    itr_max = 6
    algo_options = '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1'
    params_ged = {'lib': 'gedlibpy', 'cost': 'CONSTANT', 'method': 'IPFP',
                  'algo_options': algo_options, 'stabilizer': None}

    y_all = ['3', '1', '4', '6', '7', '8', '9', '2']
    repeats = 50
    collection_path = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/generated_datsets/monoterpenoides/'
    graph_dir = collection_path + 'gxl/'

    fn_edit_costs_output = 'results/median_paper/edit_costs_output.python_init40.k10.txt'
    # Make sure the output directory exists before any long computation, so
    # a missing directory cannot discard a finished fit.
    os.makedirs(os.path.dirname(fn_edit_costs_output), exist_ok=True)

    # NOTE(review): the source this was taken from had lost its indentation;
    # everything below is assumed to run once per (y, repeat) pair because it
    # uses per-iteration results -- confirm.
    for y in y_all:
        for repeat in range(repeats):
            collection_file = collection_path + 'monoterpenoides_' + y + '_' + str(repeat) + '.xml'
            Gn, _ = loadDataset(collection_file, extra_params=graph_dir)

            edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \
                nb_cost_mat_list = fit_GED_to_kernel_distance(Gn, node_label, edge_label,
                                                              gkernel, itr_max, params_ged=params_ged,
                                                              parallel=True)
            total_time = np.sum(time_list)
#            print('\nedit_costs:', edit_costs)
#            print('\nresidual_list:', residual_list)
#            print('\nedit_cost_list:', edit_cost_list)
#            print('\ndistance matrix in kernel space:', dis_k_mat)
#            print('\nged matrix:', ged_mat)
#            print('\ntotal time:', total_time)
#            print('\nnb_cost_mat:', nb_cost_mat_list[-1])
            np.savez('results/median_paper/fit_distance.clcpc.python_init40.monot.elabeled.uhpkernel.y'
                     + y + '.repeat' + str(repeat) + '.k10..gm',
                     edit_costs=edit_costs,
                     residual_list=residual_list, edit_cost_list=edit_cost_list,
                     dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list,
                     total_time=total_time, nb_cost_mat_list=nb_cost_mat_list)

            # Append the fitted costs as one space-separated line. The file
            # is opened only for the write so it is always closed, even if
            # a later step raises.
            with open(fn_edit_costs_output, 'a') as edit_costs_output_file:
                edit_costs_output_file.write(
                    ''.join(str(ec) + ' ' for ec in edit_costs) + '\n')

#            # normalized distance matrices.
#            gmfile = np.load('results/fit_distance.cs_leq_ci_plus_cr.cost_leq_1en2.monot.elabeled.uhpkernel.gm.npz')
#            edit_costs = gmfile['edit_costs']
#            residual_list = gmfile['residual_list']
#            edit_cost_list = gmfile['edit_cost_list']
#            dis_k_mat = gmfile['dis_k_mat']
#            ged_mat = gmfile['ged_mat']
#            total_time = gmfile['total_time']
#            nb_cost_mat_list = gmfile['nb_cost_mat_list']

            nb_consistent, nb_inconsistent, ratio_consistent = pairwise_substitution_consistence(dis_k_mat, ged_mat)
            print(nb_consistent, nb_inconsistent, ratio_consistent)

#            norm_dis_k_mat = normalize_distance_matrix(dis_k_mat)
#            plt.imshow(norm_dis_k_mat)
#            plt.colorbar()
#            plt.savefig('results/median_paper/norm_dis_k_mat.clcpc.python_best.monot.elabeled.uhpkernel.y'
#                        + y + '.repeat' + str(repeat) + '.eps', format='eps', dpi=300)
#            plt.savefig('results/median_paper/norm_dis_k_mat.clcpc.python_best.monot.elabeled.uhpkernel.y'
#                        + y + '.repeat' + str(repeat) + '.png', format='png')
#    #        plt.show()
#            plt.clf()
#
#            norm_ged_mat = normalize_distance_matrix(ged_mat)
#            plt.imshow(norm_ged_mat)
#            plt.colorbar()
#            plt.savefig('results/median_paper/norm_ged_mat.clcpc.python_best.monot.elabeled.uhpkernel.y'
#                        + y + '.repeat' + str(repeat) + '.eps', format='eps', dpi=300)
#            plt.savefig('results/median_paper/norm_ged_mat.clcpc.python_best.monot.elabeled.uhpkernel.y'
#                        + y + '.repeat' + str(repeat) + '.png', format='png')
#    #        plt.show()
#            plt.clf()
#
#            norm_diff = norm_ged_mat - norm_dis_k_mat
#            plt.imshow(norm_diff)
#            plt.colorbar()
#            plt.savefig('results/median_paper/diff_mat_norm_ged_dis_k.clcpc.python_best.monot.elabeled.uhpkernel.y'
#                        + y + '.repeat' + str(repeat) + '.eps', format='eps', dpi=300)
#            plt.savefig('results/median_paper/diff_mat_norm_ged_dis_k.clcpc.python_best.monot.elabeled.uhpkernel.y'
#                        + y + '.repeat' + str(repeat) + '.png', format='png')
#    #        plt.show()
#            plt.clf()
#    #        draw_count_bar(norm_diff)
def median_paper_clcpc_python_bash_cpp():
    """Fit GED edit costs to kernel distances on the generated monoterpenoides collections.

    c_vs <= c_vi + c_vr, c_es <= c_ei + c_er with ged computation with
    python invoking the c++ code by bash command (with updated library).

    For every class label in ``y_all`` and every repeat index the function

    * loads the collection ``monoterpenoides_<y>_<repeat>.xml`` (graph files
      are looked up in ``graph_dir``),
    * calls ``fit_GED_to_kernel_distance`` on the loaded graphs,
    * stores everything that call returned, plus the summed time, in an
      ``.npz`` archive under ``results/median_paper/``,
    * appends the fitted edit costs as one space-separated line to
      ``results/median_paper/edit_costs_output.txt``,
    * prints the values returned by ``pairwise_substitution_consistence``.

    Nothing is returned; results go to disk and stdout only.
    """
    # Alternative source for the class labels, kept for reference:
    # ds = {'name': 'monoterpenoides',
    #       'dataset': '../datasets/monoterpenoides/dataset_10+.ds'}  # node/edge symb
    # _, y_all = loadDataset(ds['dataset'])
    gkernel = 'untilhpathkernel'
    node_label = 'atom'
    edge_label = 'bond_type'
    itr_max = 20
    algo_options = '--threads 6 --initial-solutions 10 --ratio-runs-from-initial-solutions .5'
    params_ged = {'lib': 'gedlib-bash', 'cost': 'CONSTANT', 'method': 'IPFP',
                  'algo_options': algo_options}

    y_all = ['3', '1', '4', '6', '7', '8', '9', '2']
    repeats = 50
    collection_path = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/generated_datsets/monoterpenoides/'
    graph_dir = collection_path + 'gxl/'

    fn_edit_costs_output = 'results/median_paper/edit_costs_output.txt'

    for y in y_all:
        for repeat in range(repeats):
            collection_file = collection_path + 'monoterpenoides_' + y + '_' + str(repeat) + '.xml'
            Gn, _ = loadDataset(collection_file, extra_params=graph_dir)

            edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \
                nb_cost_mat_list, coef_dk = fit_GED_to_kernel_distance(
                    Gn, node_label, edge_label, gkernel, itr_max,
                    params_ged=params_ged, parallel=False)
            total_time = np.sum(time_list)
            # print('\nedit_costs:', edit_costs)
            # print('\nresidual_list:', residual_list)
            # print('\nedit_cost_list:', edit_cost_list)
            # print('\ndistance matrix in kernel space:', dis_k_mat)
            # print('\nged matrix:', ged_mat)
            # print('\ntotal time:', total_time)
            # print('\nnb_cost_mat:', nb_cost_mat_list[-1])

            np.savez('results/median_paper/fit_distance.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y'
                     + y + '.repeat' + str(repeat) + '.gm',
                     edit_costs=edit_costs,
                     residual_list=residual_list, edit_cost_list=edit_cost_list,
                     dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list,
                     total_time=total_time, nb_cost_mat_list=nb_cost_mat_list,
                     coef_dk=coef_dk)

            # Open the log only for the duration of the write, so the handle
            # is released even if something fails; each cost is followed by a
            # single space, then the line is terminated.
            with open(fn_edit_costs_output, 'a') as edit_costs_output_file:
                edit_costs_output_file.write(
                    ''.join(str(ec) + ' ' for ec in edit_costs) + '\n')

            # # normalized distance matrices.
            # gmfile = np.load('results/fit_distance.cs_leq_ci_plus_cr.cost_leq_1en2.monot.elabeled.uhpkernel.gm.npz')
            # edit_costs = gmfile['edit_costs']
            # residual_list = gmfile['residual_list']
            # edit_cost_list = gmfile['edit_cost_list']
            # dis_k_mat = gmfile['dis_k_mat']
            # ged_mat = gmfile['ged_mat']
            # total_time = gmfile['total_time']
            # nb_cost_mat_list = gmfile['nb_cost_mat_list']
            # coef_dk = gmfile['coef_dk']

            # NOTE(review): indentation was not visible in the reviewed text;
            # this check is placed inside the inner loop because the
            # commented-out plotting code that follows it uses `y` and
            # `repeat` -- confirm against the original file.
            nb_consistent, nb_inconsistent, ratio_consistent = \
                pairwise_substitution_consistence(dis_k_mat, ged_mat)
            print(nb_consistent, nb_inconsistent, ratio_consistent)
| # norm_dis_k_mat = normalize_distance_matrix(dis_k_mat) | |||
| # plt.imshow(norm_dis_k_mat) | |||
| # plt.colorbar() | |||
| # plt.savefig('results/median_paper/norm_dis_k_mat.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y' | |||
| # + y + '.repeat' + str(repeat) + '.eps', format='eps', dpi=300) | |||
| # plt.savefig('results/median_paper/norm_dis_k_mat.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y' | |||
| # + y + '.repeat' + str(repeat) + '.png', format='png') | |||
| # # plt.show() | |||
| # plt.clf() | |||
| # | |||
| # norm_ged_mat = normalize_distance_matrix(ged_mat) | |||
| # plt.imshow(norm_ged_mat) | |||
| # plt.colorbar() | |||
| # plt.savefig('results/median_paper/norm_ged_mat.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y' | |||
| # + y + '.repeat' + str(repeat) + '.eps', format='eps', dpi=300) | |||
| # plt.savefig('results/median_paper/norm_ged_mat.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y' | |||
| # + y + '.repeat' + str(repeat) + '.png', format='png') | |||
| # # plt.show() | |||
| # plt.clf() | |||
| # | |||
| # norm_diff = norm_ged_mat - norm_dis_k_mat | |||
| # plt.imshow(norm_diff) | |||
| # plt.colorbar() | |||
| # plt.savefig('results/median_paper/diff_mat_norm_ged_dis_k.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y' | |||
| # + y + '.repeat' + str(repeat) + '.eps', format='eps', dpi=300) | |||
| # plt.savefig('results/median_paper/diff_mat_norm_ged_dis_k.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y' | |||
| # + y + '.repeat' + str(repeat) + '.png', format='png') | |||
| # # plt.show() | |||
| # plt.clf() | |||
| # # draw_count_bar(norm_diff) | |||
def test_cs_leq_ci_plus_cr_python_bash_cpp():
    """Fit edit costs on the monoterpenoides dataset and plot the distance matrices.

    c_vs <= c_vi + c_vr, c_es <= c_ei + c_er with ged computation with
    python invoking the c++ code by bash command (with updated library).

    The run prints the quantities returned by ``fit_GED_to_kernel_distance``,
    archives them with ``np.savez`` and writes three heat maps (normalized
    kernel distances, normalized GEDs, and the difference of the two) as
    ``.eps`` and ``.png`` files under ``results/``.
    """

    def _dump_heatmap(matrix, path_stem):
        # Draw `matrix` with a colour bar, save it in both formats and reset
        # the current figure for the next plot.
        plt.imshow(matrix)
        plt.colorbar()
        plt.savefig(path_stem + '.eps', format='eps', dpi=300)
        plt.savefig(path_stem + '.png', format='png')
        # plt.show()
        plt.clf()

    ds = {'name': 'monoterpenoides',
          'dataset': '../datasets/monoterpenoides/dataset_10+.ds'}  # node/edge symb
    Gn, y_all = loadDataset(ds['dataset'])
    # Gn = Gn[0:10]

    gkernel = 'untilhpathkernel'
    node_label = 'atom'
    edge_label = 'bond_type'
    itr_max = 10
    algo_options = '--threads 6 --initial-solutions 10 --ratio-runs-from-initial-solutions .5'
    params_ged = {'lib': 'gedlib-bash', 'cost': 'CONSTANT', 'method': 'IPFP',
                  'algo_options': algo_options}

    (edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list,
     nb_cost_mat_list, coef_dk) = fit_GED_to_kernel_distance(
        Gn, node_label, edge_label, gkernel, itr_max,
        params_ged=params_ged, parallel=False)
    total_time = np.sum(time_list)

    print('\nedit_costs:', edit_costs)
    print('\nresidual_list:', residual_list)
    print('\nedit_cost_list:', edit_cost_list)
    print('\ndistance matrix in kernel space:', dis_k_mat)
    print('\nged matrix:', ged_mat)
    print('\ntotal time:', total_time)
    print('\nnb_cost_mat:', nb_cost_mat_list[-1])

    # Same keyword names and order as the archive has always used.
    archive = {
        'edit_costs': edit_costs,
        'residual_list': residual_list,
        'edit_cost_list': edit_cost_list,
        'dis_k_mat': dis_k_mat,
        'ged_mat': ged_mat,
        'time_list': time_list,
        'total_time': total_time,
        'nb_cost_mat_list': nb_cost_mat_list,
        'coef_dk': coef_dk,
    }
    np.savez('results/fit_distance.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel.gm',
             **archive)

    # ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
    #       'extra_params': {}}  # node/edge symb
    # Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
    # # Gn = Gn[0:10]
    # # remove_edges(Gn)
    # gkernel = 'untilhpathkernel'
    # node_label = 'atom'
    # edge_label = 'bond_type'
    # itr_max = 10
    # edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \
    #     nb_cost_mat_list, coef_dk = fit_GED_to_kernel_distance(Gn, node_label, edge_label,
    #                                                            gkernel, itr_max)
    # total_time = np.sum(time_list)
    # print('\nedit_costs:', edit_costs)
    # print('\nresidual_list:', residual_list)
    # print('\nedit_cost_list:', edit_cost_list)
    # print('\ndistance matrix in kernel space:', dis_k_mat)
    # print('\nged matrix:', ged_mat)
    # print('\ntotal time:', total_time)
    # print('\nnb_cost_mat:', nb_cost_mat_list[-1])
    # np.savez('results/fit_distance.cs_leq_ci_plus_cr.mutag.elabeled.uhpkernel.gm',
    #          edit_costs=edit_costs,
    #          residual_list=residual_list, edit_cost_list=edit_cost_list,
    #          dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list,
    #          total_time=total_time, nb_cost_mat_list=nb_cost_mat_list, coef_dk=coef_dk)

    # # normalized distance matrices.
    # gmfile = np.load('results/fit_distance.cs_leq_ci_plus_cr.monot.elabeled.uhpkernel.gm.npz')
    # edit_costs = gmfile['edit_costs']
    # residual_list = gmfile['residual_list']
    # edit_cost_list = gmfile['edit_cost_list']
    # dis_k_mat = gmfile['dis_k_mat']
    # ged_mat = gmfile['ged_mat']
    # total_time = gmfile['total_time']
    # nb_cost_mat_list = gmfile['nb_cost_mat_list']
    # coef_dk = gmfile['coef_dk']

    n_agree, n_disagree, agree_ratio = pairwise_substitution_consistence(dis_k_mat, ged_mat)
    print(n_agree, n_disagree, agree_ratio)

    # dis_k_sub = pairwise_substitution(dis_k_mat)
    # ged_sub = pairwise_substitution(ged_mat)
    # np.savez('results/sub_dis_mat.cs_leq_ci_plus_cr.gm',
    #          dis_k_sub=dis_k_sub, ged_sub=ged_sub)

    norm_dis_k_mat = normalize_distance_matrix(dis_k_mat)
    _dump_heatmap(
        norm_dis_k_mat,
        'results/norm_dis_k_mat.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel')

    norm_ged_mat = normalize_distance_matrix(ged_mat)
    _dump_heatmap(
        norm_ged_mat,
        'results/norm_ged_mat.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel')

    _dump_heatmap(
        norm_ged_mat - norm_dis_k_mat,
        'results/diff_mat_norm_ged_dis_k.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel')
| # draw_count_bar(norm_diff) | |||
| def test_anycosts(): | |||
| ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt', | |||
| 'extra_params': {}} # node/edge symb | |||
| @@ -295,8 +608,12 @@ def draw_count_bar(norm_diff): | |||
| if __name__ == '__main__': | |||
| # test_anycosts() | |||
| test_cs_leq_ci_plus_cr() | |||
| # test_cs_leq_ci_plus_cr() | |||
| # test_unfitted() | |||
| # test_cs_leq_ci_plus_cr_python_bash_cpp() | |||
| # median_paper_clcpc_python_bash_cpp() | |||
| median_paper_clcpc_python_best() | |||
| # x = np.array([[1,2,3],[4,5,6],[7,8,9]]) | |||
| # xx = pairwise_substitution(x) | |||
| @@ -22,6 +22,130 @@ from iam import iam_upgraded | |||
| from utils import remove_edges, compute_kernel, get_same_item_indices, dis_gstar | |||
| #from ged import ged_median | |||
def test_iam_monoterpenoides_with_init40():
    """Run ``iam_upgraded`` on the generated monoterpenoides median sets.

    GED is configured with the ``gedlibpy`` library, constant (unfitted) edit
    costs, the IPFP method and 40 initial solutions. For each class label and
    each repeat index the matching collection is loaded, used both as median
    set and (copied) as candidate set, and passed to ``iam_upgraded``. Run
    time and the two sums of distances (set median / generated median) are
    printed per run, listed per class, averaged per class and finally
    averaged over all classes.
    """
    gkernel = 'untilhpathkernel'  # not referenced below
    node_label, edge_label = 'atom', 'bond_type'

    # unfitted edit costs.
    c_vi, c_vr, c_vs = 3, 3, 1
    c_ei, c_er, c_es = 3, 3, 1

    # parameters for IAM function
    ite_max_iam = 50
    epsilon_iam = 0.0001
    removeNodes = False
    connected_iam = False

    # parameters for the GED computation
    # ged_repeat = 50
    algo_options = '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1'
    params_ged = {
        'lib': 'gedlibpy',
        'cost': 'CONSTANT',
        'method': 'IPFP',
        'edit_cost_constant': [c_vi, c_vr, c_vs, c_ei, c_er, c_es],
        'algo_options': algo_options,
        'stabilizer': None,
    }

    collection_path = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/generated_datsets/monoterpenoides/'
    graph_dir = collection_path + 'gxl/'
    y_all = ['3', '1', '4', '6', '7', '8', '9', '2']
    repeats = 50

    # One entry per class; the numeric lists end up holding per-class means.
    time_list = []
    dis_ks_min_list = []
    dis_ks_set_median_list = []
    sod_gs_list = []
    g_best = []
    sod_set_median_list = []
    sod_list_list = []

    for y in y_all:
        print('\n-------------------------------------------------------')
        print('class of y:', y)

        class_times = []
        class_sod_set_median = []
        class_sod_gs = []
        class_best = []
        g_best.append(class_best)
        dis_ks_min_list.append([])
        dis_ks_set_median_list.append([])

        for repeat in range(repeats):
            # load median set.
            collection_file = ''.join(
                [collection_path, 'monoterpenoides_', y, '_', str(repeat), '.xml'])
            Gn_median, _ = loadDataset(collection_file, extra_params=graph_dir)
            Gn_candidate = [g.copy() for g in Gn_median]

            tic = time.time()
            (G_gen_median_list, sod_gen_median, sod_list,
             G_set_median_list, sod_set_median) = iam_upgraded(
                Gn_median, Gn_candidate,
                c_ei=c_ei, c_er=c_er, c_es=c_es,
                ite_max=ite_max_iam, epsilon=epsilon_iam,
                node_label=node_label, edge_label=edge_label,
                connected=connected_iam, removeNodes=removeNodes,
                params_ged=params_ged)
            elapsed = time.time() - tic

            print('\ntime: ', elapsed)
            class_times.append(elapsed)
            class_best.append(G_gen_median_list[0])
            class_sod_set_median.append(sod_set_median)
            print('\nsmallest sod of the set median:', sod_set_median)
            class_sod_gs.append(sod_gen_median)
            print('\nsmallest sod in graph space:', sod_gen_median)
            sod_list_list.append(sod_list)

            # # show the best graph and save it to file.
            # print('one of the possible corresponding pre-images is')
            # nx.draw(G_gen_median_list[0], labels=nx.get_node_attributes(G_gen_median_list[0], 'atom'),
            #         with_labels=True)
            # # plt.show()
            # # plt.savefig('results/iam/mutag_median.fit_costs2.001.nb' + str(nb_median) +
            # # plt.savefig('results/iam/paper_compare/monoter_y' + str(y_class) +
            # #             '_repeat' + str(repeat) + '_' + str(time.time()) +
            # #             '.png', format="PNG")
            # plt.clf()
            # # print(G_gen_median_list[0].nodes(data=True))
            # # print(G_gen_median_list[0].edges(data=True))

        print('\nsods of the set median for this class:', class_sod_set_median)
        print('\nsods in graph space for this class:', class_sod_gs)
        # print('\ndistance in kernel space of set median for this class:',
        #       dis_ks_set_median_list[-1])
        # print('\nsmallest distances in kernel space for this class:',
        #       dis_ks_min_list[-1])
        print('\ntimes for this class:', class_times)

        # Collapse the per-run values of this class into their mean.
        sod_set_median_list.append(np.mean(class_sod_set_median))
        sod_gs_list.append(np.mean(class_sod_gs))
        # dis_ks_set_median_list[-1] = np.mean(dis_ks_set_median_list[-1])
        # dis_ks_min_list[-1] = np.mean(dis_ks_min_list[-1])
        time_list.append(np.mean(class_times))

    print()
    print('\nmean sods of the set median for each class:', sod_set_median_list)
    print('\nmean sods in graph space for each class:', sod_gs_list)
    # print('\ndistances in kernel space of set median for each class:',
    #       dis_ks_set_median_list)
    # print('\nmean smallest distances in kernel space for each class:',
    #       dis_ks_min_list)
    print('\nmean times for each class:', time_list)

    print('\nmean sods of the set median of all:', np.mean(sod_set_median_list))
    print('\nmean sods in graph space of all:', np.mean(sod_gs_list))
    # print('\nmean distances in kernel space of set median of all:',
    #       np.mean(dis_ks_set_median_list))
    # print('\nmean smallest distances in kernel space of all:',
    #       np.mean(dis_ks_min_list))
    print('\nmean times of all:', np.mean(time_list))
| def test_iam_monoterpenoides(): | |||
| ds = {'name': 'monoterpenoides', | |||
| 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb | |||
| @@ -834,9 +958,10 @@ if __name__ == '__main__': | |||
| # tests on different numbers of median-sets. | |||
| # test_iam_median_nb() | |||
| # test_iam_letter_h() | |||
| test_iam_monoterpenoides() | |||
| # test_iam_monoterpenoides() | |||
| # test_iam_mutag() | |||
| # test_iam_fitdistance() | |||
| # print("test log") | |||
| test_iam_monoterpenoides_with_init40() | |||
| @@ -17,8 +17,10 @@ from pygraph.kernels.marginalizedKernel import marginalizedkernel | |||
| from pygraph.kernels.untilHPathKernel import untilhpathkernel | |||
| from pygraph.kernels.spKernel import spkernel | |||
| import functools | |||
| from pygraph.utils.kernels import deltakernel, gaussiankernel, kernelproduct | |||
| from pygraph.utils.kernels import deltakernel, gaussiankernel, kernelproduct, polynomialkernel | |||
| from pygraph.kernels.structuralspKernel import structuralspkernel | |||
| from pygraph.kernels.treeletKernel import treeletkernel | |||
| from pygraph.kernels.weisfeilerLehmanKernel import weisfeilerlehmankernel | |||
| def remove_edges(Gn): | |||
| @@ -46,18 +48,29 @@ def compute_kernel(Gn, graph_kernel, node_label, edge_label, verbose): | |||
| n_jobs=multiprocessing.cpu_count(), verbose=verbose) | |||
| elif graph_kernel == 'untilhpathkernel': | |||
| Kmatrix, _ = untilhpathkernel(Gn, node_label=node_label, edge_label=edge_label, | |||
| depth=10, k_func='MinMax', compute_method='trie', | |||
| depth=7, k_func='MinMax', compute_method='trie', | |||
| n_jobs=multiprocessing.cpu_count(), verbose=verbose) | |||
| elif graph_kernel == 'spkernel': | |||
| mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) | |||
| Kmatrix, _, _ = spkernel(Gn, node_label='atom', node_kernels= | |||
| Kmatrix, _, _ = spkernel(Gn, node_label=node_label, node_kernels= | |||
| {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}, | |||
| n_jobs=multiprocessing.cpu_count(), verbose=verbose) | |||
| elif graph_kernel == 'structuralspkernel': | |||
| mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) | |||
| Kmatrix, _ = structuralspkernel(Gn, node_label='atom', node_kernels= | |||
| Kmatrix, _ = structuralspkernel(Gn, node_label=node_label, node_kernels= | |||
| {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}, | |||
| n_jobs=multiprocessing.cpu_count(), verbose=verbose) | |||
| elif graph_kernel == 'treeletkernel': | |||
| # pkernel = functools.partial(polynomialkernel, d=2, c=1e5) | |||
| pkernel = functools.partial(gaussiankernel, gamma=1e-6) | |||
| mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) | |||
| Kmatrix, _ = treeletkernel(Gn, node_label=node_label, edge_label=edge_label, | |||
| sub_kernel=pkernel, | |||
| n_jobs=multiprocessing.cpu_count(), verbose=verbose) | |||
| elif graph_kernel == 'weisfeilerlehmankernel': | |||
| Kmatrix, _ = weisfeilerlehmankernel(Gn, node_label=node_label, edge_label=edge_label, | |||
| height=4, base_kernel='subtree', | |||
| n_jobs=multiprocessing.cpu_count(), verbose=verbose) | |||
| # normalization | |||
| Kmatrix_diag = Kmatrix.diagonal().copy() | |||
| @@ -79,7 +92,7 @@ def gram2distances(Kmatrix): | |||
| def kernel_distance_matrix(Gn, node_label, edge_label, Kmatrix=None, gkernel=None): | |||
| dis_mat = np.empty((len(Gn), len(Gn))) | |||
| if Kmatrix == None: | |||
| if Kmatrix is None: | |||
| Kmatrix = compute_kernel(Gn, gkernel, node_label, edge_label, True) | |||
| for i in range(len(Gn)): | |||
| for j in range(i, len(Gn)): | |||
| @@ -109,6 +122,21 @@ def get_same_item_indices(ls): | |||
| return idx_dict | |||
def k_nearest_neighbors_to_median_in_kernel_space(Gn, Kmatrix=None, gkernel=None,
                                                  node_label=None, edge_label=None):
    """Compute the distance between the median g_star of ``Gn`` and each graph.

    Every graph of ``Gn`` contributes to the median with the same weight
    ``1 / len(Gn)``. Distances are obtained from ``dis_gstar``, which is
    handed the graph index, the indices and weights of the median set, the
    Gram matrix and the precomputed double sum ``term3``.

    NOTE(review): the name suggests a k-nearest-neighbour selection, but no
    ``k`` is given and no selection was implemented; the function currently
    returns the distance of every graph so callers can rank them. Confirm the
    intended final behaviour.

    Parameters
    ----------
    Gn : list
        Graphs forming the median set.
    Kmatrix : array-like, optional
        Gram matrix indexed like ``Gn``. When ``None`` it is computed with
        ``compute_kernel`` from ``gkernel``, ``node_label`` and ``edge_label``.
    gkernel, node_label, edge_label : optional
        Forwarded to ``compute_kernel``; only used when ``Kmatrix`` is None.

    Returns
    -------
    list
        ``dis_k_all[i]`` is the value ``dis_gstar`` returns for graph ``i``;
        an empty list when ``Gn`` is empty.
    """
    nb_graphs = len(Gn)
    if nb_graphs == 0:
        # Nothing to measure; also avoids dividing by zero for the weights.
        return []

    # The median set is the whole of Gn, so graph i sits at row/column i.
    idx_gi = list(range(nb_graphs))
    alpha = [1 / nb_graphs] * nb_graphs
    if Kmatrix is None:
        Kmatrix = compute_kernel(Gn, gkernel, node_label, edge_label, True)

    # sum_{i1, i2} alpha_i1 * alpha_i2 * k(g_i1, g_i2): identical for every
    # graph, so it is computed once and passed to dis_gstar.
    term3 = 0
    for i1, a1 in enumerate(alpha):
        for i2, a2 in enumerate(alpha):
            term3 += a1 * a2 * Kmatrix[idx_gi[i1], idx_gi[i2]]

    dis_k_all = []  # distance between g_star and each graph.
    for ig in tqdm(idx_gi, desc='computing distances', file=sys.stdout):
        dis_k_all.append(dis_gstar(ig, idx_gi, alpha, Kmatrix, term3=term3))
    return dis_k_all
| def normalize_distance_matrix(D): | |||
| max_value = np.amax(D) | |||
| min_value = np.amin(D) | |||
| @@ -124,21 +124,21 @@ def saveGXL(graph, filename, method='benoit'): | |||
| # reference: https://github.com/dbblumenthal/gedlib/blob/master/data/generate_molecules.py#L22 | |||
| # pass | |||
| gxl_file = open(filename, 'w') | |||
| gxl_file.write("<?xml version=\"1.0\"?>\n") | |||
| gxl_file.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n") | |||
| gxl_file.write("<!DOCTYPE gxl SYSTEM \"http://www.gupro.de/GXL/gxl-1.0.dtd\">\n") | |||
| gxl_file.write("<gxl>\n") | |||
| gxl_file.write("<gxl xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n") | |||
| gxl_file.write("<graph id=\"" + str(graph.graph['name']) + "\" edgeids=\"true\" edgemode=\"undirected\">\n") | |||
| for v, attrs in graph.nodes(data=True): | |||
| gxl_file.write("<node id=\"_" + str(v) + "\">") | |||
| gxl_file.write("<attr name=\"" + "chem" + "\"><int>" + str(attrs['atom']) + "</int></attr>") | |||
| gxl_file.write("<attr name=\"" + "chem" + "\"><int>" + str(attrs['chem']) + "</int></attr>") | |||
| gxl_file.write("</node>\n") | |||
| for v1, v2, attrs in graph.edges(data=True): | |||
| gxl_file.write("<edge from=\"_" + str(v1) + "\" to=\"_" + str(v2) + "\">") | |||
| # gxl_file.write("<attr name=\"valence\"><int>" + str(attrs['bond_type']) + "</int></attr>") | |||
| gxl_file.write("<attr name=\"valence\"><int>" + "1" + "</int></attr>") | |||
| gxl_file.write("<attr name=\"valence\"><int>" + str(attrs['valence']) + "</int></attr>") | |||
| # gxl_file.write("<attr name=\"valence\"><int>" + "1" + "</int></attr>") | |||
| gxl_file.write("</edge>\n") | |||
| gxl_file.write("</graph>\n") | |||
| gxl_file.write("</gxl>\n") | |||
| gxl_file.write("</gxl>") | |||
| gxl_file.close() | |||
| elif method == 'gedlib-letter': | |||
| # reference: https://github.com/dbblumenthal/gedlib/blob/master/data/generate_molecules.py#L22 | |||
| @@ -147,15 +147,15 @@ def saveGXL(graph, filename, method='benoit'): | |||
| gxl_file.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n") | |||
| gxl_file.write("<!DOCTYPE gxl SYSTEM \"http://www.gupro.de/GXL/gxl-1.0.dtd\">\n") | |||
| gxl_file.write("<gxl xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n") | |||
| gxl_file.write("<graph id=\"" + str(graph.graph['name']) + "\" edgeids=\"false\" edgemode=\"undirected\">") | |||
| gxl_file.write("<graph id=\"" + str(graph.graph['name']) + "\" edgeids=\"false\" edgemode=\"undirected\">\n") | |||
| for v, attrs in graph.nodes(data=True): | |||
| gxl_file.write("<node id=\"_" + str(v) + "\">") | |||
| gxl_file.write("<attr name=\"x\"><float>" + str(attrs['attributes'][0]) + "</float></attr>") | |||
| gxl_file.write("<attr name=\"y\"><float>" + str(attrs['attributes'][1]) + "</float></attr>") | |||
| gxl_file.write("</node>") | |||
| gxl_file.write("</node>\n") | |||
| for v1, v2, attrs in graph.edges(data=True): | |||
| gxl_file.write("<edge from=\"_" + str(v1) + "\" to=\"_" + str(v2) + "\"/>") | |||
| gxl_file.write("</graph>") | |||
| gxl_file.write("<edge from=\"_" + str(v1) + "\" to=\"_" + str(v2) + "\"/>\n") | |||
| gxl_file.write("</graph>\n") | |||
| gxl_file.write("</gxl>") | |||
| gxl_file.close() | |||
| @@ -466,12 +466,15 @@ def loadDataset(filename, filename_y=None, extra_params=None): | |||
| def loadFromXML(filename, extra_params): | |||
| import xml.etree.ElementTree as ET | |||
| dirname_dataset = dirname(filename) | |||
| if extra_params: | |||
| dirname_dataset = extra_params | |||
| else: | |||
| dirname_dataset = dirname(filename) | |||
| tree = ET.parse(filename) | |||
| root = tree.getroot() | |||
| data = [] | |||
| y = [] | |||
| for graph in root.iter('print'): | |||
| for graph in root.iter('graph'): | |||
| mol_filename = graph.attrib['file'] | |||
| mol_class = graph.attrib['class'] | |||
| data.append(loadGXL(dirname_dataset + '/' + mol_filename)) | |||
| @@ -541,15 +544,22 @@ def saveDataset(Gn, y, gformat='gxl', group=None, filename='gfile', xparams=None | |||
| dirname_ds += '/' | |||
| if not os.path.exists(dirname_ds) : | |||
| os.makedirs(dirname_ds) | |||
| if 'graph_dir' in xparams: | |||
| graph_dir = xparams['graph_dir'] + '/' | |||
| if not os.path.exists(graph_dir): | |||
| os.makedirs(graph_dir) | |||
| else: | |||
| graph_dir = dirname_ds | |||
| if group == 'xml' and gformat == 'gxl': | |||
| with open(filename + '.xml', 'w') as fgroup: | |||
| fgroup.write("<?xml version=\"1.0\"?>") | |||
| fgroup.write("\n<!DOCTYPE GraphCollection SYSTEM \"https://dbblumenthal.github.io/gedlib/GraphCollection_8dtd_source.html\">") | |||
| fgroup.write("\n<!DOCTYPE GraphCollection SYSTEM \"http://www.inf.unibz.it/~blumenthal/dtd/GraphCollection.dtd\">") | |||
| fgroup.write("\n<GraphCollection>") | |||
| for idx, g in enumerate(Gn): | |||
| fname_tmp = "graph" + str(idx) + ".gxl" | |||
| saveGXL(g, dirname_ds + fname_tmp, method=xparams['method']) | |||
| saveGXL(g, graph_dir + fname_tmp, method=xparams['method']) | |||
| fgroup.write("\n\t<graph file=\"" + fname_tmp + "\" class=\"" + str(y[idx]) + "\"/>") | |||
| fgroup.write("\n</GraphCollection>") | |||
| fgroup.close() | |||
| @@ -558,18 +568,18 @@ def saveDataset(Gn, y, gformat='gxl', group=None, filename='gfile', xparams=None | |||
| if __name__ == '__main__': | |||
| # ### Load dataset from .ds file. | |||
| # # .ct files. | |||
| ds = {'name': 'Alkane', 'dataset': '../../datasets/Alkane/dataset.ds', | |||
| 'dataset_y': '../../datasets/Alkane/dataset_boiling_point_names.txt'} | |||
| Gn, y = loadDataset(ds['dataset'], filename_y=ds['dataset_y']) | |||
| # ds = {'name': 'Acyclic', 'dataset': '../../datasets/acyclic/dataset_bps.ds'} # node symb | |||
| # Gn, y = loadDataset(ds['dataset']) | |||
| # ds = {'name': 'MAO', 'dataset': '../../datasets/MAO/dataset.ds'} # node/edge symb | |||
| # Gn, y = loadDataset(ds['dataset']) | |||
| # ds = {'name': 'PAH', 'dataset': '../../datasets/PAH/dataset.ds'} # unlabeled | |||
| # Gn, y = loadDataset(ds['dataset']) | |||
| print(Gn[1].nodes(data=True)) | |||
| print(Gn[1].edges(data=True)) | |||
| print(y[1]) | |||
| # ds = {'name': 'Alkane', 'dataset': '../../datasets/Alkane/dataset.ds', | |||
| # 'dataset_y': '../../datasets/Alkane/dataset_boiling_point_names.txt'} | |||
| # Gn, y = loadDataset(ds['dataset'], filename_y=ds['dataset_y']) | |||
| ## ds = {'name': 'Acyclic', 'dataset': '../../datasets/acyclic/dataset_bps.ds'} # node symb | |||
| ## Gn, y = loadDataset(ds['dataset']) | |||
| ## ds = {'name': 'MAO', 'dataset': '../../datasets/MAO/dataset.ds'} # node/edge symb | |||
| ## Gn, y = loadDataset(ds['dataset']) | |||
| ## ds = {'name': 'PAH', 'dataset': '../../datasets/PAH/dataset.ds'} # unlabeled | |||
| ## Gn, y = loadDataset(ds['dataset']) | |||
| # print(Gn[1].nodes(data=True)) | |||
| # print(Gn[1].edges(data=True)) | |||
| # print(y[1]) | |||
| # # .gxl file. | |||
| # ds = {'name': 'monoterpenoides', | |||
| @@ -579,6 +589,33 @@ if __name__ == '__main__': | |||
| # print(Gn[1].edges(data=True)) | |||
| # print(y[1]) | |||
| ### Convert graph from one format to another. | |||
| # .gxl file. | |||
| import networkx as nx | |||
| ds = {'name': 'monoterpenoides', | |||
| 'dataset': '../../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb | |||
| Gn, y = loadDataset(ds['dataset']) | |||
| y = [int(i) for i in y] | |||
| print(Gn[1].nodes(data=True)) | |||
| print(Gn[1].edges(data=True)) | |||
| print(y[1]) | |||
| # Convert a graph to the proper NetworkX format that can be recognized by library gedlib. | |||
| Gn_new = [] | |||
| for G in Gn: | |||
| G_new = nx.Graph() | |||
| for nd, attrs in G.nodes(data=True): | |||
| G_new.add_node(str(nd), chem=attrs['atom']) | |||
| for nd1, nd2, attrs in G.edges(data=True): | |||
| G_new.add_edge(str(nd1), str(nd2), valence=attrs['bond_type']) | |||
| # G_new.add_edge(str(nd1), str(nd2)) | |||
| Gn_new.append(G_new) | |||
| print(Gn_new[1].nodes(data=True)) | |||
| print(Gn_new[1].edges(data=True)) | |||
| print(Gn_new[1]) | |||
| filename = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/generated_datsets/monoterpenoides/gxl/monoterpenoides' | |||
| xparams = {'method': 'gedlib'} | |||
| saveDataset(Gn, y, gformat='gxl', group='xml', filename=filename, xparams=xparams) | |||
| # ds = {'name': 'MUTAG', 'dataset': '../../datasets/MUTAG/MUTAG.mat', | |||
| # 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}} # node/edge symb | |||
| # Gn, y = loadDataset(ds['dataset'], extra_params=ds['extra_params']) | |||