| @@ -74,6 +74,7 @@ for ds in dslist: | |||
| extra_params=(ds['extra_params'] if 'extra_params' in ds else None), | |||
| ds_name=ds['name'], | |||
| n_jobs=multiprocessing.cpu_count(), | |||
| # n_jobs=7, | |||
| read_gm_from_file=False, | |||
| verbose=True) | |||
| print() | |||
| @@ -18,31 +18,44 @@ from scipy import optimize | |||
| import cvxpy as cp | |||
| import sys | |||
| sys.path.insert(0, "../") | |||
| #sys.path.insert(0, "../") | |||
| from ged import GED, get_nb_edit_operations | |||
| from utils import kernel_distance_matrix | |||
| def fit_GED_to_kernel_distance(Gn, gkernel, itr_max): | |||
| def fit_GED_to_kernel_distance(Gn, node_label, edge_label, gkernel, itr_max, | |||
| fitkernel=None, gamma=1.0): | |||
| # c_vi, c_vr, c_vs, c_ei, c_er, c_es or parts of them. | |||
| random.seed(1) | |||
| cost_rdm = random.sample(range(1, 10), 5) | |||
| edit_costs = cost_rdm + [0] | |||
| # random.seed(1) | |||
| cost_rdm = random.sample(range(1, 10), 6) | |||
| # edit_costs = cost_rdm + [0] | |||
| edit_costs = cost_rdm | |||
| # edit_costs = [i * 0.01 for i in cost_rdm] + [0] | |||
| # edit_costs = [0.2, 0.2, 0.2, 0.2, 0.2, 0] | |||
| # edit_costs = [0, 0, 0.9544, 0.026, 0.0196, 0] | |||
| # edit_costs = [0.008429912251810438, 0.025461055985319694, 0.2047320869225948, 0.004148727085832133, 0.0, 0] | |||
| idx_nonzeros = [i for i, item in enumerate(edit_costs) if item != 0] | |||
| idx_cost_nonzeros = [i for i, item in enumerate(edit_costs) if item != 0] | |||
| # compute distances in feature space. | |||
| dis_k_mat, _, _, _ = kernel_distance_matrix(Gn, gkernel=gkernel) | |||
| coef_dk = 1 | |||
| dis_k_mat, _, _, _ = kernel_distance_matrix(Gn, node_label, edge_label, gkernel=gkernel) | |||
| dis_k_vec = [] | |||
| for i in range(len(dis_k_mat)): | |||
| for j in range(i, len(dis_k_mat)): | |||
| dis_k_vec.append(dis_k_mat[i, j]) | |||
| dis_k_vec = np.array(dis_k_vec) | |||
if fitkernel is None:
| dis_k_vec_ajusted = dis_k_vec | |||
| elif fitkernel == 'gaussian': | |||
| coef_dk = 1 / np.max(dis_k_vec) | |||
| idx_dk_nonzeros = np.where(dis_k_vec != 0)[0] | |||
# remove 0's and constrain d_k between 0 and 1.
| dis_k_vec = dis_k_vec[idx_dk_nonzeros] * coef_dk | |||
| dis_k_vec_ajusted = np.sqrt(-np.log(dis_k_vec) / gamma) | |||
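# Editor's note: this transform inverts the Gaussian kernel: with
# d' = sqrt(-log(coef_dk * d_k) / gamma) we get exp(-gamma * d'**2) = coef_dk * d_k,
# so the least-squares step below fits GEDs to d' rather than to d_k directly.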
| residual_list = [] | |||
| edit_cost_list = [] | |||
| time_list = [] | |||
| nb_cost_mat_list = [] | |||
| for itr in range(itr_max): | |||
| print('\niteration', itr) | |||
| @@ -52,15 +65,23 @@ def fit_GED_to_kernel_distance(Gn, gkernel, itr_max): | |||
| edit_cost_list.append(edit_cost_constant) | |||
| ged_all, ged_mat, n_edit_operations = compute_geds(Gn, edit_cost_constant, | |||
| idx_nonzeros, parallel=True) | |||
| idx_cost_nonzeros, parallel=True) | |||
| residual = np.sqrt(np.sum(np.square(np.array(ged_all) - dis_k_vec))) | |||
if fitkernel is None:
| residual = np.sqrt(np.sum(np.square(np.array(ged_all) - dis_k_vec))) | |||
| elif fitkernel == 'gaussian': | |||
| ged_all = np.array(ged_all)[idx_dk_nonzeros] | |||
| residual = np.sqrt(np.sum(np.square( | |||
| np.exp(-gamma * ged_all ** 2) / coef_dk - dis_k_vec))) | |||
| residual_list.append(residual) | |||
| # "fit" geds to distances in feature space by tuning edit costs using the | |||
| # Least Squares Method. | |||
| nb_cost_mat = np.array(n_edit_operations).T | |||
| edit_costs_new, residual = compute_better_costs(nb_cost_mat, dis_k_vec) | |||
| if fitkernel == 'gaussian': | |||
| nb_cost_mat = nb_cost_mat[idx_dk_nonzeros] | |||
| nb_cost_mat_list.append(nb_cost_mat) | |||
| edit_costs_new, residual = compute_better_costs(nb_cost_mat, dis_k_vec_ajusted) | |||
| print('pseudo residual:', residual) | |||
| for i in range(len(edit_costs_new)): | |||
| @@ -70,7 +91,7 @@ def fit_GED_to_kernel_distance(Gn, gkernel, itr_max): | |||
| else: | |||
| raise ValueError('The edit cost is negative.') | |||
| for idx, item in enumerate(idx_nonzeros): | |||
| for idx, item in enumerate(idx_cost_nonzeros): | |||
| edit_costs[item] = edit_costs_new[idx] | |||
| time_list.append(time.time() - time0) | |||
| @@ -78,14 +99,21 @@ def fit_GED_to_kernel_distance(Gn, gkernel, itr_max): | |||
| print('edit_costs:', edit_costs) | |||
| print('residual_list:', residual_list) | |||
| print() | |||
| edit_cost_list.append(edit_costs) | |||
| ged_all, ged_mat, n_edit_operations = compute_geds(Gn, edit_costs, | |||
| idx_nonzeros, parallel=True) | |||
| residual = np.sqrt(np.sum(np.square(np.array(ged_all) - dis_k_vec))) | |||
| idx_cost_nonzeros, parallel=True) | |||
if fitkernel is None:
| residual = np.sqrt(np.sum(np.square(np.array(ged_all) - dis_k_vec))) | |||
| elif fitkernel == 'gaussian': | |||
| ged_all = np.array(ged_all)[idx_dk_nonzeros] | |||
| residual = np.sqrt(np.sum(np.square( | |||
| np.exp(-gamma * ged_all ** 2) / coef_dk - dis_k_vec))) | |||
| residual_list.append(residual) | |||
| nb_cost_mat_list.append(np.array(n_edit_operations).T) | |||
| return edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list | |||
| return edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, \ | |||
| time_list, nb_cost_mat_list, coef_dk | |||
| def compute_geds(Gn, edit_cost_constant, idx_nonzeros, parallel=False): | |||
| @@ -193,7 +221,10 @@ def compute_better_costs(nb_cost_mat, dis_k_vec): | |||
| # h = np.array([0 for i in range(nb_cost_mat.shape[1])]) | |||
| x = cp.Variable(nb_cost_mat.shape[1]) | |||
| cost = cp.sum_squares(nb_cost_mat * x - dis_k_vec) | |||
| constraints = [x >= [0 for i in range(nb_cost_mat.shape[1])]] | |||
| constraints = [x >= [0.01 for i in range(nb_cost_mat.shape[1])], | |||
| # np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0] | |||
| np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T@x >= 0.0, | |||
| np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0] | |||
| prob = cp.Problem(cp.Minimize(cost), constraints) | |||
| prob.solve() | |||
| edit_costs_new = x.value | |||
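# Editor's sketch: a minimal, self-contained version of the constrained
# least-squares step solved in compute_better_costs above, using a random
# (hypothetical) 6-column matrix of edit-operation counts in the order
# (c_vi, c_vr, c_vs, c_ei, c_er, c_es). Assumes cvxpy >= 1.0 (`@` operator).
import numpy as np
import cvxpy as cp

nb_cost_mat = np.random.rand(100, 6)  # one row of operation counts per graph pair
dis_k_vec = np.random.rand(100)       # target distances in kernel feature space
x = cp.Variable(6)
cost = cp.sum_squares(nb_cost_mat @ x - dis_k_vec)
constraints = [x >= 0.01,                  # keep all edit costs strictly positive
               x[0] + x[1] - x[2] >= 0.0,  # c_vs <= c_vi + c_vr
               x[3] + x[4] - x[5] >= 0.0]  # c_es <= c_ei + c_er
prob = cp.Problem(cp.Minimize(cost), constraints)
prob.solve()
print(prob.status, x.value)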
| @@ -9,11 +9,14 @@ import numpy as np | |||
| import networkx as nx | |||
| from tqdm import tqdm | |||
| import sys | |||
| import multiprocessing | |||
| from multiprocessing import Pool | |||
| from functools import partial | |||
| from gedlibpy import librariesImport, gedlibpy | |||
| def GED(g1, g2, lib='gedlibpy', cost='CHEM_1', method='IPFP', | |||
| edit_cost_constant=[], saveGXL='benoit', stabilizer='min', repeat=50): | |||
| edit_cost_constant=[], stabilizer='min', repeat=50): | |||
| """ | |||
Compute the GED between two graphs.
| """ | |||
| @@ -25,9 +28,11 @@ def GED(g1, g2, lib='gedlibpy', cost='CHEM_1', method='IPFP', | |||
| G_new = nx.Graph() | |||
| for nd, attrs in G.nodes(data=True): | |||
| G_new.add_node(str(nd), chem=attrs['atom']) | |||
| # G_new.add_node(str(nd), x=str(attrs['attributes'][0]), | |||
| # y=str(attrs['attributes'][1])) | |||
| for nd1, nd2, attrs in G.edges(data=True): | |||
| # G_new.add_edge(str(nd1), str(nd2), valence=attrs['bond_type']) | |||
| G_new.add_edge(str(nd1), str(nd2)) | |||
| G_new.add_edge(str(nd1), str(nd2), valence=attrs['bond_type']) | |||
| # G_new.add_edge(str(nd1), str(nd2)) | |||
| return G_new | |||
| @@ -49,6 +54,32 @@ def GED(g1, g2, lib='gedlibpy', cost='CHEM_1', method='IPFP', | |||
| pi_backward = gedlibpy.get_backward_map(g, h) | |||
| upper = gedlibpy.get_upper_bound(g, h) | |||
| lower = gedlibpy.get_lower_bound(g, h) | |||
| elif stabilizer == 'mean': | |||
| # @todo: to be finished... | |||
| upper_list = [np.inf] * repeat | |||
| for itr in range(repeat): | |||
| gedlibpy.run_method(g, h) | |||
| upper_list[itr] = gedlibpy.get_upper_bound(g, h) | |||
| pi_forward = gedlibpy.get_forward_map(g, h) | |||
| pi_backward = gedlibpy.get_backward_map(g, h) | |||
| lower = gedlibpy.get_lower_bound(g, h) | |||
| upper = np.mean(upper_list) | |||
| elif stabilizer == 'median': | |||
| if repeat % 2 == 0: | |||
| repeat += 1 | |||
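# Editor's note: forcing an odd repeat count guarantees that np.median returns
# a value actually attained in upper_list, so the index lookup below can
# recover the matching forward/backward maps.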
| upper_list = [np.inf] * repeat | |||
| pi_forward_list = [0] * repeat | |||
| pi_backward_list = [0] * repeat | |||
| for itr in range(repeat): | |||
| gedlibpy.run_method(g, h) | |||
| upper_list[itr] = gedlibpy.get_upper_bound(g, h) | |||
| pi_forward_list[itr] = gedlibpy.get_forward_map(g, h) | |||
| pi_backward_list[itr] = gedlibpy.get_backward_map(g, h) | |||
| lower = gedlibpy.get_lower_bound(g, h) | |||
| upper = np.median(upper_list) | |||
| idx_median = upper_list.index(upper) | |||
| pi_forward = pi_forward_list[idx_median] | |||
| pi_backward = pi_backward_list[idx_median] | |||
| elif stabilizer == 'min': | |||
| upper = np.inf | |||
| for itr in range(repeat): | |||
| @@ -61,6 +92,18 @@ def GED(g1, g2, lib='gedlibpy', cost='CHEM_1', method='IPFP', | |||
| lower = gedlibpy.get_lower_bound(g, h) | |||
| if upper == 0: | |||
| break | |||
| elif stabilizer == 'max': | |||
| upper = 0 | |||
| for itr in range(repeat): | |||
| gedlibpy.run_method(g, h) | |||
| upper_tmp = gedlibpy.get_upper_bound(g, h) | |||
| if upper_tmp > upper: | |||
| upper = upper_tmp | |||
| pi_forward = gedlibpy.get_forward_map(g, h) | |||
| pi_backward = gedlibpy.get_backward_map(g, h) | |||
| lower = gedlibpy.get_lower_bound(g, h) | |||
| elif stabilizer == 'gaussian': | |||
| pass | |||
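# @todo: the 'gaussian' stabilizer is a placeholder; as written it leaves
# upper/lower and the maps undefined.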
| dis = upper | |||
| @@ -138,23 +181,69 @@ def GED_n(Gn, lib='gedlibpy', cost='CHEM_1', method='IPFP', | |||
| return dis, pi_forward, pi_backward | |||
| def ged_median(Gn, Gn_median, measure='ged', verbose=False, | |||
| ged_cost='CHEM_1', ged_method='IPFP', saveGXL='benoit'): | |||
| dis_list = [] | |||
| pi_forward_list = [] | |||
| for idx, G in tqdm(enumerate(Gn), desc='computing median distances', | |||
| file=sys.stdout) if verbose else enumerate(Gn): | |||
| dis_sum = 0 | |||
| pi_forward_list.append([]) | |||
| for G_p in Gn_median: | |||
| dis_tmp, pi_tmp_forward, pi_tmp_backward = GED(G, G_p, | |||
| cost=ged_cost, method=ged_method, saveGXL=saveGXL) | |||
| pi_forward_list[idx].append(pi_tmp_forward) | |||
| dis_sum += dis_tmp | |||
| dis_list.append(dis_sum) | |||
| def ged_median(Gn, Gn_median, verbose=False, params_ged={'lib': 'gedlibpy', | |||
| 'cost': 'CHEM_1', 'method': 'IPFP', 'edit_cost_constant': [], | |||
| 'stabilizer': 'min', 'repeat': 50}, parallel=False): | |||
| if parallel: | |||
| len_itr = int(len(Gn)) | |||
| pi_forward_list = [[] for i in range(len_itr)] | |||
| dis_list = [0 for i in range(len_itr)] | |||
| itr = range(0, len_itr) | |||
| n_jobs = multiprocessing.cpu_count() | |||
| if len_itr < 100 * n_jobs: | |||
| chunksize = int(len_itr / n_jobs) + 1 | |||
| else: | |||
| chunksize = 100 | |||
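# Editor's note: for small jobs this gives roughly one chunk per worker, while
# large jobs use fixed chunks of 100 to balance load against scheduling overhead.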
| def init_worker(gn_toshare, gn_median_toshare): | |||
| global G_gn, G_gn_median | |||
| G_gn = gn_toshare | |||
| G_gn_median = gn_median_toshare | |||
| do_partial = partial(_compute_ged_median, params_ged) | |||
| pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(Gn, Gn_median)) | |||
| if verbose: | |||
| iterator = tqdm(pool.imap_unordered(do_partial, itr, chunksize), | |||
| desc='computing GEDs', file=sys.stdout) | |||
| else: | |||
| iterator = pool.imap_unordered(do_partial, itr, chunksize) | |||
| for i, dis_sum, pi_forward in iterator: | |||
| pi_forward_list[i] = pi_forward | |||
| dis_list[i] = dis_sum | |||
| # print('\n-------------------------------------------') | |||
| # print(i, j, idx_itr, dis) | |||
| pool.close() | |||
| pool.join() | |||
| else: | |||
| dis_list = [] | |||
| pi_forward_list = [] | |||
| for idx, G in tqdm(enumerate(Gn), desc='computing median distances', | |||
| file=sys.stdout) if verbose else enumerate(Gn): | |||
| dis_sum = 0 | |||
| pi_forward_list.append([]) | |||
| for G_p in Gn_median: | |||
| dis_tmp, pi_tmp_forward, pi_tmp_backward = GED(G, G_p, | |||
| **params_ged) | |||
| pi_forward_list[idx].append(pi_tmp_forward) | |||
| dis_sum += dis_tmp | |||
| dis_list.append(dis_sum) | |||
| return dis_list, pi_forward_list | |||
| def _compute_ged_median(params_ged, itr): | |||
| # print(itr) | |||
| dis_sum = 0 | |||
| pi_forward = [] | |||
| for G_p in G_gn_median: | |||
| dis_tmp, pi_tmp_forward, pi_tmp_backward = GED(G_gn[itr], G_p, | |||
| **params_ged) | |||
| pi_forward.append(pi_tmp_forward) | |||
| dis_sum += dis_tmp | |||
| return itr, dis_sum, pi_forward | |||
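# Editor's sketch: a minimal, self-contained example of the initializer /
# module-global pattern used by ged_median above to share read-only data
# with worker processes (all names here are hypothetical).
import multiprocessing
from multiprocessing import Pool
from functools import partial

def _init_worker_demo(data):
    global G_data          # set once per worker; read-only afterwards
    G_data = data

def _task_demo(offset, i):
    return i, G_data[i] ** 2 + offset

if __name__ == '__main__':
    data = list(range(10))
    with Pool(processes=2, initializer=_init_worker_demo,
              initargs=(data,)) as pool:
        for i, val in pool.imap_unordered(partial(_task_demo, 0.5), range(10)):
            print(i, val)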
| def get_nb_edit_operations(g1, g2, forward_map, backward_map): | |||
| """Compute the number of each edit operations. | |||
| """ | |||
| @@ -22,20 +22,22 @@ def iam_upgraded(Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50, | |||
| epsilon=0.001, node_label='atom', edge_label='bond_type', | |||
| connected=False, removeNodes=True, allBestInit=False, allBestNodes=False, | |||
| allBestEdges=False, allBestOutput=False, | |||
| params_ged={'ged_cost': 'CHEM_1', 'ged_method': 'IPFP', 'saveGXL': 'benoit'}): | |||
| params_ged={'lib': 'gedlibpy', 'cost': 'CHEM_1', 'method': 'IPFP', | |||
| 'edit_cost_constant': [], 'stabilizer': 'min', 'repeat': 50}): | |||
| """See my name, then you know what I do. | |||
| """ | |||
| # Gn_median = Gn_median[0:10] | |||
| # Gn_median = [nx.convert_node_labels_to_integers(g) for g in Gn_median] | |||
| if removeNodes: | |||
node_ir = np.inf # corresponds to node removal and insertion.
label_r = 'thanksdanny' # the label for node removal. # @todo: make this label unrepeatable.
node_ir = np.inf # corresponds to node removal and insertion.
label_r = 'thanksdanny' # the label for node removal. # @todo: make this label unrepeatable.
| ds_attrs = get_dataset_attributes(Gn_median + Gn_candidate, | |||
| attr_names=['edge_labeled', 'node_attr_dim', 'edge_attr_dim'], | |||
| edge_label=edge_label) | |||
| node_label_set = get_node_labels(Gn_median, node_label) | |||
| edge_label_set = get_edge_labels(Gn_median, edge_label) | |||
| def generate_graph(G, pi_p_forward, label_set): | |||
| def generate_graph(G, pi_p_forward): | |||
| G_new_list = [G.copy()] # all "best" graphs generated in this iteration. | |||
| # nx.draw_networkx(G) | |||
| # import matplotlib.pyplot as plt | |||
| @@ -52,7 +54,7 @@ def iam_upgraded(Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50, | |||
| for ndi, (nd, _) in enumerate(G.nodes(data=True)): | |||
| h_i0_list = [] | |||
| label_list = [] | |||
| for label in label_set: | |||
| for label in node_label_set: | |||
| h_i0 = 0 | |||
| for idx, g in enumerate(Gn_median): | |||
| pi_i = pi_p_forward[idx][ndi] | |||
| @@ -62,7 +64,7 @@ def iam_upgraded(Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50, | |||
| label_list.append(label) | |||
| # case when the node is to be removed. | |||
| if removeNodes: | |||
| h_i0_remove = 0 # @todo: maybe this can be added to the label_set above. | |||
| h_i0_remove = 0 # @todo: maybe this can be added to the node_label_set above. | |||
| for idx, g in enumerate(Gn_median): | |||
| pi_i = pi_p_forward[idx][ndi] | |||
| if pi_i == node_ir: | |||
| @@ -91,11 +93,10 @@ def iam_upgraded(Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50, | |||
| G_new_list = [ggg.copy() for ggg in G_new_list_nd] | |||
| else: | |||
| # choose one of the best randomly. | |||
| h_ij0_max = h_i0_list[idx_max[0]] | |||
| idx_rdm = random.randint(0, len(idx_max) - 1) | |||
| best_label = label_list[idx_max[idx_rdm]] | |||
| # check whether a_ij is 0 or 1. | |||
| h_i0_max = h_i0_list[idx_max[idx_rdm]] | |||
| g_new = G_new_list[0] | |||
| if best_label == label_r: | |||
| g_new.remove_node(nd) | |||
| @@ -134,8 +135,7 @@ def iam_upgraded(Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50, | |||
| # for nd1, nd2, _ in g_new.edges(data=True): | |||
| h_ij0_list = [] | |||
| label_list = [] | |||
| # @todo: compute edge label set before. | |||
| for label in get_edge_labels(Gn_median, edge_label): | |||
| for label in edge_label_set: | |||
| h_ij0 = 0 | |||
| for idx, g in enumerate(Gn_median): | |||
| pi_i = pi_p_forward[idx][nd1i] | |||
| @@ -176,9 +176,9 @@ def iam_upgraded(Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50, | |||
| G_new_list_ed.append(g_tmp_copy) | |||
| g_tmp_list = [ggg.copy() for ggg in G_new_list_ed] | |||
| else: # choose one of the best randomly. | |||
| h_ij0_max = h_ij0_list[idx_max[0]] | |||
| idx_rdm = random.randint(0, len(idx_max) - 1) | |||
| best_label = label_list[idx_max[idx_rdm]] | |||
| h_ij0_max = h_ij0_list[idx_max[idx_rdm]] | |||
| # check whether a_ij is 0 or 1. | |||
| sij_norm = 0 | |||
| @@ -192,6 +192,7 @@ def iam_upgraded(Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50, | |||
| g_new.add_edge(nd1, nd2) | |||
| g_new.edges[nd1, nd2][edge_label] = best_label | |||
| else: | |||
| # elif h_ij0_max < len(Gn_median) * c_er / c_es + sij_norm * (1 - (c_er + c_ei) / c_es): | |||
| if g_new.has_edge(nd1, nd2): | |||
| g_new.remove_edge(nd1, nd2) | |||
| g_tmp_list = [g_new] | |||
| @@ -221,8 +222,8 @@ def iam_upgraded(Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50, | |||
| if g_tmp.has_node(nd1) and g_tmp.has_node(nd2) \ | |||
| and not g_tmp.has_edge(nd1, nd2): | |||
| g_tmp.add_edge(nd1, nd2) | |||
| # else: # @todo: which to use? | |||
| elif sij_norm < len(Gn_median) * c_er / (c_er + c_ei): | |||
| else: # @todo: which to use? | |||
| # elif sij_norm < len(Gn_median) * c_er / (c_er + c_ei): | |||
| if g_tmp.has_edge(nd1, nd2): | |||
| g_tmp.remove_edge(nd1, nd2) | |||
| # do not change anything when equal. | |||
| @@ -238,7 +239,7 @@ def iam_upgraded(Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50, | |||
| # # find the best graph generated in this iteration and update pi_p. | |||
| # @todo: should we update all graphs generated or just the best ones? | |||
| dis_list, pi_forward_list = ged_median(G_new_list, Gn_median, | |||
| **params_ged) | |||
| params_ged=params_ged) | |||
| # @todo: should we remove the identical and connectivity check? | |||
| # Don't know which is faster. | |||
| if ds_attrs['node_attr_dim'] == 0 and ds_attrs['edge_attr_dim'] == 0: | |||
| @@ -283,15 +284,16 @@ def iam_upgraded(Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50, | |||
| # while itr < ite_max and (np.abs(old_sod - cur_sod) > epsilon or | |||
| # np.abs(old_sod - cur_sod) == 0): | |||
| while itr < ite_max and np.abs(old_sod - cur_sod) > epsilon: | |||
| # while itr < ite_max: | |||
| # for itr in range(0, 5): # the convergence condition? | |||
| print('itr_iam is', itr) | |||
| G_new_list = [] | |||
| pi_forward_new_list = [] | |||
| dis_new_list = [] | |||
| for idx, g in enumerate(G_list): | |||
| label_set = get_node_labels(Gn_median + [g], node_label) | |||
| # label_set = get_node_labels(Gn_median + [g], node_label) | |||
| G_tmp_list, pi_forward_tmp_list, dis_tmp_list = generate_graph( | |||
| g, pi_forward_list[idx], label_set) | |||
| g, pi_forward_list[idx]) | |||
| G_new_list += G_tmp_list | |||
| pi_forward_new_list += pi_forward_tmp_list | |||
| dis_new_list += dis_tmp_list | |||
| @@ -325,7 +327,7 @@ def iam_upgraded(Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50, | |||
| print('\nsods:', sod_list, '\n') | |||
| return G_list, pi_forward_list, dis_min | |||
| return G_list, pi_forward_list, dis_min, sod_list | |||
| def remove_duplicates(Gn): | |||
| @@ -363,7 +365,8 @@ def iam_upgraded(Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50, | |||
| # compute set-median. | |||
| dis_min = np.inf | |||
| dis_list, pi_forward_all = ged_median(Gn_candidate, Gn_median, | |||
| **params_ged) | |||
| params_ged=params_ged, parallel=True) | |||
print('finished computing GEDs.')
| # find all smallest distances. | |||
| if allBestInit: # try all best init graphs. | |||
| idx_min_list = range(len(dis_list)) | |||
| @@ -371,19 +374,26 @@ def iam_upgraded(Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50, | |||
| else: | |||
| idx_min_list = np.argwhere(dis_list == np.min(dis_list)).flatten().tolist() | |||
| dis_min = [dis_list[idx_min_list[0]]] * len(idx_min_list) | |||
| idx_min_rdm = random.randint(0, len(idx_min_list) - 1) | |||
| idx_min_list = [idx_min_list[idx_min_rdm]] | |||
| sod_set_median = np.min(dis_min) | |||
| # phase 2: iteration. | |||
| G_list = [] | |||
| dis_list = [] | |||
| pi_forward_list = [] | |||
| G_set_median_list = [] | |||
| # sod_list = [] | |||
| for idx_tmp, idx_min in enumerate(idx_min_list): | |||
| # print('idx_min is', idx_min) | |||
| G = Gn_candidate[idx_min].copy() | |||
| G_set_median_list.append(G.copy()) | |||
| # list of edit operations. | |||
| pi_p_forward = pi_forward_all[idx_min] | |||
| # pi_p_backward = pi_all_backward[idx_min] | |||
| Gi_list, pi_i_forward_list, dis_i_min = iteration_proc(G, pi_p_forward, dis_min[idx_tmp]) | |||
| Gi_list, pi_i_forward_list, dis_i_min, sod_list = iteration_proc(G, | |||
| pi_p_forward, dis_min[idx_tmp]) | |||
| G_list += Gi_list | |||
| dis_list += [dis_i_min] * len(Gi_list) | |||
| pi_forward_list += pi_i_forward_list | |||
| @@ -409,9 +419,9 @@ def iam_upgraded(Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50, | |||
| # print(g.edges(data=True)) | |||
| # get the best median graphs | |||
| G_min_list, pi_forward_min_list, dis_min = best_median_graphs( | |||
| G_gen_median_list, pi_forward_min_list, sod_gen_median = best_median_graphs( | |||
| G_list, pi_forward_list, dis_list) | |||
| # for g in G_min_list: | |||
| # for g in G_gen_median_list: | |||
| # nx.draw_networkx(g) | |||
| # plt.show() | |||
| # print(g.nodes(data=True)) | |||
| @@ -419,10 +429,10 @@ def iam_upgraded(Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50, | |||
| if not allBestOutput: | |||
| # randomly choose one graph. | |||
| idx_rdm = random.randint(0, len(G_min_list) - 1) | |||
| G_min_list = [G_min_list[idx_rdm]] | |||
| idx_rdm = random.randint(0, len(G_gen_median_list) - 1) | |||
| G_gen_median_list = [G_gen_median_list[idx_rdm]] | |||
| return G_min_list, dis_min | |||
| return G_gen_median_list, sod_gen_median, sod_list, G_set_median_list, sod_set_median | |||
| @@ -5,8 +5,8 @@ import numpy as np | |||
| import networkx as nx | |||
| import time | |||
| import librariesImport | |||
| import script | |||
| from gedlibpy import librariesImport, gedlibpy | |||
| #import script | |||
| sys.path.insert(0, "/home/bgauzere/dev/optim-graphes/") | |||
| import pygraph | |||
| from pygraph.utils.graphfiles import loadDataset | |||
| @@ -27,8 +27,9 @@ def preimage_iam(Gn_init, Gn_median, alpha, idx_gi, Kmatrix, k, r_max, | |||
| params_iam={'c_ei': 1, 'c_er': 1, 'c_es': 1, | |||
| 'ite_max': 50, 'epsilon': 0.001, | |||
| 'removeNodes': True, 'connected': False}, | |||
| params_ged={'ged_cost': 'CHEM_1', 'ged_method': 'IPFP', | |||
| 'saveGXL': 'benoit'}): | |||
| params_ged={'lib': 'gedlibpy', 'cost': 'CHEM_1', 'method': 'IPFP', | |||
| 'edit_cost_constant': [], 'stabilizer': 'min', | |||
| 'repeat': 50}): | |||
| """This function constructs graph pre-image by the iterative pre-image | |||
| framework in reference [1], algorithm 1, where the step of generating new | |||
| graphs randomly is replaced by the IAM algorithm in reference [2]. | |||
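# Editor's sketch: a high-level, hypothetical outline of the iterative
# pre-image framework described in the docstring above (reference [1],
# algorithm 1) with IAM as the generator; dis_fun and generate_with_iam
# stand in for the kernel-space distance and the iam_upgraded call.
def preimage_iam_sketch(Gn_init, dis_fun, generate_with_iam, k, r_max):
    Dk = sorted(Gn_init, key=dis_fun)[:k]      # k nearest graphs in kernel space
    ghat, dhat = Dk[0], dis_fun(Dk[0])
    for r in range(r_max):
        candidates = generate_with_iam(Dk)     # IAM replaces random generation
        better = [g for g in candidates if dis_fun(g) < dhat]
        if not better:
            break                              # no improvement: stop iterating
        ghat = min(better, key=dis_fun)
        dhat = dis_fun(ghat)
        Dk = sorted(Dk + candidates, key=dis_fun)[:k]
    return ghat, dhat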
| @@ -91,12 +92,12 @@ def preimage_iam(Gn_init, Gn_median, alpha, idx_gi, Kmatrix, k, r_max, | |||
| ghat_new_list = [] | |||
| for g_tmp in Gk: | |||
| Gn_nearest_init = [g_tmp.copy()] | |||
| ghat_new_list_tmp, _ = iam_upgraded(Gn_nearest_median, | |||
ghat_new_list_tmp, _, _, _, _ = iam_upgraded(Gn_nearest_median,
| Gn_nearest_init, params_ged=params_ged, **params_iam) | |||
| ghat_new_list += ghat_new_list_tmp | |||
| else: # only the best graph in D_k is used to initialize IAM. | |||
| Gn_nearest_init = [g.copy() for g in Gk] | |||
| ghat_new_list, _ = iam_upgraded(Gn_nearest_median, Gn_nearest_init, | |||
ghat_new_list, _, _, _, _ = iam_upgraded(Gn_nearest_median, Gn_nearest_init,
| params_ged=params_ged, **params_iam) | |||
| # for g in g_tmp_list: | |||
| @@ -181,8 +182,9 @@ def preimage_iam_random_mix(Gn_init, Gn_median, alpha, idx_gi, Kmatrix, k, r_max | |||
| params_iam={'c_ei': 1, 'c_er': 1, 'c_es': 1, | |||
| 'ite_max': 50, 'epsilon': 0.001, | |||
| 'removeNodes': True, 'connected': False}, | |||
| params_ged={'ged_cost': 'CHEM_1', 'ged_method': 'IPFP', | |||
| 'saveGXL': 'benoit'}): | |||
| params_ged={'lib': 'gedlibpy', 'cost': 'CHEM_1', | |||
| 'method': 'IPFP', 'edit_cost_constant': [], | |||
| 'stabilizer': 'min', 'repeat': 50}): | |||
| """This function constructs graph pre-image by the iterative pre-image | |||
| framework in reference [1], algorithm 1, where new graphs are generated | |||
| randomly and by the IAM algorithm in reference [2]. | |||
| @@ -7,7 +7,10 @@ Created on Thu Oct 24 11:50:56 2019 | |||
| """ | |||
| from matplotlib import pyplot as plt | |||
| import numpy as np | |||
| from tqdm import tqdm | |||
| import sys | |||
| sys.path.insert(0, "../") | |||
| from pygraph.utils.graphfiles import loadDataset | |||
| from utils import remove_edges | |||
| from fitDistance import fit_GED_to_kernel_distance | |||
| @@ -21,21 +24,22 @@ def test_anycosts(): | |||
| remove_edges(Gn) | |||
| gkernel = 'marginalizedkernel' | |||
| itr_max = 10 | |||
| edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list = \ | |||
| fit_GED_to_kernel_distance(Gn, gkernel, itr_max) | |||
| edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \ | |||
nb_cost_mat_list, coef_dk = fit_GED_to_kernel_distance(Gn, 'atom', 'bond_type', gkernel, itr_max) # node/edge labels (assumed) added to match the new signature.
| total_time = np.sum(time_list) | |||
| print('\nedit_costs:', edit_costs) | |||
| print('\nresidual_list:', residual_list) | |||
| print('\nedit_cost_list:', edit_cost_list) | |||
| print('\ndistance matrix in kernel space:', dis_k_mat) | |||
| print('\nged matrix:', ged_mat) | |||
| print('total time:', total_time) | |||
| print('\ntotal time:', total_time) | |||
| print('\nnb_cost_mat:', nb_cost_mat_list[-1]) | |||
| np.savez('results/fit_distance.any_costs.gm', edit_costs=edit_costs, | |||
| residual_list=residual_list, edit_cost_list=edit_cost_list, | |||
| dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list, | |||
| total_time=total_time) | |||
| total_time=total_time, nb_cost_mat_list=nb_cost_mat_list) | |||
| # normalized distance matrices. | |||
| # # normalized distance matrices. | |||
| # gmfile = np.load('results/fit_distance.any_costs.gm.npz') | |||
| # edit_costs = gmfile['edit_costs'] | |||
| # residual_list = gmfile['residual_list'] | |||
| @@ -43,72 +47,256 @@ def test_anycosts(): | |||
| # dis_k_mat = gmfile['dis_k_mat'] | |||
| # ged_mat = gmfile['ged_mat'] | |||
| # total_time = gmfile['total_time'] | |||
| ## nb_cost_mat_list = gmfile['nb_cost_mat_list'] | |||
| norm_dis_k_mat = normalize_distance_matrix(dis_k_mat) | |||
| plt.imshow(norm_dis_k_mat) | |||
| plt.colorbar() | |||
| plt.savefig('results/norm_dis_k_mat.any_costs' + '.eps', format='eps', dpi=300) | |||
| # plt.savefig('results/norm_dis_k_mat.any_costs' + '.jpg', format='jpg') | |||
| # plt.savefig('results/norm_dis_k_mat.any_costs' + '.png', format='png') | |||
| # plt.show() | |||
| plt.clf() | |||
| norm_ged_mat = normalize_distance_matrix(ged_mat) | |||
| plt.imshow(norm_ged_mat) | |||
| plt.colorbar() | |||
| plt.savefig('results/norm_ged_mat.any_costs' + '.eps', format='eps', dpi=300) | |||
| # plt.savefig('results/norm_ged_mat.any_costs' + '.jpg', format='jpg') | |||
| # plt.savefig('results/norm_ged_mat.any_costs' + '.png', format='png') | |||
| # plt.show() | |||
| plt.clf() | |||
| norm_diff = norm_ged_mat - norm_dis_k_mat | |||
| plt.imshow(norm_diff) | |||
| plt.colorbar() | |||
| plt.savefig('results/diff_mat_norm_ged_dis_k.any_costs' + '.eps', format='eps', dpi=300) | |||
| # plt.savefig('results/diff_mat_norm_ged_dis_k.any_costs' + '.png', format='png') | |||
| # plt.show() | |||
| plt.clf() | |||
| # draw_count_bar(norm_diff) | |||
| def test_cs_leq_ci_plus_cr(): | |||
| """c_vs <= c_vi + c_vr, c_es <= c_ei + c_er | |||
| """ | |||
| ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt', | |||
| 'extra_params': {}} # node/edge symb | |||
| Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params']) | |||
| ds = {'name': 'monoterpenoides', | |||
| 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb | |||
| Gn, y_all = loadDataset(ds['dataset']) | |||
| # Gn = Gn[0:10] | |||
| remove_edges(Gn) | |||
| gkernel = 'marginalizedkernel' | |||
| gkernel = 'untilhpathkernel' | |||
| node_label = 'atom' | |||
| edge_label = 'bond_type' | |||
| itr_max = 10 | |||
| edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list = \ | |||
| fit_GED_to_kernel_distance(Gn, gkernel, itr_max) | |||
| edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \ | |||
| nb_cost_mat_list, coef_dk = fit_GED_to_kernel_distance(Gn, node_label, edge_label, | |||
| gkernel, itr_max, | |||
| fitkernel='gaussian') | |||
| total_time = np.sum(time_list) | |||
| print('\nedit_costs:', edit_costs) | |||
| print('\nresidual_list:', residual_list) | |||
| print('\nedit_cost_list:', edit_cost_list) | |||
| print('\ndistance matrix in kernel space:', dis_k_mat) | |||
| print('\nged matrix:', ged_mat) | |||
| print('total time:', total_time) | |||
| np.savez('results/fit_distance.cs_leq_ci_plus_cr.gm', edit_costs=edit_costs, | |||
| print('\ntotal time:', total_time) | |||
| print('\nnb_cost_mat:', nb_cost_mat_list[-1]) | |||
| np.savez('results/fit_distance.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel.gm', | |||
| edit_costs=edit_costs, | |||
| residual_list=residual_list, edit_cost_list=edit_cost_list, | |||
| dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list, | |||
| total_time=total_time) | |||
| total_time=total_time, nb_cost_mat_list=nb_cost_mat_list, | |||
| coef_dk=coef_dk) | |||
| # ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt', | |||
| # 'extra_params': {}} # node/edge symb | |||
| # Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params']) | |||
| ## Gn = Gn[0:10] | |||
| ## remove_edges(Gn) | |||
| # gkernel = 'untilhpathkernel' | |||
| # node_label = 'atom' | |||
| # edge_label = 'bond_type' | |||
| # itr_max = 10 | |||
| # edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \ | |||
| # nb_cost_mat_list, coef_dk = fit_GED_to_kernel_distance(Gn, node_label, edge_label, | |||
| # gkernel, itr_max) | |||
| # total_time = np.sum(time_list) | |||
| # print('\nedit_costs:', edit_costs) | |||
| # print('\nresidual_list:', residual_list) | |||
| # print('\nedit_cost_list:', edit_cost_list) | |||
| # print('\ndistance matrix in kernel space:', dis_k_mat) | |||
| # print('\nged matrix:', ged_mat) | |||
| # print('\ntotal time:', total_time) | |||
| # print('\nnb_cost_mat:', nb_cost_mat_list[-1]) | |||
| # np.savez('results/fit_distance.cs_leq_ci_plus_cr.cost_leq_1en2.mutag.elabeled.uhpkernel.gm', | |||
| # edit_costs=edit_costs, | |||
| # residual_list=residual_list, edit_cost_list=edit_cost_list, | |||
| # dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list, | |||
# total_time=total_time, nb_cost_mat_list=nb_cost_mat_list, coef_dk=coef_dk)
| # # normalized distance matrices. | |||
| # gmfile = np.load('results/fit_distance.cs_leq_ci_plus_cr.cost_leq_1en2.monot.elabeled.uhpkernel.gm.npz') | |||
| # edit_costs = gmfile['edit_costs'] | |||
| # residual_list = gmfile['residual_list'] | |||
| # edit_cost_list = gmfile['edit_cost_list'] | |||
| # dis_k_mat = gmfile['dis_k_mat'] | |||
| # ged_mat = gmfile['ged_mat'] | |||
| # total_time = gmfile['total_time'] | |||
| # nb_cost_mat_list = gmfile['nb_cost_mat_list'] | |||
| # coef_dk = gmfile['coef_dk'] | |||
| nb_consistent, nb_inconsistent, ratio_consistent = pairwise_substitution_consistence(dis_k_mat, ged_mat) | |||
| print(nb_consistent, nb_inconsistent, ratio_consistent) | |||
| # dis_k_sub = pairwise_substitution(dis_k_mat) | |||
| # ged_sub = pairwise_substitution(ged_mat) | |||
| # np.savez('results/sub_dis_mat.cs_leq_ci_plus_cr.cost_leq_1en2.gm', | |||
| # dis_k_sub=dis_k_sub, ged_sub=ged_sub) | |||
| norm_dis_k_mat = normalize_distance_matrix(dis_k_mat) | |||
| plt.imshow(norm_dis_k_mat) | |||
| plt.colorbar() | |||
| plt.savefig('results/norm_dis_k_mat.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel' | |||
| + '.eps', format='eps', dpi=300) | |||
| plt.savefig('results/norm_dis_k_mat.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel' | |||
| + '.png', format='png') | |||
| # plt.show() | |||
| plt.clf() | |||
| norm_ged_mat = normalize_distance_matrix(ged_mat) | |||
| plt.imshow(norm_ged_mat) | |||
| plt.colorbar() | |||
| plt.savefig('results/norm_ged_mat.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel' | |||
| + '.eps', format='eps', dpi=300) | |||
| plt.savefig('results/norm_ged_mat.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel' | |||
| + '.png', format='png') | |||
| # plt.show() | |||
| plt.clf() | |||
| norm_diff = norm_ged_mat - norm_dis_k_mat | |||
| plt.imshow(norm_diff) | |||
| plt.colorbar() | |||
| plt.savefig('results/diff_mat_norm_ged_dis_k.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel' | |||
| + '.eps', format='eps', dpi=300) | |||
| plt.savefig('results/diff_mat_norm_ged_dis_k.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel' | |||
| + '.png', format='png') | |||
| # plt.show() | |||
| plt.clf() | |||
| # draw_count_bar(norm_diff) | |||
| def test_unfitted(): | |||
| """unfitted. | |||
| """ | |||
| from fitDistance import compute_geds | |||
| from utils import kernel_distance_matrix | |||
| ds = {'name': 'monoterpenoides', | |||
| 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb | |||
| Gn, y_all = loadDataset(ds['dataset']) | |||
| # Gn = Gn[0:10] | |||
| gkernel = 'untilhpathkernel' | |||
| node_label = 'atom' | |||
| edge_label = 'bond_type' | |||
| # ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt', | |||
| # 'extra_params': {}} # node/edge symb | |||
| # Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params']) | |||
| ## Gn = Gn[0:10] | |||
| ## remove_edges(Gn) | |||
| # gkernel = 'marginalizedkernel' | |||
| dis_k_mat, _, _, _ = kernel_distance_matrix(Gn, node_label, edge_label, gkernel=gkernel) | |||
| ged_all, ged_mat, n_edit_operations = compute_geds(Gn, [3, 3, 1, 3, 3, 1], | |||
| [0, 1, 2, 3, 4, 5], parallel=True) | |||
| print('\ndistance matrix in kernel space:', dis_k_mat) | |||
| print('\nged matrix:', ged_mat) | |||
| # np.savez('results/fit_distance.cs_leq_ci_plus_cr.cost_leq_1en2.gm', edit_costs=edit_costs, | |||
| # residual_list=residual_list, edit_cost_list=edit_cost_list, | |||
| # dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list, | |||
| # total_time=total_time, nb_cost_mat_list=nb_cost_mat_list) | |||
| # normalized distance matrices. | |||
| # gmfile = np.load('results/fit_distance.cs_leq_ci_plus_cr.gm.npz') | |||
| # gmfile = np.load('results/fit_distance.cs_leq_ci_plus_cr.cost_leq_1en3.gm.npz') | |||
| # edit_costs = gmfile['edit_costs'] | |||
| # residual_list = gmfile['residual_list'] | |||
| # edit_cost_list = gmfile['edit_cost_list'] | |||
| # dis_k_mat = gmfile['dis_k_mat'] | |||
| # ged_mat = gmfile['ged_mat'] | |||
| # total_time = gmfile['total_time'] | |||
| # nb_cost_mat_list = gmfile['nb_cost_mat_list'] | |||
| nb_consistent, nb_inconsistent, ratio_consistent = pairwise_substitution_consistence(dis_k_mat, ged_mat) | |||
| print(nb_consistent, nb_inconsistent, ratio_consistent) | |||
| norm_dis_k_mat = normalize_distance_matrix(dis_k_mat) | |||
| plt.imshow(norm_dis_k_mat) | |||
| plt.colorbar() | |||
| plt.savefig('results/norm_dis_k_mat.cs_leq_ci_plus_cr' + '.eps', format='eps', dpi=300) | |||
| # plt.savefig('results/norm_dis_k_mat.cs_leq_ci_plus_cr' + '.jpg', format='jpg') | |||
| plt.savefig('results/norm_dis_k_mat.unfitted.MUTAG' + '.eps', format='eps', dpi=300) | |||
| plt.savefig('results/norm_dis_k_mat.unfitted.MUTAG' + '.png', format='png') | |||
| # plt.show() | |||
| plt.clf() | |||
| norm_ged_mat = normalize_distance_matrix(ged_mat) | |||
| plt.imshow(norm_ged_mat) | |||
| plt.colorbar() | |||
| plt.savefig('results/norm_ged_mat.cs_leq_ci_plus_cr' + '.eps', format='eps', dpi=300) | |||
| # plt.savefig('results/norm_ged_mat.cs_leq_ci_plus_cr' + '.jpg', format='jpg') | |||
| plt.savefig('results/norm_ged_mat.unfitted.MUTAG' + '.eps', format='eps', dpi=300) | |||
| plt.savefig('results/norm_ged_mat.unfitted.MUTAG' + '.png', format='png') | |||
| # plt.show() | |||
| plt.clf() | |||
| norm_diff = norm_ged_mat - norm_dis_k_mat | |||
| plt.imshow(norm_diff) | |||
| plt.colorbar() | |||
| plt.savefig('results/diff_mat_norm_ged_dis_k.unfitted.MUTAG' + '.eps', format='eps', dpi=300) | |||
| plt.savefig('results/diff_mat_norm_ged_dis_k.unfitted.MUTAG' + '.png', format='png') | |||
| # plt.show() | |||
| plt.clf() | |||
| draw_count_bar(norm_diff) | |||
| def pairwise_substitution_consistence(mat1, mat2): | |||
| """ | |||
| """ | |||
| nb_consistent = 0 | |||
| nb_inconsistent = 0 | |||
| # the matrix is considered symmetric. | |||
| upper_tri1 = mat1[np.triu_indices_from(mat1)] | |||
upper_tri2 = mat2[np.triu_indices_from(mat2)] # use triu for both matrices so entries align.
for i in tqdm(range(len(upper_tri1)), desc='computing consistency', file=sys.stdout):
| for j in range(i, len(upper_tri1)): | |||
| if np.sign(upper_tri1[i] - upper_tri1[j]) == np.sign(upper_tri2[i] - upper_tri2[j]): | |||
| nb_consistent += 1 | |||
| else: | |||
| nb_inconsistent += 1 | |||
| return nb_consistent, nb_inconsistent, nb_consistent / (nb_consistent + nb_inconsistent) | |||
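# Editor's sketch: a tiny usage example. Applying a monotone transform
# preserves every pairwise ordering, so the consistency ratio is 1.0.
# m1 = np.array([[0., 1., 2.], [1., 0., 3.], [2., 3., 0.]])
# m2 = 2.0 * m1
# print(pairwise_substitution_consistence(m1, m2)) # -> (21, 0, 1.0)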
| def pairwise_substitution(mat): | |||
| # the matrix is considered symmetric. | |||
| upper_tri = mat[np.triu_indices_from(mat)] | |||
| sub_list = [] | |||
| for i in tqdm(range(len(upper_tri)), desc='computing', file=sys.stdout): | |||
| for j in range(i, len(upper_tri)): | |||
| sub_list.append(upper_tri[i] - upper_tri[j]) | |||
| return sub_list | |||
| def draw_count_bar(norm_diff): | |||
| import pandas | |||
| from collections import Counter, OrderedDict | |||
# count the differences in bins of width 0.1 (scale by 10, then floor).
norm_diff_cnt = norm_diff.flatten()
norm_diff_cnt = norm_diff_cnt * 10
norm_diff_cnt = np.floor(norm_diff_cnt)
| norm_diff_cnt = Counter(norm_diff_cnt) | |||
| norm_diff_cnt = OrderedDict(sorted(norm_diff_cnt.items())) | |||
| df = pandas.DataFrame.from_dict(norm_diff_cnt, orient='index') | |||
| df.plot(kind='bar') | |||
| if __name__ == '__main__': | |||
| test_anycosts() | |||
| test_cs_leq_ci_plus_cr() | |||
| # test_anycosts() | |||
| # test_cs_leq_ci_plus_cr() | |||
| test_unfitted() | |||
| # x = np.array([[1,2,3],[4,5,6],[7,8,9]]) | |||
| # xx = pairwise_substitution(x) | |||
| @@ -17,9 +17,363 @@ import random | |||
| import sys | |||
| sys.path.insert(0, "../") | |||
| from pygraph.utils.graphfiles import loadDataset | |||
| #from pygraph.utils.logger2file import * | |||
| from iam import iam_upgraded | |||
| from utils import remove_edges, compute_kernel, get_same_item_indices | |||
| from ged import ged_median | |||
| from utils import remove_edges, compute_kernel, get_same_item_indices, dis_gstar | |||
| #from ged import ged_median | |||
| def test_iam_monoterpenoides(): | |||
| ds = {'name': 'monoterpenoides', | |||
| 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb | |||
| Gn, y_all = loadDataset(ds['dataset']) | |||
| # Gn = Gn[0:50] | |||
| gkernel = 'untilhpathkernel' | |||
| node_label = 'atom' | |||
| edge_label = 'bond_type' | |||
| # parameters for GED function from the IAM paper. | |||
| # fitted edit costs (Gaussian). | |||
| c_vi = 0.03620133402089074 | |||
| c_vr = 0.0417574590207099 | |||
| c_vs = 0.009992282328587499 | |||
| c_ei = 0.08293120042342755 | |||
| c_er = 0.09512220476358019 | |||
| c_es = 0.09222529696841467 | |||
| # # fitted edit costs (linear combinations). | |||
| # c_vi = 0.1749684054238749 | |||
| # c_vr = 0.0734054228711457 | |||
| # c_vs = 0.05017781726016715 | |||
| # c_ei = 0.1869431164806936 | |||
| # c_er = 0.32055856948274 | |||
| # c_es = 0.2569469379247611 | |||
| # # unfitted edit costs. | |||
| # c_vi = 3 | |||
| # c_vr = 3 | |||
| # c_vs = 1 | |||
| # c_ei = 3 | |||
| # c_er = 3 | |||
| # c_es = 1 | |||
| ite_max_iam = 50 | |||
| epsilon_iam = 0.001 | |||
| removeNodes = False | |||
| connected_iam = False | |||
| # parameters for IAM function | |||
| # ged_cost = 'CONSTANT' | |||
| ged_cost = 'CONSTANT' | |||
| ged_method = 'IPFP' | |||
| edit_cost_constant = [c_vi, c_vr, c_vs, c_ei, c_er, c_es] | |||
| # edit_cost_constant = [] | |||
| ged_stabilizer = 'min' | |||
| ged_repeat = 50 | |||
| params_ged = {'lib': 'gedlibpy', 'cost': ged_cost, 'method': ged_method, | |||
| 'edit_cost_constant': edit_cost_constant, | |||
| 'stabilizer': ged_stabilizer, 'repeat': ged_repeat} | |||
# classify graphs according to their class labels.
| time_list = [] | |||
| dis_ks_min_list = [] | |||
| dis_ks_set_median_list = [] | |||
| sod_gs_list = [] | |||
| g_best = [] | |||
| sod_set_median_list = [] | |||
| sod_list_list = [] | |||
| idx_dict = get_same_item_indices(y_all) | |||
| for y_class in idx_dict: | |||
| print('\n-------------------------------------------------------') | |||
| print('class of y:', y_class) | |||
| Gn_class = [Gn[i].copy() for i in idx_dict[y_class]] | |||
| time_list.append([]) | |||
| dis_ks_min_list.append([]) | |||
| dis_ks_set_median_list.append([]) | |||
| sod_gs_list.append([]) | |||
| g_best.append([]) | |||
| sod_set_median_list.append([]) | |||
| for repeat in range(50): | |||
| idx_rdm = random.sample(range(len(Gn_class)), 10) | |||
| print('graphs chosen:', idx_rdm) | |||
| Gn_median = [Gn_class[idx].copy() for idx in idx_rdm] | |||
| Gn_candidate = [g.copy() for g in Gn_median] | |||
| alpha_range = [1 / len(Gn_median)] * len(Gn_median) | |||
| time0 = time.time() | |||
| G_gen_median_list, sod_gen_median, sod_list, G_set_median_list, sod_set_median \ | |||
| = iam_upgraded(Gn_median, | |||
| Gn_candidate, c_ei=c_ei, c_er=c_er, c_es=c_es, ite_max=ite_max_iam, | |||
| epsilon=epsilon_iam, connected=connected_iam, removeNodes=removeNodes, | |||
| params_ged=params_ged) | |||
| time_total = time.time() - time0 | |||
| print('\ntime: ', time_total) | |||
| time_list[-1].append(time_total) | |||
| g_best[-1].append(G_gen_median_list[0]) | |||
| sod_set_median_list[-1].append(sod_set_median) | |||
| print('\nsmallest sod of the set median:', sod_set_median) | |||
| sod_gs_list[-1].append(sod_gen_median) | |||
| print('\nsmallest sod in graph space:', sod_gen_median) | |||
| sod_list_list.append(sod_list) | |||
| # show the best graph and save it to file. | |||
| print('one of the possible corresponding pre-images is') | |||
| nx.draw(G_gen_median_list[0], labels=nx.get_node_attributes(G_gen_median_list[0], 'atom'), | |||
| with_labels=True) | |||
| # plt.show() | |||
| # plt.savefig('results/iam/mutag_median.fit_costs2.001.nb' + str(nb_median) + | |||
| # plt.savefig('results/iam/paper_compare/monoter_y' + str(y_class) + | |||
| # '_repeat' + str(repeat) + '_' + str(time.time()) + | |||
| # '.png', format="PNG") | |||
| plt.clf() | |||
| # print(G_gen_median_list[0].nodes(data=True)) | |||
| # print(G_gen_median_list[0].edges(data=True)) | |||
| # compute distance between \psi and the set median graph. | |||
| knew_set_median = compute_kernel(G_set_median_list + Gn_median, | |||
| gkernel, node_label, edge_label, False) | |||
| dhat_new_set_median_list = [] | |||
| for idx, g_tmp in enumerate(G_set_median_list): | |||
| # @todo: the term3 below could use the one at the beginning of the function. | |||
| dhat_new_set_median_list.append(dis_gstar(idx, range(len(G_set_median_list), | |||
| len(G_set_median_list) + len(Gn_median) + 1), | |||
| alpha_range, knew_set_median, withterm3=False)) | |||
| print('\ndistance in kernel space of set median: ', dhat_new_set_median_list[0]) | |||
| dis_ks_set_median_list[-1].append(dhat_new_set_median_list[0]) | |||
| # compute distance between \psi and the new generated graphs. | |||
| knew = compute_kernel(G_gen_median_list + Gn_median, gkernel, node_label, | |||
| edge_label, False) | |||
| dhat_new_list = [] | |||
| for idx, g_tmp in enumerate(G_gen_median_list): | |||
| # @todo: the term3 below could use the one at the beginning of the function. | |||
| dhat_new_list.append(dis_gstar(idx, range(len(G_gen_median_list), | |||
| len(G_gen_median_list) + len(Gn_median) + 1), | |||
| alpha_range, knew, withterm3=False)) | |||
| print('\nsmallest distance in kernel space: ', dhat_new_list[0]) | |||
| dis_ks_min_list[-1].append(dhat_new_list[0]) | |||
| print('\nsods of the set median for this class:', sod_set_median_list[-1]) | |||
| print('\nsods in graph space for this class:', sod_gs_list[-1]) | |||
| print('\ndistance in kernel space of set median for this class:', | |||
| dis_ks_set_median_list[-1]) | |||
| print('\nsmallest distances in kernel space for this class:', | |||
| dis_ks_min_list[-1]) | |||
| print('\ntimes for this class:', time_list[-1]) | |||
| sod_set_median_list[-1] = np.mean(sod_set_median_list[-1]) | |||
| sod_gs_list[-1] = np.mean(sod_gs_list[-1]) | |||
| dis_ks_set_median_list[-1] = np.mean(dis_ks_set_median_list[-1]) | |||
| dis_ks_min_list[-1] = np.mean(dis_ks_min_list[-1]) | |||
| time_list[-1] = np.mean(time_list[-1]) | |||
| print() | |||
| print('\nmean sods of the set median for each class:', sod_set_median_list) | |||
| print('\nmean sods in graph space for each class:', sod_gs_list) | |||
| print('\ndistances in kernel space of set median for each class:', | |||
| dis_ks_set_median_list) | |||
| print('\nmean smallest distances in kernel space for each class:', | |||
| dis_ks_min_list) | |||
| print('\nmean times for each class:', time_list) | |||
| print('\nmean sods of the set median of all:', np.mean(sod_set_median_list)) | |||
| print('\nmean sods in graph space of all:', np.mean(sod_gs_list)) | |||
| print('\nmean distances in kernel space of set median of all:', | |||
| np.mean(dis_ks_set_median_list)) | |||
| print('\nmean smallest distances in kernel space of all:', | |||
| np.mean(dis_ks_min_list)) | |||
| print('\nmean times of all:', np.mean(time_list)) | |||
| nb_better_sods = 0 | |||
| nb_worse_sods = 0 | |||
| nb_same_sods = 0 | |||
| for sods in sod_list_list: | |||
| if sods[0] > sods[-1]: | |||
| nb_better_sods += 1 | |||
| elif sods[0] < sods[-1]: | |||
| nb_worse_sods += 1 | |||
| else: | |||
| nb_same_sods += 1 | |||
print('\nIn', str(len(sod_list_list)), 'sod lists,', str(nb_better_sods),
'improved,', str(nb_worse_sods), 'worsened,',
str(nb_same_sods), 'stayed unchanged;', str(nb_better_sods / len(sod_list_list)),
'of the sods improved.')
| def test_iam_mutag(): | |||
| ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt', | |||
| 'extra_params': {}} # node/edge symb | |||
| Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params']) | |||
| # Gn = Gn[0:50] | |||
| gkernel = 'untilhpathkernel' | |||
| node_label = 'atom' | |||
| edge_label = 'bond_type' | |||
| # parameters for GED function from the IAM paper. | |||
| # fitted edit costs. | |||
| c_vi = 0.03523843108436513 | |||
| c_vr = 0.03347339739350128 | |||
| c_vs = 0.06871290673612238 | |||
| c_ei = 0.08591999846720685 | |||
| c_er = 0.07962086440894103 | |||
| c_es = 0.08596855855478233 | |||
| # unfitted edit costs. | |||
| # c_vi = 3 | |||
| # c_vr = 3 | |||
| # c_vs = 1 | |||
| # c_ei = 3 | |||
| # c_er = 3 | |||
| # c_es = 1 | |||
| ite_max_iam = 50 | |||
| epsilon_iam = 0.001 | |||
| removeNodes = False | |||
| connected_iam = False | |||
| # parameters for IAM function | |||
| # ged_cost = 'CONSTANT' | |||
| ged_cost = 'CONSTANT' | |||
| ged_method = 'IPFP' | |||
| edit_cost_constant = [c_vi, c_vr, c_vs, c_ei, c_er, c_es] | |||
| # edit_cost_constant = [] | |||
| ged_stabilizer = 'min' | |||
| ged_repeat = 50 | |||
| params_ged = {'lib': 'gedlibpy', 'cost': ged_cost, 'method': ged_method, | |||
| 'edit_cost_constant': edit_cost_constant, | |||
| 'stabilizer': ged_stabilizer, 'repeat': ged_repeat} | |||
# classify graphs according to their class labels.
| time_list = [] | |||
| dis_ks_min_list = [] | |||
| dis_ks_set_median_list = [] | |||
| sod_gs_list = [] | |||
| g_best = [] | |||
| sod_set_median_list = [] | |||
| sod_list_list = [] | |||
| idx_dict = get_same_item_indices(y_all) | |||
| for y_class in idx_dict: | |||
| print('\n-------------------------------------------------------') | |||
| print('class of y:', y_class) | |||
| Gn_class = [Gn[i].copy() for i in idx_dict[y_class]] | |||
| time_list.append([]) | |||
| dis_ks_min_list.append([]) | |||
| dis_ks_set_median_list.append([]) | |||
| sod_gs_list.append([]) | |||
| g_best.append([]) | |||
| sod_set_median_list.append([]) | |||
| for repeat in range(50): | |||
| idx_rdm = random.sample(range(len(Gn_class)), 10) | |||
| print('graphs chosen:', idx_rdm) | |||
| Gn_median = [Gn_class[idx].copy() for idx in idx_rdm] | |||
| Gn_candidate = [g.copy() for g in Gn_median] | |||
| alpha_range = [1 / len(Gn_median)] * len(Gn_median) | |||
| time0 = time.time() | |||
| G_gen_median_list, sod_gen_median, sod_list, G_set_median_list, sod_set_median \ | |||
| = iam_upgraded(Gn_median, | |||
| Gn_candidate, c_ei=c_ei, c_er=c_er, c_es=c_es, ite_max=ite_max_iam, | |||
| epsilon=epsilon_iam, connected=connected_iam, removeNodes=removeNodes, | |||
| params_ged=params_ged) | |||
| time_total = time.time() - time0 | |||
| print('\ntime: ', time_total) | |||
| time_list[-1].append(time_total) | |||
| g_best[-1].append(G_gen_median_list[0]) | |||
| sod_set_median_list[-1].append(sod_set_median) | |||
| print('\nsmallest sod of the set median:', sod_set_median) | |||
| sod_gs_list[-1].append(sod_gen_median) | |||
| print('\nsmallest sod in graph space:', sod_gen_median) | |||
| sod_list_list.append(sod_list) | |||
| # show the best graph and save it to file. | |||
| print('one of the possible corresponding pre-images is') | |||
| nx.draw(G_gen_median_list[0], labels=nx.get_node_attributes(G_gen_median_list[0], 'atom'), | |||
| with_labels=True) | |||
| # plt.show() | |||
| # plt.savefig('results/iam/mutag_median.fit_costs2.001.nb' + str(nb_median) + | |||
| # plt.savefig('results/iam/paper_compare/mutag_y' + str(y_class) + | |||
| # '_repeat' + str(repeat) + '_' + str(time.time()) + | |||
| # '.png', format="PNG") | |||
| plt.clf() | |||
| # print(G_gen_median_list[0].nodes(data=True)) | |||
| # print(G_gen_median_list[0].edges(data=True)) | |||
| # compute distance between \psi and the set median graph. | |||
| knew_set_median = compute_kernel(G_set_median_list + Gn_median, | |||
| gkernel, node_label, edge_label, False) | |||
| dhat_new_set_median_list = [] | |||
| for idx, g_tmp in enumerate(G_set_median_list): | |||
| # @todo: the term3 below could use the one at the beginning of the function. | |||
| dhat_new_set_median_list.append(dis_gstar(idx, range(len(G_set_median_list), | |||
| len(G_set_median_list) + len(Gn_median) + 1), | |||
| alpha_range, knew_set_median, withterm3=False)) | |||
| print('\ndistance in kernel space of set median: ', dhat_new_set_median_list[0]) | |||
| dis_ks_set_median_list[-1].append(dhat_new_set_median_list[0]) | |||
| # compute distance between \psi and the new generated graphs. | |||
| knew = compute_kernel(G_gen_median_list + Gn_median, gkernel, node_label, | |||
| edge_label, False) | |||
| dhat_new_list = [] | |||
| for idx, g_tmp in enumerate(G_gen_median_list): | |||
| # @todo: the term3 below could use the one at the beginning of the function. | |||
| dhat_new_list.append(dis_gstar(idx, range(len(G_gen_median_list), | |||
| len(G_gen_median_list) + len(Gn_median) + 1), | |||
| alpha_range, knew, withterm3=False)) | |||
| print('\nsmallest distance in kernel space: ', dhat_new_list[0]) | |||
| dis_ks_min_list[-1].append(dhat_new_list[0]) | |||
| print('\nsods of the set median for this class:', sod_set_median_list[-1]) | |||
| print('\nsods in graph space for this class:', sod_gs_list[-1]) | |||
| print('\ndistance in kernel space of set median for this class:', | |||
| dis_ks_set_median_list[-1]) | |||
| print('\nsmallest distances in kernel space for this class:', | |||
| dis_ks_min_list[-1]) | |||
| print('\ntimes for this class:', time_list[-1]) | |||
| sod_set_median_list[-1] = np.mean(sod_set_median_list[-1]) | |||
| sod_gs_list[-1] = np.mean(sod_gs_list[-1]) | |||
| dis_ks_set_median_list[-1] = np.mean(dis_ks_set_median_list[-1]) | |||
| dis_ks_min_list[-1] = np.mean(dis_ks_min_list[-1]) | |||
| time_list[-1] = np.mean(time_list[-1]) | |||
| print() | |||
| print('\nmean sods of the set median for each class:', sod_set_median_list) | |||
| print('\nmean sods in graph space for each class:', sod_gs_list) | |||
| print('\ndistances in kernel space of set median for each class:', | |||
| dis_ks_set_median_list) | |||
| print('\nmean smallest distances in kernel space for each class:', | |||
| dis_ks_min_list) | |||
| print('\nmean times for each class:', time_list) | |||
| print('\nmean sods of the set median of all:', np.mean(sod_set_median_list)) | |||
| print('\nmean sods in graph space of all:', np.mean(sod_gs_list)) | |||
| print('\nmean distances in kernel space of set median of all:', | |||
| np.mean(dis_ks_set_median_list)) | |||
| print('\nmean smallest distances in kernel space of all:', | |||
| np.mean(dis_ks_min_list)) | |||
| print('\nmean times of all:', np.mean(time_list)) | |||
| nb_better_sods = 0 | |||
| nb_worse_sods = 0 | |||
| nb_same_sods = 0 | |||
| for sods in sod_list_list: | |||
| if sods[0] > sods[-1]: | |||
| nb_better_sods += 1 | |||
| elif sods[0] < sods[-1]: | |||
| nb_worse_sods += 1 | |||
| else: | |||
| nb_same_sods += 1 | |||
print('\nIn', str(len(sod_list_list)), 'sod lists,', str(nb_better_sods),
'improved,', str(nb_worse_sods), 'worsened,',
str(nb_same_sods), 'stayed unchanged;', str(nb_better_sods / len(sod_list_list)),
'of the sods improved.')
| ############################################################################### | |||
| # tests on different numbers of median-sets. | |||
| @@ -33,46 +387,352 @@ def test_iam_median_nb(): | |||
| remove_edges(Gn) | |||
| gkernel = 'marginalizedkernel' | |||
# lmbda = 0.03 # termination probability
| # r_max = 10 # iteration limit for pre-image. | |||
| # alpha_range = np.linspace(0.5, 0.5, 1) | |||
| # k = 5 # k nearest neighbors | |||
| # epsilon = 1e-6 | |||
| # InitIAMWithAllDk = True | |||
lmbda = 0.03 # termination probability
| # # parameters for GED function | |||
| # c_vi = 0.037 | |||
| # c_vr = 0.038 | |||
| # c_vs = 0.075 | |||
| # c_ei = 0.001 | |||
| # c_er = 0.001 | |||
| # c_es = 0.0 | |||
| # ite_max_iam = 50 | |||
| # epsilon_iam = 0.001 | |||
| # removeNodes = False | |||
| # connected_iam = False | |||
| # # parameters for IAM function | |||
| # ged_cost = 'CONSTANT' | |||
| # ged_method = 'IPFP' | |||
| # edit_cost_constant = [c_vi, c_vr, c_vs, c_ei, c_er, c_es] | |||
| # ged_stabilizer = 'min' | |||
| # ged_repeat = 50 | |||
| # params_ged = {'lib': 'gedlibpy', 'cost': ged_cost, 'method': ged_method, | |||
| # 'edit_cost_constant': edit_cost_constant, | |||
| # 'stabilizer': ged_stabilizer, 'repeat': ged_repeat} | |||
| # parameters for GED function | |||
| ged_cost='CHEM_1' | |||
| ged_method='IPFP' | |||
| saveGXL='gedlib' | |||
| # parameters for IAM function | |||
| c_ei=1 | |||
| c_er=1 | |||
| c_es=1 | |||
| c_vi = 4 | |||
| c_vr = 4 | |||
| c_vs = 2 | |||
| c_ei = 1 | |||
| c_er = 1 | |||
| c_es = 1 | |||
| ite_max_iam = 50 | |||
| epsilon_iam = 0.001 | |||
| removeNodes = False | |||
| connected_iam = False | |||
# number of graphs; we want to compute the median of these graphs.
| nb_median_range = [2, 3, 4, 5, 10, 20, 30, 40, 50, 100] | |||
| # parameters for IAM function | |||
| ged_cost = 'CHEM_1' | |||
| ged_method = 'IPFP' | |||
| edit_cost_constant = [] | |||
| ged_stabilizer = 'min' | |||
| ged_repeat = 50 | |||
| params_ged = {'lib': 'gedlibpy', 'cost': ged_cost, 'method': ged_method, | |||
| 'edit_cost_constant': edit_cost_constant, | |||
| 'stabilizer': ged_stabilizer, 'repeat': ged_repeat} | |||
| # find out all the graphs classified to positive group 1. | |||
| idx_dict = get_same_item_indices(y_all) | |||
| Gn = [Gn[i] for i in idx_dict[1]] | |||
# number of graphs; we want to compute the median of these graphs.
| # nb_median_range = [2, 3, 4, 5, 10, 20, 30, 40, 50, 100] | |||
| nb_median_range = [len(Gn)] | |||
| # # compute Gram matrix. | |||
| # time0 = time.time() | |||
| # km = compute_kernel(Gn, gkernel, True) | |||
| # time_km = time.time() - time0 | |||
| # # write Gram matrix to file. | |||
| # np.savez('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm', gm=km, gmtime=time_km) | |||
| time_list = [] | |||
| dis_ks_min_list = [] | |||
| sod_gs_list = [] | |||
| # sod_gs_min_list = [] | |||
| # nb_updated_list = [] | |||
| # nb_updated_k_list = [] | |||
| g_best = [] | |||
| for nb_median in nb_median_range: | |||
| print('\n-------------------------------------------------------') | |||
| print('number of median graphs =', nb_median) | |||
| random.seed(1) | |||
| idx_rdm = random.sample(range(len(Gn)), nb_median) | |||
| print('graphs chosen:', idx_rdm) | |||
| Gn_median = [Gn[idx].copy() for idx in idx_rdm] | |||
| Gn_candidate = [g.copy() for g in Gn] | |||
| # for g in Gn_median: | |||
| # nx.draw(g, labels=nx.get_node_attributes(g, 'atom'), with_labels=True) | |||
| ## plt.savefig("results/preimage_mix/mutag.png", format="PNG") | |||
| # plt.show() | |||
| # plt.clf() | |||
| ################################################################### | |||
| # gmfile = np.load('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm.npz') | |||
| # km_tmp = gmfile['gm'] | |||
| # time_km = gmfile['gmtime'] | |||
| # # modify mixed gram matrix. | |||
| # km = np.zeros((len(Gn) + nb_median, len(Gn) + nb_median)) | |||
| # for i in range(len(Gn)): | |||
| # for j in range(i, len(Gn)): | |||
| # km[i, j] = km_tmp[i, j] | |||
| # km[j, i] = km[i, j] | |||
| # for i in range(len(Gn)): | |||
| # for j, idx in enumerate(idx_rdm): | |||
| # km[i, len(Gn) + j] = km[i, idx] | |||
| # km[len(Gn) + j, i] = km[i, idx] | |||
| # for i, idx1 in enumerate(idx_rdm): | |||
| # for j, idx2 in enumerate(idx_rdm): | |||
| # km[len(Gn) + i, len(Gn) + j] = km[idx1, idx2] | |||
| ################################################################### | |||
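| # uniform weights: each of the nb_median graphs contributes equally to the | |||
| # implicit barycenter \psi = sum_i alpha_i * phi(g_i) in kernel space. | |||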
| alpha_range = [1 / nb_median] * nb_median | |||
| time0 = time.time() | |||
| ghat_new_list, sod_min = iam_upgraded(Gn_median, Gn_candidate, | |||
| c_ei=c_ei, c_er=c_er, c_es=c_es, ite_max=ite_max_iam, | |||
| epsilon=epsilon_iam, connected=connected_iam, removeNodes=removeNodes, | |||
| params_ged=params_ged) | |||
| time_total = time.time() - time0 | |||
| print('\ntime: ', time_total) | |||
| time_list.append(time_total) | |||
| # compute distance between \psi and the newly generated graphs. | |||
| knew = compute_kernel(ghat_new_list + Gn_median, gkernel, False) | |||
| dhat_new_list = [] | |||
| for idx, g_tmp in enumerate(ghat_new_list): | |||
| # @todo: the term3 below could use the one at the beginning of the function. | |||
| dhat_new_list.append(dis_gstar(idx, range(len(ghat_new_list), | |||
| len(ghat_new_list) + len(Gn_median) + 1), | |||
| alpha_range, knew, withterm3=False)) | |||
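| # dis_gstar presumably computes d(g, \psi) = sqrt(k(g, g) | |||
| # - 2 * sum_i alpha_i * k(g, g_i) + sum_{i,j} alpha_i * alpha_j * k(g_i, g_j)); | |||
| # withterm3=False drops the last sum, which is identical for every candidate | |||
| # g, so the ranking of the candidates is preserved. | |||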
| print('\nsmallest distance in kernel space: ', dhat_new_list[0]) | |||
| dis_ks_min_list.append(dhat_new_list[0]) | |||
| g_best.append(ghat_new_list[0]) | |||
| # show the best graph and save it to file. | |||
| # print('the shortest distance is', dhat) | |||
| print('one of the possible corresponding pre-images is') | |||
| nx.draw(ghat_new_list[0], labels=nx.get_node_attributes(ghat_new_list[0], 'atom'), | |||
| with_labels=True) | |||
| plt.show() | |||
| # plt.savefig('results/iam/mutag_median.fit_costs2.001.nb' + str(nb_median) + | |||
| plt.savefig('results/iam/mutag_median_unfit2.nb' + str(nb_median) + | |||
| '.png', format="PNG") | |||
| plt.clf() | |||
| # print(ghat_list[0].nodes(data=True)) | |||
| # print(ghat_list[0].edges(data=True)) | |||
| sod_gs_list.append(sod_min) | |||
| # sod_gs_min_list.append(np.min(sod_min)) | |||
| print('\nsmallest sod in graph space: ', sod_min) | |||
| print('\nsods in graph space: ', sod_gs_list) | |||
| # print('\nsmallest sod in graph space for each set of median graphs: ', sod_gs_min_list) | |||
| print('\nsmallest distance in kernel space for each set of median graphs: ', | |||
| dis_ks_min_list) | |||
| # print('\nnumber of updates of the best graph for each set of median graphs by IAM: ', | |||
| # nb_updated_list) | |||
| # print('\nnumber of updates of k nearest graphs for each set of median graphs by IAM: ', | |||
| # nb_updated_k_list) | |||
| print('\ntimes:', time_list) | |||
| def test_iam_letter_h(): | |||
| from median import draw_Letter_graph | |||
| ds = {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt', | |||
| 'extra_params': {}} # node nsymb | |||
| # ds = {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt', | |||
| # 'extra_params': {}} # node nsymb | |||
| # Gn = Gn[0:50] | |||
| Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params']) | |||
| gkernel = 'structuralspkernel' | |||
| # parameters for GED function from the IAM paper. | |||
| c_vi = 3 | |||
| c_vr = 3 | |||
| c_vs = 1 | |||
| c_ei = 3 | |||
| c_er = 3 | |||
| c_es = 1 | |||
| ite_max_iam = 50 | |||
| epsilon_iam = 0.001 | |||
| removeNodes = False | |||
| connected_iam = False | |||
| # parameters for GED function | |||
| # ged_cost = 'CONSTANT' | |||
| ged_cost = 'LETTER' | |||
| ged_method = 'IPFP' | |||
| # edit_cost_constant = [c_vi, c_vr, c_vs, c_ei, c_er, c_es] | |||
| edit_cost_constant = [] | |||
| ged_stabilizer = 'min' | |||
| ged_repeat = 50 | |||
| params_ged = {'lib': 'gedlibpy', 'cost': ged_cost, 'method': ged_method, | |||
| 'edit_cost_constant': edit_cost_constant, | |||
| 'stabilizer': ged_stabilizer, 'repeat': ged_repeat} | |||
| # classify graphs according to letters. | |||
| time_list = [] | |||
| dis_ks_min_list = [] | |||
| sod_gs_list = [] | |||
| g_best = [] | |||
| sod_set_median_list = [] | |||
| idx_dict = get_same_item_indices(y_all) | |||
| for letter in idx_dict: | |||
| print('\n-------------------------------------------------------') | |||
| print('letter', letter) | |||
| Gn_let = [Gn[i].copy() for i in idx_dict[letter]] | |||
| time_list.append([]) | |||
| dis_ks_min_list.append([]) | |||
| sod_gs_list.append([]) | |||
| g_best.append([]) | |||
| sod_set_median_list.append([]) | |||
| for repeat in range(50): | |||
| idx_rdm = random.sample(range(len(Gn_let)), 50) | |||
| print('graphs chosen:', idx_rdm) | |||
| Gn_median = [Gn_let[idx].copy() for idx in idx_rdm] | |||
| Gn_candidate = [g.copy() for g in Gn_median] | |||
| alpha_range = [1 / len(Gn_median)] * len(Gn_median) | |||
| time0 = time.time() | |||
| ghat_new_list, sod_min, sod_set_median = iam_upgraded(Gn_median, | |||
| Gn_candidate, c_ei=c_ei, c_er=c_er, c_es=c_es, ite_max=ite_max_iam, | |||
| epsilon=epsilon_iam, connected=connected_iam, removeNodes=removeNodes, | |||
| params_ged=params_ged) | |||
| time_total = time.time() - time0 | |||
| print('\ntime: ', time_total) | |||
| time_list[-1].append(time_total) | |||
| g_best[-1].append(ghat_new_list[0]) | |||
| sod_set_median_list[-1].append(sod_set_median) | |||
| print('\nsmallest sod of the set median:', sod_set_median) | |||
| sod_gs_list[-1].append(sod_min) | |||
| print('\nsmallest sod in graph space:', sod_min) | |||
| # show the best graph and save it to file. | |||
| print('one of the possible corresponding pre-images is') | |||
| draw_Letter_graph(ghat_new_list[0], savepath='results/iam/paper_compare/') | |||
| # compute distance between \psi and the newly generated graphs. | |||
| knew = compute_kernel(ghat_new_list + Gn_median, gkernel, False) | |||
| dhat_new_list = [] | |||
| for idx, g_tmp in enumerate(ghat_new_list): | |||
| # @todo: the term3 below could use the one at the beginning of the function. | |||
| dhat_new_list.append(dis_gstar(idx, range(len(ghat_new_list), | |||
| len(ghat_new_list) + len(Gn_median) + 1), | |||
| alpha_range, knew, withterm3=False)) | |||
| print('\nsmallest distance in kernel space: ', dhat_new_list[0]) | |||
| dis_ks_min_list[-1].append(dhat_new_list[0]) | |||
| print('\nsods of the set median for this letter:', sod_set_median_list[-1]) | |||
| print('\nsods in graph space for this letter:', sod_gs_list[-1]) | |||
| print('\nsmallest distances in kernel space for this letter:', | |||
| dis_ks_min_list[-1]) | |||
| print('\ntimes for this letter:', time_list[-1]) | |||
| sod_set_median_list[-1] = np.mean(sod_set_median_list[-1]) | |||
| sod_gs_list[-1] = np.mean(sod_gs_list[-1]) | |||
| dis_ks_min_list[-1] = np.mean(dis_ks_min_list[-1]) | |||
| time_list[-1] = np.mean(time_list[-1]) | |||
| print('\nmean sods of the set median for each letter:', sod_set_median_list) | |||
| print('\nmean sods in graph space for each letter:', sod_gs_list) | |||
| print('\nmean smallest distances in kernel space for each letter:', | |||
| dis_ks_min_list) | |||
| print('\nmean times for each letter:', time_list) | |||
| print('\nmean sods of the set median of all:', np.mean(sod_set_median_list)) | |||
| print('\nmean sods in graph space of all:', np.mean(sod_gs_list)) | |||
| print('\nmean smallest distances in kernel space of all:', | |||
| np.mean(dis_ks_min_list)) | |||
| print('\nmean times of all:', np.mean(time_list)) | |||
| def test_iam_fitdistance(): | |||
| ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt', | |||
| 'extra_params': {}} # node/edge symb | |||
| Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params']) | |||
| # Gn = Gn[0:50] | |||
| # remove_edges(Gn) | |||
| gkernel = 'marginalizedkernel' | |||
| node_label = 'atom' | |||
| edge_label = 'bond_type' | |||
| # lmbda = 0.03 # termination probability | |||
| # # parameters for GED function | |||
| # c_vi = 0.037 | |||
| # c_vr = 0.038 | |||
| # c_vs = 0.075 | |||
| # c_ei = 0.001 | |||
| # c_er = 0.001 | |||
| # c_es = 0.0 | |||
| # ite_max_iam = 50 | |||
| # epsilon_iam = 0.001 | |||
| # removeNodes = False | |||
| # connected_iam = False | |||
| # # parameters for IAM function | |||
| # ged_cost = 'CONSTANT' | |||
| # ged_method = 'IPFP' | |||
| # edit_cost_constant = [c_vi, c_vr, c_vs, c_ei, c_er, c_es] | |||
| # ged_stabilizer = 'min' | |||
| # ged_repeat = 50 | |||
| # params_ged = {'lib': 'gedlibpy', 'cost': ged_cost, 'method': ged_method, | |||
| # 'edit_cost_constant': edit_cost_constant, | |||
| # 'stabilizer': ged_stabilizer, 'repeat': ged_repeat} | |||
| # parameters for GED function | |||
| c_vi = 4 | |||
| c_vr = 4 | |||
| c_vs = 2 | |||
| c_ei = 1 | |||
| c_er = 1 | |||
| c_es = 1 | |||
| ite_max_iam = 50 | |||
| epsilon_iam = 0.001 | |||
| removeNodes = False | |||
| connected_iam = False | |||
| # parameters for GED function | |||
| ged_cost = 'CHEM_1' | |||
| ged_method = 'IPFP' | |||
| edit_cost_constant = [] | |||
| ged_stabilizer = 'min' | |||
| ged_repeat = 50 | |||
| params_ged = {'lib': 'gedlibpy', 'cost': ged_cost, 'method': ged_method, | |||
| 'edit_cost_constant': edit_cost_constant, | |||
| 'stabilizer': ged_stabilizer, 'repeat': ged_repeat} | |||
| # select all the graphs classified into the positive group (class 1). | |||
| idx_dict = get_same_item_indices(y_all) | |||
| Gn = [Gn[i] for i in idx_dict[1]] | |||
| # number of graphs; we want to compute the median of these graphs. | |||
| # nb_median_range = [2, 3, 4, 5, 10, 20, 30, 40, 50, 100] | |||
| nb_median_range = [10] | |||
| # # compute Gram matrix. | |||
| # time0 = time.time() | |||
| # km = compute_kernel(Gn, gkernel, True) | |||
| # time_km = time.time() - time0 | |||
| # # write Gram matrix to file. | |||
| # np.savez('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm', gm=km, gmtime=time_km) | |||
| time_list = [] | |||
| dis_ks_min_list = [] | |||
| dis_ks_gen_median_list = [] | |||
| sod_gs_list = [] | |||
| sod_gs_min_list = [] | |||
| nb_updated_list = [] | |||
| nb_updated_k_list = [] | |||
| # sod_gs_min_list = [] | |||
| # nb_updated_list = [] | |||
| # nb_updated_k_list = [] | |||
| g_best = [] | |||
| for nb_median in nb_median_range: | |||
| print('\n-------------------------------------------------------') | |||
| @@ -90,72 +750,80 @@ def test_iam_median_nb(): | |||
| # plt.clf() | |||
| ################################################################### | |||
| gmfile = np.load('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm.npz') | |||
| km_tmp = gmfile['gm'] | |||
| time_km = gmfile['gmtime'] | |||
| # modify mixed gram matrix. | |||
| km = np.zeros((len(Gn) + nb_median, len(Gn) + nb_median)) | |||
| for i in range(len(Gn)): | |||
| for j in range(i, len(Gn)): | |||
| km[i, j] = km_tmp[i, j] | |||
| km[j, i] = km[i, j] | |||
| for i in range(len(Gn)): | |||
| for j, idx in enumerate(idx_rdm): | |||
| km[i, len(Gn) + j] = km[i, idx] | |||
| km[len(Gn) + j, i] = km[i, idx] | |||
| for i, idx1 in enumerate(idx_rdm): | |||
| for j, idx2 in enumerate(idx_rdm): | |||
| km[len(Gn) + i, len(Gn) + j] = km[idx1, idx2] | |||
| # gmfile = np.load('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm.npz') | |||
| # km_tmp = gmfile['gm'] | |||
| # time_km = gmfile['gmtime'] | |||
| # # modify mixed gram matrix. | |||
| # km = np.zeros((len(Gn) + nb_median, len(Gn) + nb_median)) | |||
| # for i in range(len(Gn)): | |||
| # for j in range(i, len(Gn)): | |||
| # km[i, j] = km_tmp[i, j] | |||
| # km[j, i] = km[i, j] | |||
| # for i in range(len(Gn)): | |||
| # for j, idx in enumerate(idx_rdm): | |||
| # km[i, len(Gn) + j] = km[i, idx] | |||
| # km[len(Gn) + j, i] = km[i, idx] | |||
| # for i, idx1 in enumerate(idx_rdm): | |||
| # for j, idx2 in enumerate(idx_rdm): | |||
| # km[len(Gn) + i, len(Gn) + j] = km[idx1, idx2] | |||
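| # The block above extends the precomputed Gram matrix km_tmp with | |||
| # rows/columns for the nb_median sampled graphs by copying existing kernel | |||
| # values, so no kernel values need to be re-computed for the duplicated | |||
| # median entries. | |||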
| ################################################################### | |||
| alpha_range = [1 / nb_median] * nb_median | |||
| time0 = time.time() | |||
| ghat_new_list, dis_min = iam_upgraded(Gn_median, Gn_candidate, | |||
| c_ei=c_ei, c_er=c_er, c_es=c_es, ite_max=ite_max_iam, | |||
| epsilon=epsilon_iam, removeNodes=removeNodes, | |||
| connected=connected_iam, | |||
| params_ged={'ged_cost': ged_cost, 'ged_method': ged_method, | |||
| 'saveGXL': saveGXL}) | |||
| G_gen_median_list, sod_gen_median, sod_list, G_set_median_list, sod_set_median \ | |||
| = iam_upgraded(Gn_median, Gn_candidate, | |||
| c_ei=c_ei, c_er=c_er, c_es=c_es, ite_max=ite_max_iam, | |||
| epsilon=epsilon_iam, connected=connected_iam, removeNodes=removeNodes, | |||
| params_ged=params_ged) | |||
| time_total = time.time() - time0 | |||
| print('\ntime: ', time_total) | |||
| time_list.append(time_total) | |||
| print('\nsmallest distance in kernel space: ', dhat) | |||
| dis_ks_min_list.append(dhat) | |||
| g_best.append(ghat_list) | |||
| print('\nnumber of updates of the best graph: ', nb_updated) | |||
| nb_updated_list.append(nb_updated) | |||
| print('\nnumber of updates of k nearest graphs: ', nb_updated_k) | |||
| nb_updated_k_list.append(nb_updated_k) | |||
| # compute distance between \psi and the newly generated graphs. | |||
| knew = compute_kernel(G_gen_median_list + Gn_median, gkernel, node_label, | |||
| edge_label, False) | |||
| dhat_new_list = [] | |||
| for idx, g_tmp in enumerate(G_gen_median_list): | |||
| # @todo: the term3 below could use the one at the beginning of the function. | |||
| dhat_new_list.append(dis_gstar(idx, range(len(G_gen_median_list), | |||
| len(G_gen_median_list) + len(Gn_median) + 1), | |||
| alpha_range, knew, withterm3=False)) | |||
| print('\nsmallest distance in kernel space: ', dhat_new_list[0]) | |||
| dis_ks_min_list.append(dhat_new_list[0]) | |||
| g_best.append(G_gen_median_list[0]) | |||
| # show the best graph and save it to file. | |||
| print('the shortest distance is', dhat) | |||
| # print('the shortest distance is', dhat) | |||
| print('one of the possible corresponding pre-images is') | |||
| nx.draw(ghat_list[0], labels=nx.get_node_attributes(ghat_list[0], 'atom'), | |||
| nx.draw(G_gen_median_list[0], labels=nx.get_node_attributes(G_gen_median_list[0], 'atom'), | |||
| with_labels=True) | |||
| plt.show() | |||
| plt.savefig('results/preimage_iam/mutag_median_nb' + str(nb_median) + | |||
| '.png', format="PNG") | |||
| # plt.savefig('results/iam/mutag_median.fit_costs2.001.nb' + str(nb_median) + | |||
| # plt.savefig('results/iam/mutag_median_unfit2.nb' + str(nb_median) + | |||
| # '.png', format="PNG") | |||
| plt.clf() | |||
| # print(ghat_list[0].nodes(data=True)) | |||
| # print(ghat_list[0].edges(data=True)) | |||
| # compute the corresponding sod in graph space. | |||
| sod_tmp, _ = ged_median([ghat_list[0]], Gn_median, ged_cost=ged_cost, | |||
| ged_method=ged_method, saveGXL=saveGXL) | |||
| sod_gs_list.append(sod_tmp) | |||
| sod_gs_min_list.append(np.min(sod_tmp)) | |||
| print('\nsmallest sod in graph space: ', np.min(sod_tmp)) | |||
| sod_gs_list.append(sod_gen_median) | |||
| # sod_gs_min_list.append(np.min(sod_gen_median)) | |||
| print('\nsmallest sod in graph space: ', sod_gen_median) | |||
| print('\nsmallest sod of set median in graph space: ', sod_set_median) | |||
| print('\nsods in graph space: ', sod_gs_list) | |||
| print('\nsmallest sod in graph space for each set of median graphs: ', sod_gs_min_list) | |||
| # print('\nsmallest sod in graph space for each set of median graphs: ', sod_gs_min_list) | |||
| print('\nsmallest distance in kernel space for each set of median graphs: ', | |||
| dis_ks_min_list) | |||
| print('\nnumber of updates of the best graph for each set of median graphs by IAM: ', | |||
| nb_updated_list) | |||
| print('\nnumber of updates of k nearest graphs for each set of median graphs by IAM: ', | |||
| nb_updated_k_list) | |||
| # print('\nnumber of updates of the best graph for each set of median graphs by IAM: ', | |||
| # nb_updated_list) | |||
| # print('\nnumber of updates of k nearest graphs for each set of median graphs by IAM: ', | |||
| # nb_updated_k_list) | |||
| print('\ntimes:', time_list) | |||
| ############################################################################### | |||
| @@ -164,4 +832,11 @@ def test_iam_median_nb(): | |||
| if __name__ == '__main__': | |||
| ############################################################################### | |||
| # tests on different numbers of median-sets. | |||
| test_iam_median_nb() | |||
| # test_iam_median_nb() | |||
| # test_iam_letter_h() | |||
| test_iam_monoterpenoides() | |||
| # test_iam_mutag() | |||
| # test_iam_fitdistance() | |||
| # print("test log") | |||
| @@ -192,26 +192,42 @@ def test_preimage_iam_median_nb(): | |||
| gkernel = 'marginalizedkernel' | |||
| lmbda = 0.03 # termination probability | |||
| r_max = 10 # iteration limit for pre-image. | |||
| r_max = 3 # iteration limit for pre-image. | |||
| # alpha_range = np.linspace(0.5, 0.5, 1) | |||
| k = 5 # k nearest neighbors | |||
| epsilon = 1e-6 | |||
| InitIAMWithAllDk = True | |||
| # parameters for GED function | |||
| ged_cost='CHEM_1' | |||
| ged_method='IPFP' | |||
| saveGXL='gedlib' | |||
| # parameters for IAM function | |||
| c_ei=1 | |||
| c_er=1 | |||
| c_es=1 | |||
| # c_vi = 0.037 | |||
| # c_vr = 0.038 | |||
| # c_vs = 0.075 | |||
| # c_ei = 0.001 | |||
| # c_er = 0.001 | |||
| # c_es = 0.0 | |||
| c_vi = 4 | |||
| c_vr = 4 | |||
| c_vs = 2 | |||
| c_ei = 1 | |||
| c_er = 1 | |||
| c_es = 1 | |||
| ite_max_iam = 50 | |||
| epsilon_iam = 0.001 | |||
| removeNodes = True | |||
| connected_iam = False | |||
| # parameters for GED function | |||
| # ged_cost='CHEM_1' | |||
| ged_cost = 'CONSTANT' | |||
| ged_method = 'IPFP' | |||
| edit_cost_constant = [c_vi, c_vr, c_vs, c_ei, c_er, c_es] | |||
| ged_stabilizer = 'min' | |||
| ged_repeat = 50 | |||
| params_ged = {'lib': 'gedlibpy', 'cost': ged_cost, 'method': ged_method, | |||
| 'edit_cost_constant': edit_cost_constant, | |||
| 'stabilizer': ged_stabilizer, 'repeat': ged_repeat} | |||
| # number of graphs; we want to compute the median of these graphs. | |||
| nb_median_range = [2, 3, 4, 5, 10, 20, 30, 40, 50, 100] | |||
| # nb_median_range = [2, 3, 4, 5, 10, 20, 30, 40, 50, 100] | |||
| nb_median_range = [2] | |||
| # select all the graphs classified into the positive group (class 1). | |||
| idx_dict = get_same_item_indices(y_all) | |||
| @@ -274,8 +290,7 @@ def test_preimage_iam_median_nb(): | |||
| params_iam={'c_ei': c_ei, 'c_er': c_er, 'c_es': c_es, | |||
| 'ite_max': ite_max_iam, 'epsilon': epsilon_iam, | |||
| 'removeNodes': removeNodes, 'connected': connected_iam}, | |||
| params_ged={'ged_cost': ged_cost, 'ged_method': ged_method, | |||
| 'saveGXL': saveGXL}) | |||
| params_ged=params_ged) | |||
| time_total = time.time() - time0 + time_km | |||
| print('\ntime: ', time_total) | |||
| @@ -293,16 +308,15 @@ def test_preimage_iam_median_nb(): | |||
| print('one of the possible corresponding pre-images is') | |||
| nx.draw(ghat_list[0], labels=nx.get_node_attributes(ghat_list[0], 'atom'), | |||
| with_labels=True) | |||
| # plt.show() | |||
| plt.savefig('results/preimage_iam/mutag_median_nb' + str(nb_median) + | |||
| '.png', format="PNG") | |||
| plt.show() | |||
| # plt.savefig('results/preimage_iam/mutag_median_cs.001_nb' + str(nb_median) + | |||
| # '.png', format="PNG") | |||
| plt.clf() | |||
| # print(ghat_list[0].nodes(data=True)) | |||
| # print(ghat_list[0].edges(data=True)) | |||
| # compute the corresponding sod in graph space. | |||
| sod_tmp, _ = ged_median([ghat_list[0]], Gn_median, ged_cost=ged_cost, | |||
| ged_method=ged_method, saveGXL=saveGXL) | |||
| sod_tmp, _ = ged_median([ghat_list[0]], Gn_median, params_ged=params_ged) | |||
| sod_gs_list.append(sod_tmp) | |||
| sod_gs_min_list.append(np.min(sod_tmp)) | |||
| print('\nsmallest sod in graph space: ', np.min(sod_tmp)) | |||
| @@ -39,13 +39,13 @@ def dis_gstar(idx_g, idx_gi, alpha, Kmatrix, term3=0, withterm3=True): | |||
| return np.sqrt(term1 - term2 + term3) | |||
| def compute_kernel(Gn, graph_kernel, verbose): | |||
| def compute_kernel(Gn, graph_kernel, node_label, edge_label, verbose): | |||
| if graph_kernel == 'marginalizedkernel': | |||
| Kmatrix, _ = marginalizedkernel(Gn, node_label='atom', edge_label=None, | |||
| Kmatrix, _ = marginalizedkernel(Gn, node_label=node_label, edge_label=edge_label, | |||
| p_quit=0.03, n_iteration=10, remove_totters=False, | |||
| n_jobs=multiprocessing.cpu_count(), verbose=verbose) | |||
| elif graph_kernel == 'untilhpathkernel': | |||
| Kmatrix, _ = untilhpathkernel(Gn, node_label='atom', edge_label=None, | |||
| Kmatrix, _ = untilhpathkernel(Gn, node_label=node_label, edge_label=edge_label, | |||
| depth=10, k_func='MinMax', compute_method='trie', | |||
| n_jobs=multiprocessing.cpu_count(), verbose=verbose) | |||
| elif graph_kernel == 'spkernel': | |||
| @@ -77,10 +77,10 @@ def gram2distances(Kmatrix): | |||
| return dmatrix | |||
| def kernel_distance_matrix(Gn, Kmatrix=None, gkernel=None): | |||
| def kernel_distance_matrix(Gn, node_label, edge_label, Kmatrix=None, gkernel=None): | |||
| dis_mat = np.empty((len(Gn), len(Gn))) | |||
| if Kmatrix == None: | |||
| Kmatrix = compute_kernel(Gn, gkernel, True) | |||
| Kmatrix = compute_kernel(Gn, gkernel, node_label, edge_label, True) | |||
| for i in range(len(Gn)): | |||
| for j in range(i, len(Gn)): | |||
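| # squared distance in the RKHS: | |||
| # ||phi(g_i) - phi(g_j)||^2 = k(g_i, g_i) + k(g_j, g_j) - 2 * k(g_i, g_j). | |||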
| dis = Kmatrix[i, i] + Kmatrix[j, j] - 2 * Kmatrix[i, j] | |||
| @@ -1,9 +1,9 @@ | |||
| """ Utilities function to manage graph files | |||
| """ | |||
| from os.path import dirname, splitext | |||
| def loadCT(filename): | |||
| """load data from .ct file. | |||
| """load data from a Chemical Table (.ct) file. | |||
| Notes | |||
| ------ | |||
| @@ -13,8 +13,11 @@ def loadCT(filename): | |||
| 0.0000 0.0000 0.0000 C <- each line describes a node (x,y,z + label) | |||
| 0.0000 0.0000 0.0000 C | |||
| 0.0000 0.0000 0.0000 O | |||
| 1 3 1 1 <- each line describes an edge : to, from,?, label | |||
| 1 3 1 1 <- each line describes an edge : to, from, bond type, bond stereo | |||
| 2 3 1 1 | |||
| Check https://www.daylight.com/meetings/mug05/Kappler/ctfile.pdf | |||
| for a detailed format description. | |||
| """ | |||
| import networkx as nx | |||
| from os.path import basename | |||
| @@ -35,22 +38,15 @@ def loadCT(filename): | |||
| for i in range(0, nb_nodes): | |||
| tmp = content[i + 2].split(" ") | |||
| tmp = [x for x in tmp if x != ''] | |||
| g.add_node(i, atom=tmp[3], label=tmp[3]) | |||
| g.add_node(i, atom=tmp[3].strip(), | |||
| label=[item.strip() for item in tmp[3:]], | |||
| attributes=[item.strip() for item in tmp[0:3]]) | |||
| for i in range(0, nb_edges): | |||
| tmp = content[i + g.number_of_nodes() + 2].split(" ") | |||
| tmp = [x for x in tmp if x != ''] | |||
| g.add_edge( | |||
| int(tmp[0]) - 1, | |||
| int(tmp[1]) - 1, | |||
| bond_type=tmp[3].strip(), | |||
| label=tmp[3].strip()) | |||
| # for i in range(0, nb_edges): | |||
| # tmp = content[i + g.number_of_nodes() + 2] | |||
| # tmp = [tmp[i:i+3] for i in range(0, len(tmp), 3)] | |||
| # g.add_edge(int(tmp[0]) - 1, int(tmp[1]) - 1, | |||
| # bond_type=tmp[3].strip(), label=tmp[3].strip()) | |||
| g.add_edge(int(tmp[0]) - 1, int(tmp[1]) - 1, | |||
| bond_type=tmp[2].strip(), | |||
| label=[item.strip() for item in tmp[2:]]) | |||
| return g | |||
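| # A minimal usage sketch (hypothetical file path), assuming the format above: | |||
| # g = loadCT('../datasets/Alkane/molecule1.ct') | |||
| # print(g.nodes(data=True))  # labels under 'atom' / 'label' / 'attributes' | |||
| # print(g.edges(data=True))  # labels under 'bond_type' / 'label' | |||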
| @@ -71,6 +67,7 @@ def loadGXL(filename): | |||
| labels[attr.attrib['name']] = attr[0].text | |||
| if 'chem' in labels: | |||
| labels['label'] = labels['chem'] | |||
| labels['atom'] = labels['chem'] | |||
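| # mirror 'chem' into 'atom' so GXL nodes carry the same node label key | |||
| # ('atom') that the .ct loader and the kernels use. | |||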
| g.add_node(index, **labels) | |||
| index += 1 | |||
| @@ -80,6 +77,7 @@ def loadGXL(filename): | |||
| labels[attr.attrib['name']] = attr[0].text | |||
| if 'valence' in labels: | |||
| labels['label'] = labels['valence'] | |||
| labels['bond_type'] = labels['valence'] | |||
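| # likewise expose 'valence' as 'bond_type' to match the edge label key | |||
| # used by the .ct loader. | |||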
| g.add_edge(dic[edge.attrib['from']], dic[edge.attrib['to']], **labels) | |||
| return g | |||
| @@ -392,7 +390,7 @@ def loadDataset(filename, filename_y=None, extra_params=None): | |||
| Notes | |||
| ----- | |||
| This function supports following graph dataset formats: | |||
| 'ds': load data from .ct file. See comments of function loadCT for an example. | |||
| 'ds': load data from .ds file. See comments of function loadFromDS for an example. | |||
| 'cxl': load data from Graph eXchange Language file (.cxl file). See | |||
| http://www.gupro.de/GXL/Introduction/background.html, 2019 for detail. | |||
| 'sdf': load data from structured data file (.sdf file). See | |||
| @@ -406,45 +404,24 @@ def loadDataset(filename, filename_y=None, extra_params=None): | |||
| 2019 for details. Note here filename is the name of either .txt file in | |||
| the dataset directory. | |||
| """ | |||
| from os.path import dirname, splitext | |||
| dirname_dataset = dirname(filename) | |||
| extension = splitext(filename)[1][1:] | |||
| data = [] | |||
| y = [] | |||
| if extension == "ds": | |||
| content = open(filename).read().splitlines() | |||
| if filename_y is None or filename_y == '': | |||
| for i in range(0, len(content)): | |||
| tmp = content[i].split(' ') | |||
| # remove the '#'s in file names | |||
| data.append( | |||
| loadCT(dirname_dataset + '/' + tmp[0].replace('#', '', 1))) | |||
| y.append(float(tmp[1])) | |||
| else: # y in a separate file | |||
| for i in range(0, len(content)): | |||
| tmp = content[i] | |||
| # remove the '#'s in file names | |||
| data.append( | |||
| loadCT(dirname_dataset + '/' + tmp.replace('#', '', 1))) | |||
| content_y = open(filename_y).read().splitlines() | |||
| # assume entries in filename and filename_y have the same order. | |||
| for item in content_y: | |||
| tmp = item.split(' ') | |||
| # assume the 3rd entry in a line is y (for Alkane dataset) | |||
| y.append(float(tmp[2])) | |||
| data, y = loadFromDS(filename, filename_y) | |||
| elif extension == "cxl": | |||
| import xml.etree.ElementTree as ET | |||
| dirname_dataset = dirname(filename) | |||
| tree = ET.parse(filename) | |||
| root = tree.getroot() | |||
| data = [] | |||
| y = [] | |||
| for graph in root.iter('print'): | |||
| for graph in root.iter('graph'): | |||
| mol_filename = graph.attrib['file'] | |||
| mol_class = graph.attrib['class'] | |||
| data.append(loadGXL(dirname_dataset + '/' + mol_filename)) | |||
| y.append(mol_class) | |||
| elif extension == 'xml': | |||
| data, y = loadFromXML(filename, extra_params) | |||
| elif extension == "sdf": | |||
| import numpy as np | |||
| from tqdm import tqdm | |||
| @@ -471,6 +448,7 @@ def loadDataset(filename, filename_y=None, extra_params=None): | |||
| elif extension == "mat": | |||
| data, y = loadMAT(filename, extra_params) | |||
| elif extension == 'txt': | |||
| dirname_dataset = dirname(filename) | |||
| data, y = loadTXT(dirname_dataset) | |||
| # print(len(y)) | |||
| # print(y) | |||
| @@ -485,6 +463,75 @@ def loadDataset(filename, filename_y=None, extra_params=None): | |||
| return data, y | |||
| def loadFromXML(filename, extra_params): | |||
| import xml.etree.ElementTree as ET | |||
| dirname_dataset = dirname(filename) | |||
| tree = ET.parse(filename) | |||
| root = tree.getroot() | |||
| data = [] | |||
| y = [] | |||
| for graph in root.iter('print'): | |||
| mol_filename = graph.attrib['file'] | |||
| mol_class = graph.attrib['class'] | |||
| data.append(loadGXL(dirname_dataset + '/' + mol_filename)) | |||
| y.append(mol_class) | |||
| return data, y | |||
| def loadFromDS(filename, filename_y): | |||
| """Load data from .ds file. | |||
| Possible graph formats include: | |||
| '.ct': see function loadCT for detail. | |||
| '.gxl': see function loadGXL for detail. | |||
| Note that the graph format is detected automatically from the extension | |||
| of each graph file. | |||
| """ | |||
| dirname_dataset = dirname(filename) | |||
| data = [] | |||
| y = [] | |||
| content = open(filename).read().splitlines() | |||
| extension = splitext(content[0].split(' ')[0])[1][1:] | |||
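| # the graph format ('ct' or 'gxl') is inferred from the extension of the | |||
| # first file listed in the .ds file. | |||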
| if filename_y is None or filename_y == '': | |||
| if extension == 'ct': | |||
| for i in range(0, len(content)): | |||
| tmp = content[i].split(' ') | |||
| # remove the '#'s in file names | |||
| data.append( | |||
| loadCT(dirname_dataset + '/' + tmp[0].replace('#', '', 1))) | |||
| y.append(float(tmp[1])) | |||
| elif extension == 'gxl': | |||
| for i in range(0, len(content)): | |||
| tmp = content[i].split(' ') | |||
| # remove the '#'s in file names | |||
| data.append( | |||
| loadGXL(dirname_dataset + '/' + tmp[0].replace('#', '', 1))) | |||
| y.append(float(tmp[1])) | |||
| else: # y in a separate file | |||
| if extension == 'ct': | |||
| for i in range(0, len(content)): | |||
| tmp = content[i] | |||
| # remove the '#'s in file names | |||
| data.append( | |||
| loadCT(dirname_dataset + '/' + tmp.replace('#', '', 1))) | |||
| elif extension == 'gxl': | |||
| for i in range(0, len(content)): | |||
| tmp = content[i] | |||
| # remove the '#'s in file names | |||
| data.append( | |||
| loadGXL(dirname_dataset + '/' + tmp.replace('#', '', 1))) | |||
| content_y = open(filename_y).read().splitlines() | |||
| # assume entries in filename and filename_y have the same order. | |||
| for item in content_y: | |||
| tmp = item.split(' ') | |||
| # assume the 3rd entry in a line is y (for Alkane dataset) | |||
| y.append(float(tmp[2])) | |||
| return data, y | |||
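| # A usage sketch (paths as in the __main__ examples below): graph files are | |||
| # resolved relative to the .ds file; targets are read from the 3rd column of | |||
| # the separate y file. | |||
| # Gn, y = loadFromDS('../../datasets/Alkane/dataset.ds', | |||
| #                    '../../datasets/Alkane/dataset_boiling_point_names.txt') | |||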
| def saveDataset(Gn, y, gformat='gxl', group=None, filename='gfile', xparams=None): | |||
| """Save list of graphs. | |||
| """ | |||
| @@ -509,7 +556,30 @@ def saveDataset(Gn, y, gformat='gxl', group=None, filename='gfile', xparams=None | |||
| if __name__ == '__main__': | |||
| ds = {'name': 'MUTAG', 'dataset': '../../datasets/MUTAG/MUTAG.mat', | |||
| 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}} # node/edge symb | |||
| Gn, y = loadDataset(ds['dataset'], extra_params=ds['extra_params']) | |||
| saveDataset(Gn, y, group='xml', filename='temp/temp') | |||
| # ### Load dataset from .ds file. | |||
| # # .ct files. | |||
| ds = {'name': 'Alkane', 'dataset': '../../datasets/Alkane/dataset.ds', | |||
| 'dataset_y': '../../datasets/Alkane/dataset_boiling_point_names.txt'} | |||
| Gn, y = loadDataset(ds['dataset'], filename_y=ds['dataset_y']) | |||
| # ds = {'name': 'Acyclic', 'dataset': '../../datasets/acyclic/dataset_bps.ds'} # node symb | |||
| # Gn, y = loadDataset(ds['dataset']) | |||
| # ds = {'name': 'MAO', 'dataset': '../../datasets/MAO/dataset.ds'} # node/edge symb | |||
| # Gn, y = loadDataset(ds['dataset']) | |||
| # ds = {'name': 'PAH', 'dataset': '../../datasets/PAH/dataset.ds'} # unlabeled | |||
| # Gn, y = loadDataset(ds['dataset']) | |||
| print(Gn[1].nodes(data=True)) | |||
| print(Gn[1].edges(data=True)) | |||
| print(y[1]) | |||
| # # .gxl file. | |||
| # ds = {'name': 'monoterpenoides', | |||
| # 'dataset': '../../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb | |||
| # Gn, y = loadDataset(ds['dataset']) | |||
| # print(Gn[1].nodes(data=True)) | |||
| # print(Gn[1].edges(data=True)) | |||
| # print(y[1]) | |||
| # ds = {'name': 'MUTAG', 'dataset': '../../datasets/MUTAG/MUTAG.mat', | |||
| # 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}} # node/edge symb | |||
| # Gn, y = loadDataset(ds['dataset'], extra_params=ds['extra_params']) | |||
| # saveDataset(Gn, y, group='xml', filename='temp/temp') | |||