| @@ -0,0 +1,307 @@ | |||||
| """ | |||||
| @author: linlin | |||||
| @references: | |||||
| [1] H. Kashima, K. Tsuda, and A. Inokuchi. Marginalized kernels between | |||||
| labeled graphs. In Proceedings of the 20th International Conference on | |||||
| Machine Learning, Washington, DC, United States, 2003. | |||||
| [2] Pierre Mahé, Nobuhisa Ueda, Tatsuya Akutsu, Jean-Luc Perret, and | |||||
| Jean-Philippe Vert. Extensions of marginalized graph kernels. In | |||||
| Proceedings of the twenty-first international conference on Machine | |||||
| learning, page 70. ACM, 2004. | |||||
| """ | |||||
| import sys | |||||
| import time | |||||
| from functools import partial | |||||
| from multiprocessing import Pool | |||||
| from tqdm import tqdm | |||||
| tqdm.monitor_interval = 0 | |||||
| #import traceback | |||||
| import networkx as nx | |||||
| import numpy as np | |||||
| from gklearn.utils.kernels import deltakernel | |||||
| from gklearn.utils.utils import untotterTransformation | |||||
| from gklearn.utils.graphdataset import get_dataset_attributes | |||||
| from gklearn.utils.parallel import parallel_gm | |||||
def _init_worker_gn(gn_toshare):
    """Publish the graph list as the module-level global ``G_gn``.

    ``wrapper_marg_do`` reads ``G_gn`` to look graphs up by index. This helper
    lives at module level (instead of being a closure inside
    ``marginalizedkernel``) because nested functions cannot be pickled, which
    matters if the worker pool is created with the "spawn" start method.
    """
    global G_gn
    G_gn = gn_toshare


def marginalizedkernel(*args,
                       node_label='atom',
                       edge_label='bond_type',
                       p_quit=0.5,
                       n_iteration=20,
                       remove_totters=False,
                       n_jobs=None,
                       chunksize=None,
                       verbose=True):
    """Calculate marginalized graph kernels between graphs.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated. Passed as the
        single positional argument.
    G1, G2 : NetworkX graphs
        Two graphs between which the kernel is calculated. Passed as two
        positional arguments instead of `Gn`.
    node_label : string
        Node attribute used as symbolic label. The default node label is
        'atom'. If it is None, or the dataset is reported as not node-labeled,
        every node of the (copied) graphs gets the label '0' under 'atom'.
    edge_label : string
        Edge attribute used as symbolic label. The default edge label is
        'bond_type'. If it is None, or the dataset is reported as not
        edge-labeled, every edge of the (copied) graphs gets the label '0'
        under 'bond_type'.
    p_quit : float
        The termination probability in the random walks generating step.
    n_iteration : integer
        Number of iterations used to calculate R_inf.
    remove_totters : boolean
        Whether to remove totterings by the method introduced in [2]. The
        default value is False.
    n_jobs : int
        Number of jobs for parallelization. None lets the pool pick its own
        default.
    chunksize : int
        Chunk size handed to the parallel steps. When None, the tottering
        removal step derives one from the number of graphs and workers, and
        None is forwarded unchanged to `parallel_gm`.
    verbose : boolean
        Whether to print the summary line; also forwarded to `parallel_gm`.

    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the marginalized kernel between
        2 graphs.
    run_time : float
        Wall-clock seconds spent after pre-processing (tottering removal plus
        kernel computation).
    """
    # Local import so this block does not depend on the file-level import list.
    import os

    # pre-process
    n_iteration = int(n_iteration)
    graphs = args[0] if len(args) == 1 else (args[0], args[1])
    # Work on copies: labels may be rewritten below and callers' graphs must
    # stay untouched.
    Gn = [g.copy() for g in graphs]
    ds_attrs = get_dataset_attributes(
        Gn,
        attr_names=['node_labeled', 'edge_labeled', 'is_directed'],
        node_label=node_label, edge_label=edge_label)
    if not ds_attrs['node_labeled'] or node_label is None:
        node_label = 'atom'
        for G in Gn:
            nx.set_node_attributes(G, '0', 'atom')
    if not ds_attrs['edge_labeled'] or edge_label is None:
        edge_label = 'bond_type'
        for G in Gn:
            nx.set_edge_attributes(G, '0', 'bond_type')

    start_time = time.time()

    if remove_totters:
        # ---- use pool.imap_unordered to parallel and track progress. ----
        # n_jobs may be None (the pool then chooses its own worker count), so
        # resolve a concrete number before doing arithmetic with it.
        n_workers = n_jobs if n_jobs is not None else (os.cpu_count() or 1)
        # Keep this chunk size local: it is tuned for len(Gn) tasks and must
        # not override the caller's `chunksize` for the kernel step below.
        if chunksize is not None:
            untotter_chunksize = chunksize
        elif len(Gn) < 100 * n_workers:
            untotter_chunksize = int(len(Gn) / n_workers) + 1
        else:
            untotter_chunksize = 100
        untotter_partial = partial(wrapper_untotter, Gn, node_label, edge_label)
        # The context manager terminates the pool if anything below raises.
        with Pool(n_jobs) as pool:
            for i, g in tqdm(
                    pool.imap_unordered(
                        untotter_partial, range(len(Gn)), untotter_chunksize),
                    desc='removing tottering',
                    file=sys.stdout,
                    total=len(Gn)):
                Gn[i] = g
            pool.close()
            pool.join()

    Kmatrix = np.zeros((len(Gn), len(Gn)))

    # ---- use parallel_gm to parallel and track progress. ----
    # NOTE(review): parallel_gm is defined elsewhere; it presumably runs
    # `do_partial` over index pairs in a worker pool initialised with
    # `_init_worker_gn(*glbv)` and writes the results into Kmatrix - confirm.
    do_partial = partial(wrapper_marg_do, node_label, edge_label,
                         p_quit, n_iteration)
    parallel_gm(do_partial, Kmatrix, Gn, init_worker=_init_worker_gn,
                glbv=(Gn,), n_jobs=n_jobs, chunksize=chunksize, verbose=verbose)

    run_time = time.time() - start_time
    if verbose:
        print("\n --- marginalized kernel matrix of size %d built in %s seconds ---"
              % (len(Gn), run_time))

    return Kmatrix, run_time
def _marginalizedkernel_do(g1, g2, node_label, edge_label, p_quit, n_iteration):
    """Calculate marginalized graph kernel between 2 graphs.

    Parameters
    ----------
    g1, g2 : NetworkX graphs
        2 graphs between which the kernel is calculated.
    node_label : string
        node attribute used as label.
    edge_label : string
        edge attribute used as label.
    p_quit : float
        the termination probability in the random walks generating step.
    n_iteration : integer
        number of iterations used to calculate R_inf. Values <= 1 leave R_inf
        at its initial value.

    Return
    ------
    kernel : float
        Marginalized Kernel between 2 graphs. 0.0 if either graph has no
        nodes (there is no node pair to sum over).
    """
    num_nodes_G1 = nx.number_of_nodes(g1)
    num_nodes_G2 = nx.number_of_nodes(g2)
    # An empty graph has no uniform start distribution (1 / 0); the sum over
    # node pairs is empty, so the kernel is 0.
    if num_nodes_G1 == 0 or num_nodes_G2 == 0:
        return 0.0

    # the initial probability distribution in the random walks generating step
    # (uniform distribution over |G|), for a pair of start nodes.
    p_init = (1 / num_nodes_G1) * (1 / num_nodes_G2)
    # joint termination probability for a pair of nodes that both have
    # neighbors.
    r1 = p_quit * p_quit

    # initial R_inf, the 1st iteration. A node without neighbors contributes a
    # factor 1 instead of p_quit.
    R_inf = {}  # dict to save all the R_inf for all pairs of nodes
    for node1 in g1.nodes():
        isolated1 = len(g1[node1]) == 0
        for node2 in g2.nodes():
            isolated2 = len(g2[node2]) == 0
            if isolated1 and isolated2:
                R_inf[(node1, node2)] = 1
            elif isolated1 or isolated2:
                R_inf[(node1, node2)] = p_quit
            else:
                R_inf[(node1, node2)] = r1

    # compute all transition probabilities first. For every pair of nodes that
    # both have neighbors, keep the list of (weight, neighbor pair) terms of
    # ref [1] equation (8). Zero-weight terms (mismatching labels) add nothing
    # to the sum and are dropped up front.
    transitions = {}
    if n_iteration > 1:
        for node1 in g1.nodes():
            neighbor_n1 = g1[node1]
            if len(neighbor_n1) == 0:
                continue
            # the transition probability distribution in the random walks
            # generating step (uniform distribution over the vertices adjacent
            # to the current vertex)
            p_trans_n1 = (1 - p_quit) / len(neighbor_n1)
            for node2 in g2.nodes():
                neighbor_n2 = g2[node2]
                if len(neighbor_n2) == 0:
                    continue
                p_trans_n2 = (1 - p_quit) / len(neighbor_n2)
                terms = []
                for neighbor1 in neighbor_n1:
                    for neighbor2 in neighbor_n2:
                        weight = p_trans_n1 * p_trans_n2 * \
                            deltakernel(g1.nodes[neighbor1][node_label],
                                        g2.nodes[neighbor2][node_label]) * \
                            deltakernel(
                                neighbor_n1[neighbor1][edge_label],
                                neighbor_n2[neighbor2][edge_label])
                        if weight != 0:
                            terms.append((weight, (neighbor1, neighbor2)))
                # Registered even when `terms` is empty so the pair is still
                # reset to r1 on every iteration.
                transitions[(node1, node2)] = terms

    # calculate R_inf with a simple iterative method. Pairs involving a node
    # without neighbors are not in `transitions` and keep their initial value.
    for _ in range(2, n_iteration + 1):
        R_inf_old = R_inf.copy()
        for pair, terms in transitions.items():
            total = r1
            for weight, neighbor_pair in terms:
                total += weight * R_inf_old[neighbor_pair]  # ref [1] equation (8)
            R_inf[pair] = total

    # add elements of R_inf up and calculate kernel
    kernel = 0
    for (n1, n2), value in R_inf.items():
        s = p_init * deltakernel(
            g1.nodes[n1][node_label], g2.nodes[n2][node_label])
        kernel += s * value  # ref [1] equation (6)

    return kernel
def wrapper_marg_do(node_label, edge_label, p_quit, n_iteration, itr):
    """Compute one kernel matrix entry from an index pair.

    `itr` holds the two graph indices at positions 0 and 1. The graphs are
    looked up in the module-level global `G_gn`, which must have been set
    beforehand (the worker initializer does this). Returns the tuple
    (first index, second index, kernel value).
    """
    row, col = itr[0], itr[1]
    value = _marginalizedkernel_do(
        G_gn[row], G_gn[col], node_label, edge_label, p_quit, n_iteration)
    return row, col, value
def wrapper_untotter(Gn, node_label, edge_label, i):
    """Apply `untotterTransformation` to the i-th graph of `Gn`.

    Returns (i, transformed graph) so that results arriving out of order can
    be written back to the right position.
    """
    transformed = untotterTransformation(Gn[i], node_label, edge_label)
    return i, transformed