| @@ -0,0 +1,307 @@ | |||
| """ | |||
| @author: linlin | |||
| @references: | |||
| [1] H. Kashima, K. Tsuda, and A. Inokuchi. Marginalized kernels between | |||
| labeled graphs. In Proceedings of the 20th International Conference on | |||
| Machine Learning, Washington, DC, United States, 2003. | |||
| [2] Pierre Mahé, Nobuhisa Ueda, Tatsuya Akutsu, Jean-Luc Perret, and | |||
| Jean-Philippe Vert. Extensions of marginalized graph kernels. In | |||
| Proceedings of the twenty-first international conference on Machine | |||
| learning, page 70. ACM, 2004. | |||
| """ | |||
import os
import sys
import time
from functools import partial
from multiprocessing import Pool

import networkx as nx
import numpy as np
from tqdm import tqdm
tqdm.monitor_interval = 0
#import traceback

from gklearn.utils.kernels import deltakernel
from gklearn.utils.utils import untotterTransformation
from gklearn.utils.graphdataset import get_dataset_attributes
from gklearn.utils.parallel import parallel_gm
def marginalizedkernel(*args,
                       node_label='atom',
                       edge_label='bond_type',
                       p_quit=0.5,
                       n_iteration=20,
                       remove_totters=False,
                       n_jobs=None,
                       chunksize=None,
                       verbose=True):
    """Calculate marginalized graph kernels between graphs.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated.
    G1, G2 : NetworkX graphs
        Two graphs between which the kernel is calculated.
    node_label : string
        Node attribute used as symbolic label. The default node label is
        'atom'.
    edge_label : string
        Edge attribute used as symbolic label. The default edge label is
        'bond_type'.
    p_quit : float
        The termination probability in the random walks generating step.
    n_iteration : integer
        Number of iterations used to approximate R_inf.
    remove_totters : boolean
        Whether to remove totterings by method introduced in [2]. The default
        value is False.
    n_jobs : int
        Number of jobs for parallelization. ``None`` lets the pool default to
        one worker per CPU core.
    chunksize : int
        Number of tasks handed to each worker at a time; computed
        automatically when ``None``.
    verbose : boolean
        Whether to print progress and timing information.

    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the marginalized kernel
        between 2 graphs.
    run_time : float
        Wall-clock time in seconds spent building the matrix.
    """
    # pre-process: copy every graph so callers' inputs are never mutated
    # (dummy labels may be attached below).
    n_iteration = int(n_iteration)
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]
    Gn = [g.copy() for g in Gn]

    ds_attrs = get_dataset_attributes(
        Gn,
        attr_names=['node_labeled', 'edge_labeled', 'is_directed'],
        node_label=node_label, edge_label=edge_label)
    # Fall back to a constant dummy label on unlabeled datasets so the delta
    # kernels in the per-pair computation are well-defined.
    if not ds_attrs['node_labeled'] or node_label is None:
        node_label = 'atom'
        for G in Gn:
            nx.set_node_attributes(G, '0', 'atom')
    if not ds_attrs['edge_labeled'] or edge_label is None:
        edge_label = 'bond_type'
        for G in Gn:
            nx.set_edge_attributes(G, '0', 'bond_type')

    start_time = time.time()

    if remove_totters:
        # ---- use pool.imap_unordered to parallel and track progress. ----
        pool = Pool(n_jobs)
        untotter_partial = partial(wrapper_untotter, Gn, node_label,
                                   edge_label)
        if chunksize is None:
            # n_jobs may be None (Pool then uses cpu_count()); guard the
            # arithmetic below against a None divisor.
            n_workers = n_jobs if n_jobs is not None else (os.cpu_count() or 1)
            if len(Gn) < 100 * n_workers:
                chunksize = int(len(Gn) / n_workers) + 1
            else:
                chunksize = 100
        for i, g in tqdm(
                pool.imap_unordered(
                    untotter_partial, range(0, len(Gn)), chunksize),
                desc='removing tottering',
                file=sys.stdout):
            Gn[i] = g
        pool.close()
        pool.join()

    Kmatrix = np.zeros((len(Gn), len(Gn)))

    # ---- use pool.imap_unordered to parallel and track progress. ----
    def init_worker(gn_toshare):
        # Expose the shared graph list to workers through a module global,
        # read back by wrapper_marg_do.
        global G_gn
        G_gn = gn_toshare
    do_partial = partial(wrapper_marg_do, node_label, edge_label,
                         p_quit, n_iteration)
    parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
                glbv=(Gn,), n_jobs=n_jobs, chunksize=chunksize,
                verbose=verbose)

    run_time = time.time() - start_time
    if verbose:
        print("\n --- marginalized kernel matrix of size %d built in %s seconds ---"
              % (len(Gn), run_time))

    return Kmatrix, run_time
def _marginalizedkernel_do(g1, g2, node_label, edge_label, p_quit, n_iteration):
    """Calculate the marginalized graph kernel between 2 graphs.

    Parameters
    ----------
    g1, g2 : NetworkX graphs
        2 graphs between which the kernel is calculated.
    node_label : string
        Node attribute used as label.
    edge_label : string
        Edge attribute used as label.
    p_quit : float
        The termination probability in the random walks generating step.
    n_iteration : integer
        Number of iterations used to approximate R_inf.

    Return
    ------
    kernel : float
        Marginalized kernel between the 2 graphs.
    """
    # init parameters
    kernel = 0
    num_nodes_G1 = nx.number_of_nodes(g1)
    num_nodes_G2 = nx.number_of_nodes(g2)
    # the initial probability distribution in the random walks generating step
    # (uniform distribution over |G|)
    p_init_G1 = 1 / num_nodes_G1
    p_init_G2 = 1 / num_nodes_G2
    q = p_quit * p_quit
    r1 = q

    # dict to save all the R_inf for all pairs of nodes, keyed by
    # (node of g1, node of g2).
    R_inf = {}
    # initial R_inf, the 1st iteration: a walk quits with probability 1 at a
    # node with no neighbors, otherwise with probability p_quit per graph.
    for node1 in g1.nodes():
        for node2 in g2.nodes():
            if len(g1[node1]) > 0:
                if len(g2[node2]) > 0:
                    R_inf[(node1, node2)] = r1
                else:
                    R_inf[(node1, node2)] = p_quit
            else:
                if len(g2[node2]) > 0:
                    R_inf[(node1, node2)] = p_quit
                else:
                    R_inf[(node1, node2)] = 1

    # compute all transition probabilities first; they do not change across
    # the fixed-point iterations below.
    t_dict = {}
    if n_iteration > 1:
        for node1 in g1.nodes():
            neighbor_n1 = g1[node1]
            # the transition probability distribution in the random walks
            # generating step (uniform distribution over the vertices adjacent
            # to the current vertex)
            if len(neighbor_n1) > 0:
                p_trans_n1 = (1 - p_quit) / len(neighbor_n1)
                for node2 in g2.nodes():
                    neighbor_n2 = g2[node2]
                    if len(neighbor_n2) > 0:
                        p_trans_n2 = (1 - p_quit) / len(neighbor_n2)
                        for neighbor1 in neighbor_n1:
                            for neighbor2 in neighbor_n2:
                                t_dict[(node1, node2, neighbor1, neighbor2)] = \
                                    p_trans_n1 * p_trans_n2 * \
                                    deltakernel(g1.nodes[neighbor1][node_label],
                                                g2.nodes[neighbor2][node_label]) * \
                                    deltakernel(
                                        neighbor_n1[neighbor1][edge_label],
                                        neighbor_n2[neighbor2][edge_label])

    # calculate R_inf with a simple iterative method
    for i in range(2, n_iteration + 1):
        R_inf_old = R_inf.copy()
        # calculate R_inf for each pair of nodes
        for node1 in g1.nodes():
            neighbor_n1 = g1[node1]
            # pairs where either node is isolated keep their initial value.
            if len(neighbor_n1) > 0:
                for node2 in g2.nodes():
                    neighbor_n2 = g2[node2]
                    if len(neighbor_n2) > 0:
                        R_inf[(node1, node2)] = r1
                        for neighbor1 in neighbor_n1:
                            for neighbor2 in neighbor_n2:
                                R_inf[(node1, node2)] += \
                                    (t_dict[(node1, node2, neighbor1, neighbor2)] * \
                                    R_inf_old[(neighbor1, neighbor2)])  # ref [1] equation (8)

    # add elements of R_inf up and calculate kernel
    for (n1, n2), value in R_inf.items():
        s = p_init_G1 * p_init_G2 * deltakernel(
            g1.nodes[n1][node_label], g2.nodes[n2][node_label])
        kernel += s * value  # ref [1] equation (6)

    return kernel
def wrapper_marg_do(node_label, edge_label, p_quit, n_iteration, itr):
    """Unpack one pair of graph indices and delegate to the per-pair kernel.

    ``G_gn`` is a module-level global installed in each worker process by the
    pool initializer, so only the index pair travels through the pool.
    """
    i, j = itr
    value = _marginalizedkernel_do(G_gn[i], G_gn[j], node_label, edge_label,
                                   p_quit, n_iteration)
    return i, j, value
def wrapper_untotter(Gn, node_label, edge_label, i):
    """Apply the tottering-removal transformation to graph ``i`` of ``Gn``.

    Returns the index alongside the transformed graph so unordered parallel
    maps can put each result back in its slot.
    """
    transformed = untotterTransformation(Gn[i], node_label, edge_label)
    return i, transformed