@@ -0,0 +1,22 @@
# -*-coding:utf-8 -*-
"""gklearn - dataset module

Implements methods to manage graph datasets.

graph_fetcher.py : fetch graph datasets from the Internet.
"""

# info
__version__ = "0.2"
__author__ = "Linlin Jia"
__date__ = "October 2020"

from gklearn.dataset.metadata import DATABASES, DATASET_META
from gklearn.dataset.metadata import GREYC_META, IAM_META, TUDataset_META
from gklearn.dataset.metadata import list_of_databases, list_of_datasets
from gklearn.dataset.graph_synthesizer import GraphSynthesizer
from gklearn.dataset.data_fetcher import DataFetcher
from gklearn.dataset.file_managers import DataLoader, DataSaver
from gklearn.dataset.dataset import Dataset, split_dataset_by_target
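
# Example usage (a minimal sketch; only names imported above are used):
#   from gklearn.dataset import Dataset
#   ds = Dataset()
#   ds.load_predefined_dataset('MUTAG')
#   print(ds.get_dataset_infos(keys=['dataset_size']))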
@@ -0,0 +1,823 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 26 18:48:27 2020

@author: ljia
"""
import numpy as np
import networkx as nx
from gklearn.utils.graph_files import load_dataset
import os


class Dataset(object):

	def __init__(self, filename=None, filename_targets=None, **kwargs):
		if filename is None:
			self._graphs = None
			self._targets = None
			self._node_labels = None
			self._edge_labels = None
			self._node_attrs = None
			self._edge_attrs = None
		else:
			self.load_dataset(filename, filename_targets=filename_targets, **kwargs)

		self._substructures = None
		self._node_label_dim = None
		self._edge_label_dim = None
		self._directed = None
		self._dataset_size = None
		self._total_node_num = None
		self._ave_node_num = None
		self._min_node_num = None
		self._max_node_num = None
		self._total_edge_num = None
		self._ave_edge_num = None
		self._min_edge_num = None
		self._max_edge_num = None
		self._ave_node_degree = None
		self._min_node_degree = None
		self._max_node_degree = None
		self._ave_fill_factor = None
		self._min_fill_factor = None
		self._max_fill_factor = None
		self._node_label_nums = None
		self._edge_label_nums = None
		self._node_attr_dim = None
		self._edge_attr_dim = None
		self._class_number = None


	def load_dataset(self, filename, filename_targets=None, **kwargs):
		self._graphs, self._targets, label_names = load_dataset(filename, filename_targets=filename_targets, **kwargs)
		self._node_labels = label_names['node_labels']
		self._node_attrs = label_names['node_attrs']
		self._edge_labels = label_names['edge_labels']
		self._edge_attrs = label_names['edge_attrs']
		self.clean_labels()


	def load_graphs(self, graphs, targets=None):
		# this has to be followed by set_labels().
		self._graphs = graphs
		self._targets = targets
		# self.set_labels_attrs() # @todo

	def load_predefined_dataset(self, ds_name):
		current_path = os.path.dirname(os.path.realpath(__file__)) + '/'
		# map each pre-defined dataset name to its dataset file and, where
		# needed, a separate targets file.
		ds_files = {
			'Acyclic': ('../../datasets/Acyclic/dataset_bps.ds', None),
			'AIDS': ('../../datasets/AIDS/AIDS_A.txt', None),
			'Alkane': ('../../datasets/Alkane/dataset.ds',
					   '../../datasets/Alkane/dataset_boiling_point_names.txt'),
			'COIL-DEL': ('../../datasets/COIL-DEL/COIL-DEL_A.txt', None),
			'COIL-RAG': ('../../datasets/COIL-RAG/COIL-RAG_A.txt', None),
			'COLORS-3': ('../../datasets/COLORS-3/COLORS-3_A.txt', None),
			'Cuneiform': ('../../datasets/Cuneiform/Cuneiform_A.txt', None),
			'DD': ('../../datasets/DD/DD_A.txt', None),
			'ENZYMES': ('../../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt', None),
			'Fingerprint': ('../../datasets/Fingerprint/Fingerprint_A.txt', None),
			'FRANKENSTEIN': ('../../datasets/FRANKENSTEIN/FRANKENSTEIN_A.txt', None),
			'Letter-high': ('../../datasets/Letter-high/Letter-high_A.txt', None), # node non-symb
			'Letter-low': ('../../datasets/Letter-low/Letter-low_A.txt', None), # node non-symb
			'Letter-med': ('../../datasets/Letter-med/Letter-med_A.txt', None), # node non-symb
			'MAO': ('../../datasets/MAO/dataset.ds', None),
			'Monoterpenoides': ('../../datasets/Monoterpenoides/dataset_10+.ds', None),
			'MUTAG': ('../../datasets/MUTAG/MUTAG_A.txt', None),
			'NCI1': ('../../datasets/NCI1/NCI1_A.txt', None),
			'NCI109': ('../../datasets/NCI109/NCI109_A.txt', None),
			'PAH': ('../../datasets/PAH/dataset.ds', None),
			'SYNTHETICnew': ('../../datasets/SYNTHETICnew/SYNTHETICnew_A.txt', None),
		}
		if ds_name in ('SYNTHETIC', 'Synthie'):
			raise NotImplementedError('The dataset "%s" is not implemented yet.' % ds_name)
		if ds_name not in ds_files:
			raise ValueError('The dataset name "%s" is not pre-defined.' % ds_name)
		ds_file, fn_targets = ds_files[ds_name]
		ds_file = current_path + ds_file
		if fn_targets is not None:
			fn_targets = current_path + fn_targets
		self._graphs, self._targets, label_names = load_dataset(ds_file, filename_targets=fn_targets)

		self._node_labels = label_names['node_labels']
		self._node_attrs = label_names['node_attrs']
		self._edge_labels = label_names['edge_labels']
		self._edge_attrs = label_names['edge_attrs']
		self.clean_labels()
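
	# Example (a minimal sketch; requires the bundled datasets to be present
	# at the relative paths listed above):
	#   ds = Dataset()
	#   ds.load_predefined_dataset('MUTAG')
	#   print(len(ds.graphs), ds.targets[:5])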

	def set_labels(self, node_labels=[], node_attrs=[], edge_labels=[], edge_attrs=[]):
		self._node_labels = node_labels
		self._node_attrs = node_attrs
		self._edge_labels = edge_labels
		self._edge_attrs = edge_attrs


	def set_labels_attrs(self, node_labels=None, node_attrs=None, edge_labels=None, edge_attrs=None):
		# @todo: remove labels which have only one possible values.
		if node_labels is None:
			self._node_labels = self._graphs[0].graph['node_labels']
#			# graphs are considered node unlabeled if all nodes have the same label.
#			infos.update({'node_labeled': is_nl if node_label_num > 1 else False})
		if node_attrs is None:
			self._node_attrs = self._graphs[0].graph['node_attrs']
#			for G in Gn:
#				for n in G.nodes(data=True):
#					if 'attributes' in n[1]:
#						return len(n[1]['attributes'])
#			return 0
		if edge_labels is None:
			self._edge_labels = self._graphs[0].graph['edge_labels']
#			# graphs are considered edge unlabeled if all edges have the same label.
#			infos.update({'edge_labeled': is_el if edge_label_num > 1 else False})
		if edge_attrs is None:
			self._edge_attrs = self._graphs[0].graph['edge_attrs']
#			for G in Gn:
#				if nx.number_of_edges(G) > 0:
#					for e in G.edges(data=True):
#						if 'attributes' in e[2]:
#							return len(e[2]['attributes'])
#			return 0

	def get_dataset_infos(self, keys=None, params=None):
		"""Computes and returns the structure and property information of the graph dataset.

		Parameters
		----------
		keys : list, optional
			A list of strings which indicate which information will be returned. The
			possible choices include:

			'substructures': sub-structures the graphs contain, including 'linear',
			'non linear' and 'cyclic'.

			'node_label_dim': number of symbolic vertex labels.

			'edge_label_dim': number of symbolic edge labels.

			'directed': whether graphs in dataset are directed.

			'dataset_size': number of graphs in dataset.

			'total_node_num': total number of vertices of all graphs in dataset.

			'ave_node_num': average number of vertices of graphs in dataset.

			'min_node_num': minimum number of vertices of graphs in dataset.

			'max_node_num': maximum number of vertices of graphs in dataset.

			'total_edge_num': total number of edges of all graphs in dataset.

			'ave_edge_num': average number of edges of graphs in dataset.

			'min_edge_num': minimum number of edges of graphs in dataset.

			'max_edge_num': maximum number of edges of graphs in dataset.

			'ave_node_degree': average vertex degree of graphs in dataset.

			'min_node_degree': minimum vertex degree of graphs in dataset.

			'max_node_degree': maximum vertex degree of graphs in dataset.

			'ave_fill_factor': average fill factor (number_of_edges /
			(number_of_nodes ** 2)) of graphs in dataset.

			'min_fill_factor': minimum fill factor of graphs in dataset.

			'max_fill_factor': maximum fill factor of graphs in dataset.

			'node_label_nums': list of numbers of symbolic vertex labels of graphs in dataset.

			'edge_label_nums': list of numbers of symbolic edge labels of graphs in dataset.

			'node_attr_dim': number of dimensions of non-symbolic vertex labels.
			Extracted from the 'attributes' attribute of graph nodes.

			'edge_attr_dim': number of dimensions of non-symbolic edge labels.
			Extracted from the 'attributes' attribute of graph edges.

			'class_number': number of classes. Only available for classification problems.

			'all_degree_entropy': the entropy of the degree distribution of each graph.

			'ave_degree_entropy': the average entropy of the degree distributions of all graphs.

			All information above will be returned if `keys` is not given.

		params : dict of dict, optional
			A dictionary which contains extra parameters for each possible
			element in ``keys``.

		Returns
		-------
		dict
			Information of the graph dataset keyed by `keys`.
		"""
		infos = {}

		if keys is None:
			keys = [
				'substructures',
				'node_label_dim',
				'edge_label_dim',
				'directed',
				'dataset_size',
				'total_node_num',
				'ave_node_num',
				'min_node_num',
				'max_node_num',
				'total_edge_num',
				'ave_edge_num',
				'min_edge_num',
				'max_edge_num',
				'ave_node_degree',
				'min_node_degree',
				'max_node_degree',
				'ave_fill_factor',
				'min_fill_factor',
				'max_fill_factor',
				'node_label_nums',
				'edge_label_nums',
				'node_attr_dim',
				'edge_attr_dim',
				'class_number',
				'all_degree_entropy',
				'ave_degree_entropy'
			]

		# dataset size
		if 'dataset_size' in keys:
			if self._dataset_size is None:
				self._dataset_size = self._get_dataset_size()
			infos['dataset_size'] = self._dataset_size

		# graph node number
		if any(i in keys for i in ['total_node_num', 'ave_node_num', 'min_node_num', 'max_node_num']):
			all_node_nums = self._get_all_node_nums()

		if 'total_node_num' in keys:
			if self._total_node_num is None:
				self._total_node_num = self._get_total_node_num(all_node_nums)
			infos['total_node_num'] = self._total_node_num

		if 'ave_node_num' in keys:
			if self._ave_node_num is None:
				self._ave_node_num = self._get_ave_node_num(all_node_nums)
			infos['ave_node_num'] = self._ave_node_num

		if 'min_node_num' in keys:
			if self._min_node_num is None:
				self._min_node_num = self._get_min_node_num(all_node_nums)
			infos['min_node_num'] = self._min_node_num

		if 'max_node_num' in keys:
			if self._max_node_num is None:
				self._max_node_num = self._get_max_node_num(all_node_nums)
			infos['max_node_num'] = self._max_node_num

		# graph edge number
		if any(i in keys for i in ['total_edge_num', 'ave_edge_num', 'min_edge_num', 'max_edge_num']):
			all_edge_nums = self._get_all_edge_nums()

		if 'total_edge_num' in keys:
			if self._total_edge_num is None:
				self._total_edge_num = self._get_total_edge_num(all_edge_nums)
			infos['total_edge_num'] = self._total_edge_num

		if 'ave_edge_num' in keys:
			if self._ave_edge_num is None:
				self._ave_edge_num = self._get_ave_edge_num(all_edge_nums)
			infos['ave_edge_num'] = self._ave_edge_num

		if 'max_edge_num' in keys:
			if self._max_edge_num is None:
				self._max_edge_num = self._get_max_edge_num(all_edge_nums)
			infos['max_edge_num'] = self._max_edge_num

		if 'min_edge_num' in keys:
			if self._min_edge_num is None:
				self._min_edge_num = self._get_min_edge_num(all_edge_nums)
			infos['min_edge_num'] = self._min_edge_num

		# label number
		if 'node_label_dim' in keys:
			if self._node_label_dim is None:
				self._node_label_dim = self._get_node_label_dim()
			infos['node_label_dim'] = self._node_label_dim

		if 'node_label_nums' in keys:
			if self._node_label_nums is None:
				self._node_label_nums = {}
				for node_label in self._node_labels:
					self._node_label_nums[node_label] = self._get_node_label_num(node_label)
			infos['node_label_nums'] = self._node_label_nums

		if 'edge_label_dim' in keys:
			if self._edge_label_dim is None:
				self._edge_label_dim = self._get_edge_label_dim()
			infos['edge_label_dim'] = self._edge_label_dim

		if 'edge_label_nums' in keys:
			if self._edge_label_nums is None:
				self._edge_label_nums = {}
				for edge_label in self._edge_labels:
					self._edge_label_nums[edge_label] = self._get_edge_label_num(edge_label)
			infos['edge_label_nums'] = self._edge_label_nums

		if 'directed' in keys or 'substructures' in keys:
			if self._directed is None:
				self._directed = self._is_directed()
			infos['directed'] = self._directed

		# node degree
		if any(i in keys for i in ['ave_node_degree', 'max_node_degree', 'min_node_degree']):
			all_node_degrees = self._get_all_node_degrees()

		if 'ave_node_degree' in keys:
			if self._ave_node_degree is None:
				self._ave_node_degree = self._get_ave_node_degree(all_node_degrees)
			infos['ave_node_degree'] = self._ave_node_degree

		if 'max_node_degree' in keys:
			if self._max_node_degree is None:
				self._max_node_degree = self._get_max_node_degree(all_node_degrees)
			infos['max_node_degree'] = self._max_node_degree

		if 'min_node_degree' in keys:
			if self._min_node_degree is None:
				self._min_node_degree = self._get_min_node_degree(all_node_degrees)
			infos['min_node_degree'] = self._min_node_degree

		# fill factor
		if any(i in keys for i in ['ave_fill_factor', 'max_fill_factor', 'min_fill_factor']):
			all_fill_factors = self._get_all_fill_factors()

		if 'ave_fill_factor' in keys:
			if self._ave_fill_factor is None:
				self._ave_fill_factor = self._get_ave_fill_factor(all_fill_factors)
			infos['ave_fill_factor'] = self._ave_fill_factor

		if 'max_fill_factor' in keys:
			if self._max_fill_factor is None:
				self._max_fill_factor = self._get_max_fill_factor(all_fill_factors)
			infos['max_fill_factor'] = self._max_fill_factor

		if 'min_fill_factor' in keys:
			if self._min_fill_factor is None:
				self._min_fill_factor = self._get_min_fill_factor(all_fill_factors)
			infos['min_fill_factor'] = self._min_fill_factor

		if 'substructures' in keys:
			if self._substructures is None:
				self._substructures = self._get_substructures()
			infos['substructures'] = self._substructures

		if 'class_number' in keys:
			if self._class_number is None:
				self._class_number = self._get_class_number()
			infos['class_number'] = self._class_number

		if 'node_attr_dim' in keys:
			if self._node_attr_dim is None:
				self._node_attr_dim = self._get_node_attr_dim()
			infos['node_attr_dim'] = self._node_attr_dim

		if 'edge_attr_dim' in keys:
			if self._edge_attr_dim is None:
				self._edge_attr_dim = self._get_edge_attr_dim()
			infos['edge_attr_dim'] = self._edge_attr_dim

		# entropy of degree distribution.
		if 'all_degree_entropy' in keys:
			if params is not None and ('all_degree_entropy' in params) and ('base' in params['all_degree_entropy']):
				base = params['all_degree_entropy']['base']
			else:
				base = None
			infos['all_degree_entropy'] = self._compute_all_degree_entropy(base=base)

		if 'ave_degree_entropy' in keys:
			if params is not None and ('ave_degree_entropy' in params) and ('base' in params['ave_degree_entropy']):
				base = params['ave_degree_entropy']['base']
			else:
				base = None
			infos['ave_degree_entropy'] = np.mean(self._compute_all_degree_entropy(base=base))

		return infos
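
	# Example (a minimal sketch; key names as documented above):
	#   ds = Dataset()
	#   ds.load_predefined_dataset('MUTAG')
	#   infos = ds.get_dataset_infos(
	#       keys=['dataset_size', 'ave_node_num', 'ave_degree_entropy'],
	#       params={'ave_degree_entropy': {'base': 2}})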

	def print_graph_infos(self, infos):
		from collections import OrderedDict
		keys = list(infos.keys())
		print(OrderedDict(sorted(infos.items(), key=lambda i: keys.index(i[0]))))


	def remove_labels(self, node_labels=[], edge_labels=[], node_attrs=[], edge_attrs=[]):
		node_labels = [item for item in node_labels if item in self._node_labels]
		edge_labels = [item for item in edge_labels if item in self._edge_labels]
		node_attrs = [item for item in node_attrs if item in self._node_attrs]
		edge_attrs = [item for item in edge_attrs if item in self._edge_attrs]

		for g in self._graphs:
			for nd in g.nodes():
				for nl in node_labels:
					del g.nodes[nd][nl]
				for na in node_attrs:
					del g.nodes[nd][na]
			for ed in g.edges():
				for el in edge_labels:
					del g.edges[ed][el]
				for ea in edge_attrs:
					del g.edges[ed][ea]

		if len(node_labels) > 0:
			self._node_labels = [nl for nl in self._node_labels if nl not in node_labels]
		if len(edge_labels) > 0:
			self._edge_labels = [el for el in self._edge_labels if el not in edge_labels]
		if len(node_attrs) > 0:
			self._node_attrs = [na for na in self._node_attrs if na not in node_attrs]
		if len(edge_attrs) > 0:
			self._edge_attrs = [ea for ea in self._edge_attrs if ea not in edge_attrs]

	def clean_labels(self):
		# remove labels and attributes that have fewer than two distinct values
		# over the whole dataset, as they carry no information.
		labels = []
		for name in self._node_labels:
			label = set()
			for G in self._graphs:
				label = label | set(nx.get_node_attributes(G, name).values())
				if len(label) > 1:
					labels.append(name)
					break
			if len(label) < 2:
				for G in self._graphs:
					for nd in G.nodes():
						del G.nodes[nd][name]
		self._node_labels = labels

		labels = []
		for name in self._edge_labels:
			label = set()
			for G in self._graphs:
				label = label | set(nx.get_edge_attributes(G, name).values())
				if len(label) > 1:
					labels.append(name)
					break
			if len(label) < 2:
				for G in self._graphs:
					for ed in G.edges():
						del G.edges[ed][name]
		self._edge_labels = labels

		labels = []
		for name in self._node_attrs:
			label = set()
			for G in self._graphs:
				label = label | set(nx.get_node_attributes(G, name).values())
				if len(label) > 1:
					labels.append(name)
					break
			if len(label) < 2:
				for G in self._graphs:
					for nd in G.nodes():
						del G.nodes[nd][name]
		self._node_attrs = labels

		labels = []
		for name in self._edge_attrs:
			label = set()
			for G in self._graphs:
				label = label | set(nx.get_edge_attributes(G, name).values())
				if len(label) > 1:
					labels.append(name)
					break
			if len(label) < 2:
				for G in self._graphs:
					for ed in G.edges():
						del G.edges[ed][name]
		self._edge_attrs = labels

	def cut_graphs(self, range_):
		self._graphs = [self._graphs[i] for i in range_]
		if self._targets is not None:
			self._targets = [self._targets[i] for i in range_]
		self.clean_labels()


	def trim_dataset(self, edge_required=False):
		# remove empty graphs (and, if edge_required, graphs without edges).
		if edge_required:
			trimmed_pairs = [(idx, g) for idx, g in enumerate(self._graphs) if (nx.number_of_nodes(g) != 0 and nx.number_of_edges(g) != 0)]
		else:
			trimmed_pairs = [(idx, g) for idx, g in enumerate(self._graphs) if nx.number_of_nodes(g) != 0]
		idx = [p[0] for p in trimmed_pairs]
		self._graphs = [p[1] for p in trimmed_pairs]
		if self._targets is not None:
			self._targets = [self._targets[i] for i in idx]
		self.clean_labels()


	def copy(self):
		dataset = Dataset()
		graphs = [g.copy() for g in self._graphs] if self._graphs is not None else None
		target = self._targets.copy() if self._targets is not None else None
		node_labels = self._node_labels.copy() if self._node_labels is not None else None
		node_attrs = self._node_attrs.copy() if self._node_attrs is not None else None
		edge_labels = self._edge_labels.copy() if self._edge_labels is not None else None
		edge_attrs = self._edge_attrs.copy() if self._edge_attrs is not None else None
		dataset.load_graphs(graphs, target)
		dataset.set_labels(node_labels=node_labels, node_attrs=node_attrs, edge_labels=edge_labels, edge_attrs=edge_attrs)
		# @todo: clean_labels and add other class members?
		return dataset

	def get_all_node_labels(self):
		node_labels = []
		for g in self._graphs:
			for n in g.nodes():
				nl = tuple(g.nodes[n].items())
				if nl not in node_labels:
					node_labels.append(nl)
		return node_labels


	def get_all_edge_labels(self):
		edge_labels = []
		for g in self._graphs:
			for e in g.edges():
				el = tuple(g.edges[e].items())
				if el not in edge_labels:
					edge_labels.append(el)
		return edge_labels


	def _get_dataset_size(self):
		return len(self._graphs)


	def _get_all_node_nums(self):
		return [nx.number_of_nodes(G) for G in self._graphs]


	def _get_total_node_num(self, all_node_nums):
		return np.sum(all_node_nums)


	def _get_ave_node_num(self, all_node_nums):
		return np.mean(all_node_nums)


	def _get_min_node_num(self, all_node_nums):
		return np.amin(all_node_nums)


	def _get_max_node_num(self, all_node_nums):
		return np.amax(all_node_nums)


	def _get_all_edge_nums(self):
		return [nx.number_of_edges(G) for G in self._graphs]


	def _get_total_edge_num(self, all_edge_nums):
		return np.sum(all_edge_nums)


	def _get_ave_edge_num(self, all_edge_nums):
		return np.mean(all_edge_nums)


	def _get_min_edge_num(self, all_edge_nums):
		return np.amin(all_edge_nums)


	def _get_max_edge_num(self, all_edge_nums):
		return np.amax(all_edge_nums)


	def _get_node_label_dim(self):
		return len(self._node_labels)


	def _get_node_label_num(self, node_label):
		nl = set()
		for G in self._graphs:
			nl = nl | set(nx.get_node_attributes(G, node_label).values())
		return len(nl)


	def _get_edge_label_dim(self):
		return len(self._edge_labels)


	def _get_edge_label_num(self, edge_label):
		el = set()
		for G in self._graphs:
			el = el | set(nx.get_edge_attributes(G, edge_label).values())
		return len(el)


	def _is_directed(self):
		return nx.is_directed(self._graphs[0])


	def _get_all_node_degrees(self):
		# mean vertex degree of each graph.
		return [np.mean(list(dict(G.degree()).values())) for G in self._graphs]


	def _get_ave_node_degree(self, all_node_degrees):
		return np.mean(all_node_degrees)


	def _get_max_node_degree(self, all_node_degrees):
		return np.amax(all_node_degrees)


	def _get_min_node_degree(self, all_node_degrees):
		return np.amin(all_node_degrees)


	def _get_all_fill_factors(self):
		"""Get the fill factor of each graph, i.e., the ratio of the number of
		edges to the squared number of nodes (the fraction of non-zero entries
		in the adjacency matrix).

		Returns
		-------
		list[float]
			List of fill factors for all graphs.
		"""
		return [nx.number_of_edges(G) / (nx.number_of_nodes(G) ** 2) for G in self._graphs]


	def _get_ave_fill_factor(self, all_fill_factors):
		return np.mean(all_fill_factors)


	def _get_max_fill_factor(self, all_fill_factors):
		return np.amax(all_fill_factors)


	def _get_min_fill_factor(self, all_fill_factors):
		return np.amin(all_fill_factors)


	def _get_substructures(self):
		subs = set()
		for G in self._graphs:
			degrees = list(dict(G.degree()).values())
			if any(i == 2 for i in degrees):
				subs.add('linear')
			if np.amax(degrees) >= 3:
				subs.add('non linear')
			if 'linear' in subs and 'non linear' in subs:
				break

		if self._directed:
			for G in self._graphs:
				try:
					# find_cycle raises NetworkXNoCycle when no cycle is found.
					if len(list(nx.find_cycle(G))) > 0:
						subs.add('cyclic')
						break
				except nx.NetworkXNoCycle:
					pass
#		else:
#			# @todo: this method does not work for big graph with large amount of edges like D&D, try a better way.
#			upper = np.amin([nx.number_of_edges(G) for G in Gn]) * 2 + 10
#			for G in Gn:
#				if (nx.number_of_edges(G) < upper):
#					cyc = list(nx.simple_cycles(G.to_directed()))
#					if any(len(i) > 2 for i in cyc):
#						subs.add('cyclic')
#						break
#			if 'cyclic' not in subs:
#				for G in Gn:
#					cyc = list(nx.simple_cycles(G.to_directed()))
#					if any(len(i) > 2 for i in cyc):
#						subs.add('cyclic')
#						break

		return subs


	def _get_class_number(self):
		return len(set(self._targets))


	def _get_node_attr_dim(self):
		return len(self._node_attrs)


	def _get_edge_attr_dim(self):
		return len(self._edge_attrs)


	def _compute_all_degree_entropy(self, base=None):
		"""Compute the entropy of the degree distribution of each graph.

		Parameters
		----------
		base : float, optional
			The logarithmic base to use. The default is ``e`` (natural logarithm).

		Returns
		-------
		degree_entropy : list of float
			The computed entropy of the degree distribution of each graph.
		"""
		from gklearn.utils.stats import entropy

		degree_entropy = []
		for g in self._graphs:
			degrees = list(dict(g.degree()).values())
			en = entropy(degrees, base=base)
			degree_entropy.append(en)
		return degree_entropy
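
	# Note: `entropy` from gklearn.utils.stats is assumed here to behave like
	# scipy.stats.entropy applied to the degree counts, e.g. for degrees
	# [1, 1, 2] (a hypothetical toy graph):
	#   from collections import Counter
	#   from scipy.stats import entropy
	#   entropy(list(Counter([1, 1, 2]).values()), base=2)  # ~ 0.918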

	@property
	def graphs(self):
		return self._graphs


	@property
	def targets(self):
		return self._targets


	@property
	def node_labels(self):
		return self._node_labels


	@property
	def edge_labels(self):
		return self._edge_labels


	@property
	def node_attrs(self):
		return self._node_attrs


	@property
	def edge_attrs(self):
		return self._edge_attrs

def split_dataset_by_target(dataset):
	from gklearn.preimage.utils import get_same_item_indices

	graphs = dataset.graphs
	targets = dataset.targets
	datasets = []
	idx_targets = get_same_item_indices(targets)
	for key, val in idx_targets.items():
		sub_graphs = [graphs[i] for i in val]
		sub_dataset = Dataset()
		sub_dataset.load_graphs(sub_graphs, [key] * len(val))
		node_labels = dataset.node_labels.copy() if dataset.node_labels is not None else None
		node_attrs = dataset.node_attrs.copy() if dataset.node_attrs is not None else None
		edge_labels = dataset.edge_labels.copy() if dataset.edge_labels is not None else None
		edge_attrs = dataset.edge_attrs.copy() if dataset.edge_attrs is not None else None
		sub_dataset.set_labels(node_labels=node_labels, node_attrs=node_attrs, edge_labels=edge_labels, edge_attrs=edge_attrs)
		datasets.append(sub_dataset)
		# @todo: clean_labels?
	return datasets
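
# Example (a minimal sketch; assumes a classification dataset so that targets
# repeat across graphs):
#   dataset = Dataset('../../datasets/MUTAG/MUTAG_A.txt')
#   sub_datasets = split_dataset_by_target(dataset)
#   for sub in sub_datasets:
#       print(sub.targets[0], len(sub.graphs))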
@@ -0,0 +1,824 @@
""" Utility functions to manage graph files
"""
from os.path import dirname, splitext


class DataLoader():

	def __init__(self, filename, filename_targets=None, gformat=None, **kwargs):
		"""Read graph data from filename and load them as NetworkX graphs.

		Parameters
		----------
		filename : string
			The name of the file from where the dataset is read.
		filename_targets : string
			The name of the file of the targets corresponding to graphs.

		Notes
		-----
		This function supports the following graph dataset formats:

		'ds': load data from a .ds file. See the comments of load_from_ds for an example.

		'cxl': load data from a Graph eXchange Language file (.cxl file). See
		`here <http://www.gupro.de/GXL/Introduction/background.html>`__ for details.

		'sdf': load data from a structured data file (.sdf file). See
		`here <http://www.nonlinear.com/progenesis/sdf-studio/v0.9/faq/sdf-file-format-guidance.aspx>`__
		for details.

		'mat': load graph data from a MATLAB (up to version 7.1) .mat file. See
		the README in the `downloadable file <http://mlcb.is.tuebingen.mpg.de/Mitarbeiter/Nino/WL/>`__
		for details.

		'txt': load graph data from the TUDataset. See
		`here <https://ls11-www.cs.tu-dortmund.de/staff/morris/graphkerneldatasets>`__
		for details. Note that here filename is the name of any .txt file in
		the dataset directory.
		"""
		extension = splitext(filename)[1][1:]
		if extension == "ds":
			self._graphs, self._targets, self._label_names = self.load_from_ds(filename, filename_targets)
		elif extension == "cxl":
			dir_dataset = kwargs.get('dirname_dataset', None)
			self._graphs, self._targets, self._label_names = self.load_from_xml(filename, dir_dataset)
		elif extension == 'xml':
			dir_dataset = kwargs.get('dirname_dataset', None)
			self._graphs, self._targets, self._label_names = self.load_from_xml(filename, dir_dataset)
		elif extension == "mat":
			order = kwargs.get('order')
			self._graphs, self._targets, self._label_names = self.load_mat(filename, order)
		elif extension == 'txt':
			self._graphs, self._targets, self._label_names = self.load_tud(filename)
		else:
			raise ValueError('The input file with the extension ".%s" is not supported. The supported extensions include: ".ds", ".cxl", ".xml", ".mat", ".txt".' % extension)


	def load_from_ds(self, filename, filename_targets):
		"""Load data from a .ds file.

		Possible graph formats include:

		'.ct': see the function load_ct for details.

		'.gxl': see the function load_gxl for details.

		Note that these graph formats are checked automatically by the extensions of
		graph files.
		"""
		dirname_dataset = dirname(filename)
		data = []
		y = []
		label_names = {'node_labels': [], 'edge_labels': [], 'node_attrs': [], 'edge_attrs': []}
		with open(filename) as fn:
			content = fn.read().splitlines()
		extension = splitext(content[0].split(' ')[0])[1][1:]
		if extension == 'ct':
			load_file_fun = self.load_ct
		elif extension == 'gxl' or extension == 'sdf': # @todo: .sdf not tested yet.
			load_file_fun = self.load_gxl
		else:
			raise ValueError('Graph files with the extension ".%s" can not be loaded from a .ds file.' % extension)

		if filename_targets is None or filename_targets == '':
			for i in range(0, len(content)):
				tmp = content[i].split(' ')
				# remove the '#'s in file names
				g, l_names = load_file_fun(dirname_dataset + '/' + tmp[0].replace('#', '', 1))
				data.append(g)
				self._append_label_names(label_names, l_names)
				y.append(float(tmp[1]))
		else: # targets in a separate file
			for i in range(0, len(content)):
				tmp = content[i]
				# remove the '#'s in file names
				g, l_names = load_file_fun(dirname_dataset + '/' + tmp.replace('#', '', 1))
				data.append(g)
				self._append_label_names(label_names, l_names)

			with open(filename_targets) as fnt:
				content_y = fnt.read().splitlines()
			# assume entries in filename and filename_targets have the same order.
			for item in content_y:
				tmp = item.split(' ')
				# assume the 3rd entry in a line is y (for the Alkane dataset)
				y.append(float(tmp[2]))

		return data, y, label_names

	def load_from_xml(self, filename, dir_dataset=None):
		import xml.etree.ElementTree as ET

		if dir_dataset is None:
			dir_dataset = dirname(filename)
		tree = ET.parse(filename)
		root = tree.getroot()
		data = []
		y = []
		label_names = {'node_labels': [], 'edge_labels': [], 'node_attrs': [], 'edge_attrs': []}
		for graph in root.iter('graph'):
			mol_filename = graph.attrib['file']
			mol_class = graph.attrib['class']
			g, l_names = self.load_gxl(dir_dataset + '/' + mol_filename)
			data.append(g)
			self._append_label_names(label_names, l_names)
			y.append(mol_class)

		return data, y, label_names

	def load_mat(self, filename, order): # @todo: need to be updated (auto order) or deprecated.
		"""Load graph data from a MATLAB (up to version 7.1) .mat file.

		Notes
		------
		A MAT file contains a struct array containing graphs, and a column vector lx containing a class label for each graph.
		Check the README in the `downloadable file <http://mlcb.is.tuebingen.mpg.de/Mitarbeiter/Nino/WL/>`__ for the detailed structure.
		"""
		from scipy.io import loadmat
		import numpy as np
		import networkx as nx
		data = []
		content = loadmat(filename)
		for key, value in content.items():
			if key[0] == 'l': # class label
				y = np.transpose(value)[0].tolist()
			elif key[0] != '_':
				# if adjacency matrix is not compressed / edge label exists
				if order[1] == 0:
					for i, item in enumerate(value[0]):
						g = nx.Graph(name=i) # set name of the graph
						nl = np.transpose(item[order[3]][0][0][0]) # node label
						for index, label in enumerate(nl[0]):
							g.add_node(index, label_1=str(label))
						el = item[order[4]][0][0][0] # edge label
						for edge in el:
							g.add_edge(edge[0] - 1, edge[1] - 1, label_1=str(edge[2]))
						data.append(g)
				else:
					for i, item in enumerate(value[0]):
						g = nx.Graph(name=i) # set name of the graph
						nl = np.transpose(item[order[3]][0][0][0]) # node label
						for index, label in enumerate(nl[0]):
							g.add_node(index, label_1=str(label))
						sam = item[order[0]] # sparse adjacency matrix
						index_no0 = sam.nonzero()
						for col, row in zip(index_no0[0], index_no0[1]):
							g.add_edge(col, row)
						data.append(g)

		label_names = {'node_labels': ['label_1'], 'edge_labels': [], 'node_attrs': [], 'edge_attrs': []}
		if order[1] == 0:
			label_names['edge_labels'].append('label_1')

		return data, y, label_names

	def load_tud(self, filename):
		"""Load graph data from TUD dataset files.

		Notes
		------
		The graph data is loaded from separate files.
		Check the README in the `downloadable file <http://tiny.cc/PK_MLJ_data>`__, 2018 for the detailed structure.
		"""
		import networkx as nx
		from os import listdir
		from os.path import dirname, basename


		def get_infos_from_readme(frm): # @todo: add README (Cuneiform), maybe node/edge label maps.
			"""Get information from the DS_label_readme.txt file.
			"""

			def get_label_names_from_line(line):
				"""Get names of labels/attributes from a line.
				"""
				str_names = line.split('[')[1].split(']')[0]
				names = str_names.split(',')
				names = [attr.strip() for attr in names]
				return names


			def get_class_label_map(label_map_strings):
				label_map = {}
				for string in label_map_strings:
					integer, label = string.split('\t')
					label_map[int(integer.strip())] = label.strip()
				return label_map


			label_names = {'node_labels': [], 'node_attrs': [],
						   'edge_labels': [], 'edge_attrs': []}
			class_label_map = None
			class_label_map_strings = []
			with open(frm) as rm:
				content_rm = rm.read().splitlines()
			i = 0
			while i < len(content_rm):
				line = content_rm[i].strip()
				# get node/edge labels and attributes.
				if line.startswith('Node labels:'):
					label_names['node_labels'] = get_label_names_from_line(line)
				elif line.startswith('Node attributes:'):
					label_names['node_attrs'] = get_label_names_from_line(line)
				elif line.startswith('Edge labels:'):
					label_names['edge_labels'] = get_label_names_from_line(line)
				elif line.startswith('Edge attributes:'):
					label_names['edge_attrs'] = get_label_names_from_line(line)
				# get class label map.
				elif line.startswith('Class labels were converted to integer values using this map:'):
					i += 2
					# read the map until a blank line or the end of the file;
					# check the index before accessing the line.
					while i < len(content_rm):
						line = content_rm[i].strip()
						if line == '':
							break
						class_label_map_strings.append(line)
						i += 1
					class_label_map = get_class_label_map(class_label_map_strings)
				i += 1

			return label_names, class_label_map


		# get dataset name.
		dirname_dataset = dirname(filename)
		filename = basename(filename)
		fn_split = filename.split('_A')
		ds_name = fn_split[0].strip()

		# load data file names.
		for name in listdir(dirname_dataset):
			if ds_name + '_A' in name:
				fam = dirname_dataset + '/' + name
			elif ds_name + '_graph_indicator' in name:
				fgi = dirname_dataset + '/' + name
			elif ds_name + '_graph_labels' in name:
				fgl = dirname_dataset + '/' + name
			elif ds_name + '_node_labels' in name:
				fnl = dirname_dataset + '/' + name
			elif ds_name + '_edge_labels' in name:
				fel = dirname_dataset + '/' + name
			elif ds_name + '_edge_attributes' in name:
				fea = dirname_dataset + '/' + name
			elif ds_name + '_node_attributes' in name:
				fna = dirname_dataset + '/' + name
			elif ds_name + '_graph_attributes' in name:
				fga = dirname_dataset + '/' + name
			elif ds_name + '_label_readme' in name:
				frm = dirname_dataset + '/' + name
			# this is supposed to be the node attrs, make sure to put this as the last 'elif'.
			elif ds_name + '_attributes' in name:
				fna = dirname_dataset + '/' + name

		# get labels and attributes names.
		if 'frm' in locals():
			label_names, class_label_map = get_infos_from_readme(frm)
		else:
			label_names = {'node_labels': [], 'node_attrs': [],
						   'edge_labels': [], 'edge_attrs': []}
			class_label_map = None

		with open(fgi) as gi:
			content_gi = gi.read().splitlines() # graph indicator
		with open(fam) as am:
			content_am = am.read().splitlines() # adjacency matrix

		# load targets. Classification labels are integers; regression targets
		# (graph attributes) are floats.
		if 'fgl' in locals():
			with open(fgl) as gl:
				content_targets = gl.read().splitlines() # targets (classification)
			targets = [int(i) for i in content_targets]
		elif 'fga' in locals():
			with open(fga) as ga:
				content_targets = ga.read().splitlines() # targets (regression)
			targets = [float(i) for i in content_targets]
		else:
			raise Exception('Can not find the targets file. Please make sure there is a "%s_graph_labels.txt" or "%s_graph_attributes.txt" file in your dataset folder.' % (ds_name, ds_name))
		if class_label_map is not None:
			targets = [class_label_map[t] for t in targets]

		# create graphs and add nodes.
		data = [nx.Graph(name=str(i)) for i in range(0, len(content_targets))]
		if 'fnl' in locals():
			with open(fnl) as nl:
				content_nl = nl.read().splitlines() # node labels
			for idx, line in enumerate(content_gi):
				# transfer to int first in case of unexpected blanks.
				data[int(line) - 1].add_node(idx)
				labels = [l.strip() for l in content_nl[idx].split(',')]
				if label_names['node_labels'] == []: # @todo: need fix bug.
					for i, label in enumerate(labels):
						l_name = 'label_' + str(i)
						data[int(line) - 1].nodes[idx][l_name] = label
						label_names['node_labels'].append(l_name)
				else:
					for i, l_name in enumerate(label_names['node_labels']):
						data[int(line) - 1].nodes[idx][l_name] = labels[i]
		else:
			for i, line in enumerate(content_gi):
				data[int(line) - 1].add_node(i)

		# add edges.
		for line in content_am:
			tmp = line.split(',')
			n1 = int(tmp[0]) - 1
			n2 = int(tmp[1]) - 1
			# ignore edge weight here.
			g = int(content_gi[n1]) - 1
			data[g].add_edge(n1, n2)

		# add edge labels.
		if 'fel' in locals():
			with open(fel) as el:
				content_el = el.read().splitlines()
			for idx, line in enumerate(content_el):
				labels = [l.strip() for l in line.split(',')]
				n = [int(i) - 1 for i in content_am[idx].split(',')]
				g = int(content_gi[n[0]]) - 1
				if label_names['edge_labels'] == []:
					for i, label in enumerate(labels):
						l_name = 'label_' + str(i)
						data[g].edges[n[0], n[1]][l_name] = label
						label_names['edge_labels'].append(l_name)
				else:
					for i, l_name in enumerate(label_names['edge_labels']):
						data[g].edges[n[0], n[1]][l_name] = labels[i]

		# add node attributes.
		if 'fna' in locals():
			with open(fna) as na:
				content_na = na.read().splitlines()
			for idx, line in enumerate(content_na):
				attrs = [a.strip() for a in line.split(',')]
				g = int(content_gi[idx]) - 1
				if label_names['node_attrs'] == []:
					for i, attr in enumerate(attrs):
						a_name = 'attr_' + str(i)
						data[g].nodes[idx][a_name] = attr
						label_names['node_attrs'].append(a_name)
				else:
					for i, a_name in enumerate(label_names['node_attrs']):
						data[g].nodes[idx][a_name] = attrs[i]

		# add edge attributes.
		if 'fea' in locals():
			with open(fea) as ea:
				content_ea = ea.read().splitlines()
			for idx, line in enumerate(content_ea):
				attrs = [a.strip() for a in line.split(',')]
				n = [int(i) - 1 for i in content_am[idx].split(',')]
				g = int(content_gi[n[0]]) - 1
				if label_names['edge_attrs'] == []:
					for i, attr in enumerate(attrs):
						a_name = 'attr_' + str(i)
						data[g].edges[n[0], n[1]][a_name] = attr
						label_names['edge_attrs'].append(a_name)
				else:
					for i, a_name in enumerate(label_names['edge_attrs']):
						data[g].edges[n[0], n[1]][a_name] = attrs[i]

		return data, targets, label_names

	def load_ct(self, filename): # @todo: this function is only tested on CTFile V2000; header not considered; only simple cases (atoms and bonds) are considered.
		"""Load data from a Chemical Table (.ct) file.

		Notes
		------
		A typical example of data in a .ct file looks like this:

			3 2  <- number of nodes and edges

			0.0000 0.0000 0.0000 C  <- each line describes a node (x, y, z + label)
			0.0000 0.0000 0.0000 C
			0.0000 0.0000 0.0000 O

			1 3 1 1  <- each line describes an edge: to, from, bond type, bond stereo
			2 3 1 1

		Check the `CTFile Formats file <https://www.daylight.com/meetings/mug05/Kappler/ctfile.pdf>`__
		for a detailed format description.
		"""
		import networkx as nx
		from os.path import basename
		g = nx.Graph()
		with open(filename) as f:
			content = f.read().splitlines()
		g = nx.Graph(name=str(content[0]), filename=basename(filename)) # set name of the graph

		# read the counts line.
		tmp = content[1].split(' ')
		tmp = [x for x in tmp if x != '']
		nb_atoms = int(tmp[0].strip()) # number of atoms
		nb_bonds = int(tmp[1].strip()) # number of bonds
		count_line_tags = ['number_of_atoms', 'number_of_bonds', 'number_of_atom_lists', '', 'chiral_flag', 'number_of_stext_entries', '', '', '', '', 'number_of_properties', 'CT_version']
		i = 0
		while i < len(tmp):
			if count_line_tags[i] != '': # if not obsoleted
				g.graph[count_line_tags[i]] = tmp[i].strip()
			i += 1

		# read the atom block.
		atom_tags = ['x', 'y', 'z', 'atom_symbol', 'mass_difference', 'charge', 'atom_stereo_parity', 'hydrogen_count_plus_1', 'stereo_care_box', 'valence', 'h0_designator', '', '', 'atom_atom_mapping_number', 'inversion_retention_flag', 'exact_change_flag']
		for i in range(0, nb_atoms):
			tmp = content[i + 2].split(' ')
			tmp = [x for x in tmp if x != '']
			g.add_node(i)
			j = 0
			while j < len(tmp):
				if atom_tags[j] != '':
					g.nodes[i][atom_tags[j]] = tmp[j].strip()
				j += 1

		# read the bond block.
		bond_tags = ['first_atom_number', 'second_atom_number', 'bond_type', 'bond_stereo', '', 'bond_topology', 'reacting_center_status']
		for i in range(0, nb_bonds):
			tmp = content[i + g.number_of_nodes() + 2].split(' ')
			tmp = [x for x in tmp if x != '']
			n1, n2 = int(tmp[0].strip()) - 1, int(tmp[1].strip()) - 1
			g.add_edge(n1, n2)
			j = 2
			while j < len(tmp):
				if bond_tags[j] != '':
					g.edges[(n1, n2)][bond_tags[j]] = tmp[j].strip()
				j += 1

		# get label names.
		label_names = {'node_labels': [], 'edge_labels': [], 'node_attrs': [], 'edge_attrs': []}
		atom_symbolic = [0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, None, None, 1, 1, 1]
		for nd in g.nodes():
			for key in g.nodes[nd]:
				if atom_symbolic[atom_tags.index(key)] == 1:
					label_names['node_labels'].append(key)
				else:
					label_names['node_attrs'].append(key)
			break
		bond_symbolic = [None, None, 1, 1, None, 1, 1]
		for ed in g.edges():
			for key in g.edges[ed]:
				if bond_symbolic[bond_tags.index(key)] == 1:
					label_names['edge_labels'].append(key)
				else:
					label_names['edge_attrs'].append(key)
			break

		return g, label_names

	def load_gxl(self, filename): # @todo: directed graphs.
		from os.path import basename
		import networkx as nx
		import xml.etree.ElementTree as ET

		tree = ET.parse(filename)
		root = tree.getroot()
		index = 0
		g = nx.Graph(filename=basename(filename), name=root[0].attrib['id'])
		dic = {} # used to retrieve incident nodes of edges
		for node in root.iter('node'):
			dic[node.attrib['id']] = index
			labels = {}
			for attr in node.iter('attr'):
				labels[attr.attrib['name']] = attr[0].text
			g.add_node(index, **labels)
			index += 1
		for edge in root.iter('edge'):
			labels = {}
			for attr in edge.iter('attr'):
				labels[attr.attrib['name']] = attr[0].text
			g.add_edge(dic[edge.attrib['from']], dic[edge.attrib['to']], **labels)

		# get label names.
		label_names = {'node_labels': [], 'edge_labels': [], 'node_attrs': [], 'edge_attrs': []}
		for node in root.iter('node'):
			for attr in node.iter('attr'):
				if attr[0].tag == 'int': # @todo: this maybe wrong, and slow.
					label_names['node_labels'].append(attr.attrib['name'])
				else:
					label_names['node_attrs'].append(attr.attrib['name'])
			break
		for edge in root.iter('edge'):
			for attr in edge.iter('attr'):
				if attr[0].tag == 'int': # @todo: this maybe wrong, and slow.
					label_names['edge_labels'].append(attr.attrib['name'])
				else:
					label_names['edge_attrs'].append(attr.attrib['name'])
			break

		return g, label_names


	def _append_label_names(self, label_names, new_names):
		for key, val in label_names.items():
			label_names[key] += [name for name in new_names[key] if name not in val]


	@property
	def data(self):
		return self._graphs, self._targets, self._label_names


	@property
	def graphs(self):
		return self._graphs


	@property
	def targets(self):
		return self._targets


	@property
	def label_names(self):
		return self._label_names
| class DataSaver(): | |||
| def __init__(self, graphs, targets=None, filename='gfile', gformat='gxl', group=None, **kwargs): | |||
| """Save list of graphs. | |||
| """ | |||
| import os | |||
| dirname_ds = os.path.dirname(filename) | |||
| if dirname_ds != '': | |||
| dirname_ds += '/' | |||
| os.makedirs(dirname_ds, exist_ok=True) | |||
| if 'graph_dir' in kwargs: | |||
| graph_dir = kwargs['graph_dir'] + '/' | |||
| os.makedirs(graph_dir, exist_ok=True) | |||
| del kwargs['graph_dir'] | |||
| else: | |||
| graph_dir = dirname_ds | |||
| if group == 'xml' and gformat == 'gxl': | |||
| with open(filename + '.xml', 'w') as fgroup: | |||
| fgroup.write("<?xml version=\"1.0\"?>") | |||
| fgroup.write("\n<!DOCTYPE GraphCollection SYSTEM \"http://www.inf.unibz.it/~blumenthal/dtd/GraphCollection.dtd\">") | |||
| fgroup.write("\n<GraphCollection>") | |||
| for idx, g in enumerate(graphs): | |||
| fname_tmp = "graph" + str(idx) + ".gxl" | |||
| self.save_gxl(g, graph_dir + fname_tmp, **kwargs) | |||
| fgroup.write("\n\t<graph file=\"" + fname_tmp + "\" class=\"" + str(targets[idx]) + "\"/>") | |||
| fgroup.write("\n</GraphCollection>") | |||
| fgroup.close() | |||
| def save_gxl(self, graph, filename, method='default', node_labels=[], edge_labels=[], node_attrs=[], edge_attrs=[]): | |||
| if method == 'default': | |||
| gxl_file = open(filename, 'w') | |||
| gxl_file.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n") | |||
| gxl_file.write("<!DOCTYPE gxl SYSTEM \"http://www.gupro.de/GXL/gxl-1.0.dtd\">\n") | |||
| gxl_file.write("<gxl xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n") | |||
| if 'name' in graph.graph: | |||
| name = str(graph.graph['name']) | |||
| else: | |||
| name = 'dummy' | |||
| gxl_file.write("<graph id=\"" + name + "\" edgeids=\"false\" edgemode=\"undirected\">\n") | |||
| for v, attrs in graph.nodes(data=True): | |||
| gxl_file.write("<node id=\"_" + str(v) + "\">") | |||
| for l_name in node_labels: | |||
| gxl_file.write("<attr name=\"" + l_name + "\"><int>" + | |||
| str(attrs[l_name]) + "</int></attr>") | |||
| for a_name in node_attrs: | |||
| gxl_file.write("<attr name=\"" + a_name + "\"><float>" + | |||
| str(attrs[a_name]) + "</float></attr>") | |||
| gxl_file.write("</node>\n") | |||
| for v1, v2, attrs in graph.edges(data=True): | |||
| gxl_file.write("<edge from=\"_" + str(v1) + "\" to=\"_" + str(v2) + "\">") | |||
| for l_name in edge_labels: | |||
| gxl_file.write("<attr name=\"" + l_name + "\"><int>" + | |||
| str(attrs[l_name]) + "</int></attr>") | |||
| for a_name in edge_attrs: | |||
| gxl_file.write("<attr name=\"" + a_name + "\"><float>" + | |||
| str(attrs[a_name]) + "</float></attr>") | |||
| gxl_file.write("</edge>\n") | |||
| gxl_file.write("</graph>\n") | |||
| gxl_file.write("</gxl>") | |||
| gxl_file.close() | |||
| elif method == 'benoit': | |||
| import xml.etree.ElementTree as ET | |||
| root_node = ET.Element('gxl') | |||
| attr = dict() | |||
| attr['id'] = str(graph.graph['name']) | |||
| attr['edgeids'] = 'true' | |||
| attr['edgemode'] = 'undirected' | |||
| graph_node = ET.SubElement(root_node, 'graph', attrib=attr) | |||
| for v in graph: | |||
| current_node = ET.SubElement(graph_node, 'node', attrib={'id': str(v)}) | |||
| for attr in graph.nodes[v].keys(): | |||
| cur_attr = ET.SubElement( | |||
| current_node, 'attr', attrib={'name': attr}) | |||
| cur_value = ET.SubElement(cur_attr, | |||
| graph.nodes[v][attr].__class__.__name__) | |||
| cur_value.text = graph.nodes[v][attr] | |||
| for v1 in graph: | |||
| for v2 in graph[v1]: | |||
| if (v1 < v2): # Non oriented graphs | |||
| cur_edge = ET.SubElement( | |||
| graph_node, | |||
| 'edge', | |||
| attrib={ | |||
| 'from': str(v1), | |||
| 'to': str(v2) | |||
| }) | |||
| for attr in graph[v1][v2].keys(): | |||
| cur_attr = ET.SubElement( | |||
| cur_edge, 'attr', attrib={'name': attr}) | |||
| cur_value = ET.SubElement( | |||
| cur_attr, graph[v1][v2][attr].__class__.__name__) | |||
| cur_value.text = str(graph[v1][v2][attr]) | |||
| tree = ET.ElementTree(root_node) | |||
| tree.write(filename) | |||
| elif method == 'gedlib': | |||
| # reference: https://github.com/dbblumenthal/gedlib/blob/master/data/generate_molecules.py#L22 | |||
| # pass | |||
| gxl_file = open(filename, 'w') | |||
| gxl_file.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n") | |||
| gxl_file.write("<!DOCTYPE gxl SYSTEM \"http://www.gupro.de/GXL/gxl-1.0.dtd\">\n") | |||
| gxl_file.write("<gxl xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n") | |||
| gxl_file.write("<graph id=\"" + str(graph.graph['name']) + "\" edgeids=\"true\" edgemode=\"undirected\">\n") | |||
| for v, attrs in graph.nodes(data=True): | |||
| gxl_file.write("<node id=\"_" + str(v) + "\">") | |||
| gxl_file.write("<attr name=\"" + "chem" + "\"><int>" + str(attrs['chem']) + "</int></attr>") | |||
| gxl_file.write("</node>\n") | |||
| for v1, v2, attrs in graph.edges(data=True): | |||
| gxl_file.write("<edge from=\"_" + str(v1) + "\" to=\"_" + str(v2) + "\">") | |||
| gxl_file.write("<attr name=\"valence\"><int>" + str(attrs['valence']) + "</int></attr>") | |||
| # gxl_file.write("<attr name=\"valence\"><int>" + "1" + "</int></attr>") | |||
| gxl_file.write("</edge>\n") | |||
| gxl_file.write("</graph>\n") | |||
| gxl_file.write("</gxl>") | |||
| gxl_file.close() | |||
| elif method == 'gedlib-letter': | |||
| # reference: https://github.com/dbblumenthal/gedlib/blob/master/data/generate_molecules.py#L22 | |||
| # and https://github.com/dbblumenthal/gedlib/blob/master/data/datasets/Letter/HIGH/AP1_0000.gxl | |||
| gxl_file = open(filename, 'w') | |||
| gxl_file.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n") | |||
| gxl_file.write("<!DOCTYPE gxl SYSTEM \"http://www.gupro.de/GXL/gxl-1.0.dtd\">\n") | |||
| gxl_file.write("<gxl xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n") | |||
| gxl_file.write("<graph id=\"" + str(graph.graph['name']) + "\" edgeids=\"false\" edgemode=\"undirected\">\n") | |||
| for v, attrs in graph.nodes(data=True): | |||
| gxl_file.write("<node id=\"_" + str(v) + "\">") | |||
| gxl_file.write("<attr name=\"x\"><float>" + str(attrs['attributes'][0]) + "</float></attr>") | |||
| gxl_file.write("<attr name=\"y\"><float>" + str(attrs['attributes'][1]) + "</float></attr>") | |||
| gxl_file.write("</node>\n") | |||
| for v1, v2, attrs in graph.edges(data=True): | |||
| gxl_file.write("<edge from=\"_" + str(v1) + "\" to=\"_" + str(v2) + "\"/>\n") | |||
| gxl_file.write("</graph>\n") | |||
| gxl_file.write("</gxl>") | |||
| gxl_file.close() | |||
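| # A minimal usage sketch (kept commented out; added for illustration). It assumes this | |||
| # function is exposed as `saveGXL(graph, filename, method=...)`, as it is called elsewhere | |||
| # in this repository. The 'gedlib' branch expects node labels under 'chem' and edge | |||
| # labels under 'valence': | |||
| # import networkx as nx | |||
| # g = nx.Graph(name='mol_0') | |||
| # g.add_node('0', chem=6) | |||
| # g.add_node('1', chem=8) | |||
| # g.add_edge('0', '1', valence=1) | |||
| # saveGXL(g, 'mol_0.gxl', method='gedlib') | |||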
| # def loadSDF(filename): | |||
| # """load data from structured data file (.sdf file). | |||
| # Notes | |||
| # ------ | |||
| # A SDF file contains a group of molecules, represented in the similar way as in MOL format. | |||
| # Check `here <http://www.nonlinear.com/progenesis/sdf-studio/v0.9/faq/sdf-file-format-guidance.aspx>`__ for detailed structure. | |||
| # """ | |||
| # import networkx as nx | |||
| # from os.path import basename | |||
| # from tqdm import tqdm | |||
| # import sys | |||
| # data = [] | |||
| # with open(filename) as f: | |||
| # content = f.read().splitlines() | |||
| # index = 0 | |||
| # pbar = tqdm(total=len(content) + 1, desc='load SDF', file=sys.stdout) | |||
| # while index < len(content): | |||
| # index_old = index | |||
| # g = nx.Graph(name=content[index].strip()) # set name of the graph | |||
| # tmp = content[index + 3] | |||
| # nb_nodes = int(tmp[:3]) # number of the nodes | |||
| # nb_edges = int(tmp[3:6]) # number of the edges | |||
| # for i in range(0, nb_nodes): | |||
| # tmp = content[i + index + 4] | |||
| # g.add_node(i, atom=tmp[31:34].strip()) | |||
| # for i in range(0, nb_edges): | |||
| # tmp = content[i + index + g.number_of_nodes() + 4] | |||
| # tmp = [tmp[i:i + 3] for i in range(0, len(tmp), 3)] | |||
| # g.add_edge( | |||
| # int(tmp[0]) - 1, int(tmp[1]) - 1, bond_type=tmp[2].strip()) | |||
| # data.append(g) | |||
| # index += 4 + g.number_of_nodes() + g.number_of_edges() | |||
| # while content[index].strip() != '$$$$': # separator | |||
| # index += 1 | |||
| # index += 1 | |||
| # pbar.update(index - index_old) | |||
| # pbar.update(1) | |||
| # pbar.close() | |||
| # return data | |||
| # def load_from_cxl(filename): | |||
| # import xml.etree.ElementTree as ET | |||
| # | |||
| # dirname_dataset = dirname(filename) | |||
| # tree = ET.parse(filename) | |||
| # root = tree.getroot() | |||
| # data = [] | |||
| # y = [] | |||
| # for graph in root.iter('graph'): | |||
| # mol_filename = graph.attrib['file'] | |||
| # mol_class = graph.attrib['class'] | |||
| # data.append(load_gxl(dirname_dataset + '/' + mol_filename)) | |||
| # y.append(mol_class) | |||
| if __name__ == '__main__': | |||
| # ### Load dataset from .ds file. | |||
| # # .ct files. | |||
| # ds = {'name': 'Alkane', 'dataset': '../../datasets/Alkane/dataset.ds', | |||
| # 'dataset_y': '../../datasets/Alkane/dataset_boiling_point_names.txt'} | |||
| # Gn, y = loadDataset(ds['dataset'], filename_y=ds['dataset_y']) | |||
| # ds_file = '../../datasets/Acyclic/dataset_bps.ds' # node symb | |||
| # Gn, targets, label_names = load_dataset(ds_file) | |||
| # ds_file = '../../datasets/MAO/dataset.ds' # node/edge symb | |||
| # Gn, targets, label_names = load_dataset(ds_file) | |||
| ## ds = {'name': 'PAH', 'dataset': '../../datasets/PAH/dataset.ds'} # unlabeled | |||
| ## Gn, y = loadDataset(ds['dataset']) | |||
| # print(Gn[1].graph) | |||
| # print(Gn[1].nodes(data=True)) | |||
| # print(Gn[1].edges(data=True)) | |||
| # print(targets[1]) | |||
| # # .gxl file. | |||
| # ds_file = '../../datasets/monoterpenoides/dataset_10+.ds' # node/edge symb | |||
| # Gn, y, label_names = load_dataset(ds_file) | |||
| # print(Gn[1].graph) | |||
| # print(Gn[1].nodes(data=True)) | |||
| # print(Gn[1].edges(data=True)) | |||
| # print(y[1]) | |||
| # .mat file. | |||
| from gklearn.dataset.file_managers import DataLoader # import needed for the example below. | |||
| ds_file = '../../datasets/MUTAG_mat/MUTAG.mat' | |||
| order = [0, 0, 3, 1, 2] | |||
| gloader = DataLoader(ds_file, order=order) | |||
| Gn, targets, label_names = gloader.data | |||
| print(Gn[1].graph) | |||
| print(Gn[1].nodes(data=True)) | |||
| print(Gn[1].edges(data=True)) | |||
| print(targets[1]) | |||
| # ### Convert graph from one format to another. | |||
| # # .gxl file. | |||
| # import networkx as nx | |||
| # ds = {'name': 'monoterpenoides', | |||
| # 'dataset': '../../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb | |||
| # Gn, y = loadDataset(ds['dataset']) | |||
| # y = [int(i) for i in y] | |||
| # print(Gn[1].nodes(data=True)) | |||
| # print(Gn[1].edges(data=True)) | |||
| # print(y[1]) | |||
| # # Convert a graph to the proper NetworkX format that can be recognized by library gedlib. | |||
| # Gn_new = [] | |||
| # for G in Gn: | |||
| # G_new = nx.Graph() | |||
| # for nd, attrs in G.nodes(data=True): | |||
| # G_new.add_node(str(nd), chem=attrs['atom']) | |||
| # for nd1, nd2, attrs in G.edges(data=True): | |||
| # G_new.add_edge(str(nd1), str(nd2), valence=attrs['bond_type']) | |||
| ## G_new.add_edge(str(nd1), str(nd2)) | |||
| # Gn_new.append(G_new) | |||
| # print(Gn_new[1].nodes(data=True)) | |||
| # print(Gn_new[1].edges(data=True)) | |||
| # print(Gn_new[1]) | |||
| # filename = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/generated_datsets/monoterpenoides/gxl/monoterpenoides' | |||
| # xparams = {'method': 'gedlib'} | |||
| # saveDataset(Gn, y, gformat='gxl', group='xml', filename=filename, xparams=xparams) | |||
| # save dataset. | |||
| # ds = {'name': 'MUTAG', 'dataset': '../../datasets/MUTAG/MUTAG.mat', | |||
| # 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}} # node/edge symb | |||
| # Gn, y = loadDataset(ds['dataset'], extra_params=ds['extra_params']) | |||
| # saveDataset(Gn, y, group='xml', filename='temp/temp') | |||
| # test - new way to add labels and attributes. | |||
| # dataset = '../../datasets/SYNTHETICnew/SYNTHETICnew_A.txt' | |||
| # filename = '../../datasets/Fingerprint/Fingerprint_A.txt' | |||
| # dataset = '../../datasets/Letter-med/Letter-med_A.txt' | |||
| # dataset = '../../datasets/AIDS/AIDS_A.txt' | |||
| # dataset = '../../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt' | |||
| # Gn, targets, label_names = load_dataset(filename) | |||
| pass | |||
| @@ -0,0 +1,61 @@ | |||
| #!/usr/bin/env python3 | |||
| # -*- coding: utf-8 -*- | |||
| """ | |||
| Created on Fri Sep 11 18:10:06 2020 | |||
| @author: ljia | |||
| """ | |||
| import numpy as np | |||
| import networkx as nx | |||
| import random | |||
| class GraphSynthesizer(object): | |||
| def __init__(self, g_type=None, *args, **kwargs): | |||
| if g_type == 'unified': | |||
| self._graphs = self.unified_graphs(*args, **kwargs) | |||
| else: | |||
| self._graphs = None | |||
| def random_graph(self, num_nodes, num_edges, num_node_labels=0, num_edge_labels=0, seed=None, directed=False, max_num_edges=None, all_edges=None): | |||
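| # `all_edges` lists every candidate undirected edge and `max_num_edges` is its length; | |||
| # both are precomputed by the caller (see unified_graphs) so they are built only once. | |||
| # Note: the `seed` and `directed` arguments are accepted but not used yet. | |||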
| g = nx.Graph() | |||
| if num_node_labels > 0: | |||
| node_labels = np.random.randint(0, high=num_node_labels, size=num_nodes) | |||
| for i in range(0, num_nodes): | |||
| g.add_node(str(i), atom=node_labels[i]) # @todo: update "atom". | |||
| else: | |||
| for i in range(0, num_nodes): | |||
| g.add_node(str(i)) | |||
| if num_edge_labels > 0: | |||
| edge_labels = np.random.randint(0, high=num_edge_labels, size=num_edges) | |||
| for idx, i in enumerate(random.sample(range(0, max_num_edges), num_edges)): | |||
| node1, node2 = all_edges[i] | |||
| g.add_edge(str(node1), str(node2), bond_type=edge_labels[idx]) # @todo: update "bond_type". | |||
| else: | |||
| for i in random.sample(range(0, max_num_edges), num_edges): | |||
| node1, node2 = all_edges[i] | |||
| g.add_edge(str(node1), str(node2)) | |||
| return g | |||
| def unified_graphs(self, num_graphs=1000, num_nodes=20, num_edges=40, num_node_labels=0, num_edge_labels=0, seed=None, directed=False): | |||
| max_num_edges = int((num_nodes - 1) * num_nodes / 2) | |||
| if num_edges > max_num_edges: | |||
| raise ValueError('Too many edges: num_edges must not exceed num_nodes * (num_nodes - 1) / 2.') | |||
| all_edges = [(i, j) for i in range(0, num_nodes) for j in range(i + 1, num_nodes)] # @todo: optimize. No directed graphs. | |||
| graphs = [] | |||
| for idx in range(0, num_graphs): | |||
| graphs.append(self.random_graph(num_nodes, num_edges, num_node_labels=num_node_labels, num_edge_labels=num_edge_labels, seed=seed, directed=directed, max_num_edges=max_num_edges, all_edges=all_edges)) | |||
| return graphs | |||
| @property | |||
| def graphs(self): | |||
| return self._graphs | |||
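| # A minimal usage sketch (added for illustration; not part of the original module): | |||
| if __name__ == '__main__': | |||
| # Synthesize 10 unlabeled graphs, each with 20 nodes and 40 randomly placed edges. | |||
| gsyzer = GraphSynthesizer(g_type='unified', num_graphs=10, num_nodes=20, num_edges=40) | |||
| print(len(gsyzer.graphs), gsyzer.graphs[0].number_of_nodes()) | |||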
| @@ -0,0 +1,142 @@ | |||
| #!/usr/bin/env python3 | |||
| # -*- coding: utf-8 -*- | |||
| """ | |||
| Created on Wed Oct 20 11:48:02 2020 | |||
| @author: ljia | |||
| """ | |||
| # This script tests the influence of the maximal number of solutions and the number of graphs N on the stability of GED computation with the BIPARTITE heuristic, where the base edit costs are [1, 1, 1, 1, 1, 1]. | |||
| import os | |||
| import multiprocessing | |||
| import pickle | |||
| import logging | |||
| from gklearn.ged.util import compute_geds | |||
| import time | |||
| import sys | |||
| from group_results import group_trials | |||
| def generate_graphs(): | |||
| from gklearn.utils.graph_synthesizer import GraphSynthesizer | |||
| gsyzer = GraphSynthesizer() | |||
| graphs = gsyzer.unified_graphs(num_graphs=100, num_nodes=20, num_edges=20, num_node_labels=0, num_edge_labels=0, seed=None, directed=False) | |||
| return graphs | |||
| def xp_compute_ged_matrix(graphs, N, max_num_solutions, ratio, trial): | |||
| save_file_suffix = '.' + str(N) + '.mnum_sols_' + str(max_num_solutions) + '.ratio_' + "{:.2f}".format(ratio) + '.trial_' + str(trial) | |||
| # Return if the file exists. | |||
| if os.path.isfile(save_dir + 'ged_matrix' + save_file_suffix + '.pkl'): | |||
| return None, None | |||
| """**2. Set parameters.**""" | |||
| # Parameters for GED computation. | |||
| ged_options = {'method': 'BIPARTITE', # use the BIPARTITE heuristic. | |||
| # 'initialization_method': 'RANDOM', # or 'NODE', etc. (for GEDEnv) | |||
| 'lsape_model': 'ECBP', | |||
| # @todo: check: when greater than 1, the method is considered mIPFP. | |||
| # the actual number of computed solutions might be smaller than the specified value. | |||
| 'max_num_solutions': max_num_solutions, | |||
| 'edit_cost': 'CONSTANT', # use CONSTANT cost. | |||
| 'greedy_method': 'BASIC', | |||
| # the distance between non-symbolic node/edge labels is computed as the Euclidean distance. | |||
| 'attr_distance': 'euclidean', | |||
| 'optimal': True, # if TRUE, the option --greedy-method has no effect. | |||
| # number of parallel threads; has no effect if mpg_options['parallel'] is False. | |||
| 'threads': multiprocessing.cpu_count(), | |||
| 'centrality_method': 'NONE', | |||
| 'centrality_weight': 0.7, | |||
| 'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES' | |||
| } | |||
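| # The first three constants (node edit costs) are scaled by `ratio`; the last three (edge edit costs) stay at 1. | |||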
| edit_cost_constants = [i * ratio for i in [1, 1, 1]] + [1, 1, 1] | |||
| # edit_cost_constants = [item * 0.01 for item in edit_cost_constants] | |||
| # pickle.dump(edit_cost_constants, open(save_dir + "edit_costs" + save_file_suffix + ".pkl", "wb")) | |||
| options = ged_options.copy() | |||
| options['edit_cost_constants'] = edit_cost_constants | |||
| options['node_labels'] = [] | |||
| options['edge_labels'] = [] | |||
| options['node_attrs'] = [] | |||
| options['edge_attrs'] = [] | |||
| parallel = True # if num_solutions == 1 else False | |||
| """**5. Compute GED matrix.**""" | |||
| ged_mat = 'error' | |||
| runtime = 0 | |||
| try: | |||
| time0 = time.time() | |||
| ged_vec_init, ged_mat, n_edit_operations = compute_geds(graphs, options=options, repeats=1, parallel=parallel, verbose=True) | |||
| runtime = time.time() - time0 | |||
| except Exception as exp: | |||
| print('An exception occurred when running this experiment:') | |||
| LOG_FILENAME = save_dir + 'error.txt' | |||
| logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG) | |||
| logging.exception(save_file_suffix) | |||
| print(repr(exp)) | |||
| """**6. Get results.**""" | |||
| with open(save_dir + 'ged_matrix' + save_file_suffix + '.pkl', 'wb') as f: | |||
| pickle.dump(ged_mat, f) | |||
| with open(save_dir + 'runtime' + save_file_suffix + '.pkl', 'wb') as f: | |||
| pickle.dump(runtime, f) | |||
| return ged_mat, runtime | |||
| def save_trials_as_group(graphs, N, max_num_solutions, ratio): | |||
| # Return if the group file exists. | |||
| name_middle = '.' + str(N) + '.mnum_sols_' + str(max_num_solutions) + '.ratio_' + "{:.2f}".format(ratio) + '.' | |||
| name_group = save_dir + 'groups/ged_mats' + name_middle + 'npy' | |||
| if os.path.isfile(name_group): | |||
| return | |||
| ged_mats = [] | |||
| runtimes = [] | |||
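| # Run 100 trials; each trial's GED matrix and runtime are cached to .pkl files, so completed trials are skipped on re-runs. | |||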
| for trial in range(1, 101): | |||
| print() | |||
| print('Trial:', trial) | |||
| ged_mat, runtime = xp_compute_ged_matrix(graphs, N, max_num_solutions, ratio, trial) | |||
| ged_mats.append(ged_mat) | |||
| runtimes.append(runtime) | |||
| # Group the trial files into a single file and remove the per-trial files (override=True, clear=True, backup=False). | |||
| name_prefix = 'ged_matrix' + name_middle | |||
| group_trials(save_dir, name_prefix, True, True, False) | |||
| name_prefix = 'runtime' + name_middle | |||
| group_trials(save_dir, name_prefix, True, True, False) | |||
| def results_for_a_ratio(ratio): | |||
| for N in N_list: | |||
| print() | |||
| print('# of graphs:', N) | |||
| for max_num_solutions in [1, 20, 40, 60, 80, 100]: | |||
| print() | |||
| print('Max # of solutions:', max_num_solutions) | |||
| save_trials_as_group(graphs[:N], N, max_num_solutions, ratio) | |||
| if __name__ == '__main__': | |||
| if len(sys.argv) > 1: | |||
| N_list = [int(i) for i in sys.argv[1:]] | |||
| else: | |||
| N_list = [10, 50, 100] | |||
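| # e.g. running this script with "10 50" on the command line computes results only for N = 10 and N = 50. | |||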
| # Generate graphs. | |||
| graphs = generate_graphs() | |||
| save_dir = 'outputs/edit_costs.max_num_sols.N.bipartite/' | |||
| os.makedirs(save_dir, exist_ok=True) | |||
| os.makedirs(save_dir + 'groups/', exist_ok=True) | |||
| for ratio in [10, 1, 0.1]: | |||
| print() | |||
| print('Ratio:', ratio) | |||
| results_for_a_ratio(ratio) | |||
| @@ -12,18 +12,19 @@ import multiprocessing | |||
| import pickle | |||
| import logging | |||
| from gklearn.ged.util import compute_geds | |||
| import numpy as np | |||
| import time | |||
| from utils import get_dataset | |||
| import sys | |||
| from group_results import group_trials | |||
| def xp_compute_ged_matrix(dataset, ds_name, max_num_solutions, ratio, trial): | |||
| save_file_suffix = '.' + ds_name + '.mnum_sols_' + str(max_num_solutions) + '.ratio_' + "{:.2f}".format(ratio) + '.trial_' + str(trial) | |||
| """**1. Get dataset.**""" | |||
| dataset = get_dataset(ds_name) | |||
| # Return if the file exists. | |||
| if os.path.isfile(save_dir + 'ged_matrix' + save_file_suffix + '.pkl'): | |||
| return None, None | |||
| """**2. Set parameters.**""" | |||
| @@ -83,6 +84,12 @@ def xp_compute_ged_matrix(dataset, ds_name, max_num_solutions, ratio, trial): | |||
| def save_trials_as_group(dataset, ds_name, max_num_solutions, ratio): | |||
| # Return if the group file exists. | |||
| name_middle = '.' + ds_name + '.mnum_sols_' + str(max_num_solutions) + '.ratio_' + "{:.2f}".format(ratio) + '.' | |||
| name_group = save_dir + 'groups/ged_mats' + name_middle + 'npy' | |||
| if os.path.isfile(name_group): | |||
| return | |||
| ged_mats = [] | |||
| runtimes = [] | |||
| for trial in range(1, 101): | |||
| @@ -92,25 +99,36 @@ def save_trials_as_group(dataset, ds_name, max_num_solutions, ratio): | |||
| ged_mats.append(ged_mat) | |||
| runtimes.append(runtime) | |||
| save_file_suffix = '.' + ds_name + '.mnum_sols_' + str(max_num_solutions) + '.ratio_' + "{:.2f}".format(ratio) | |||
| with open(save_dir + 'groups/ged_mats' + save_file_suffix + '.npy', 'wb') as f: | |||
| np.save(f, np.array(ged_mats)) | |||
| with open(save_dir + 'groups/runtimes' + save_file_suffix + '.pkl', 'wb') as f: | |||
| pickle.dump(runtime, f) | |||
| # Group the trial files into a single file and remove the per-trial files (override=True, clear=True, backup=False). | |||
| name_prefix = 'ged_matrix' + name_middle | |||
| group_trials(save_dir, name_prefix, True, True, False) | |||
| name_prefix = 'runtime' + name_middle | |||
| group_trials(save_dir, name_prefix, True, True, False) | |||
| def results_for_a_dataset(ds_name): | |||
| """**1. Get dataset.**""" | |||
| dataset = get_dataset(ds_name) | |||
| for max_num_solutions in [1, 20, 40, 60, 80, 100]: | |||
| for max_num_solutions in mnum_solutions_list: | |||
| print() | |||
| print('Max # of solutions:', max_num_solutions) | |||
| for ratio in [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9]: | |||
| for ratio in ratio_list: | |||
| print() | |||
| print('Ratio:', ratio) | |||
| save_trials_as_group(dataset, ds_name, max_num_solutions, ratio) | |||
| def get_param_lists(ds_name): | |||
| if ds_name == 'AIDS_symb': | |||
| mnum_solutions_list = [1, 20, 40, 60, 80, 100] | |||
| ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9] | |||
| else: | |||
| mnum_solutions_list = [1, 20, 40, 60, 80, 100] | |||
| ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9] | |||
| return mnum_solutions_list, ratio_list | |||
| if __name__ == '__main__': | |||
| if len(sys.argv) > 1: | |||
| @@ -119,12 +137,11 @@ if __name__ == '__main__': | |||
| ds_name_list = ['MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb'] | |||
| save_dir = 'outputs/edit_costs.max_num_sols.ratios.bipartite/' | |||
| if not os.path.exists(save_dir): | |||
| os.makedirs(save_dir) | |||
| if not os.path.exists(save_dir + 'groups/'): | |||
| os.makedirs(save_dir + 'groups/') | |||
| os.makedirs(save_dir, exist_ok=True) | |||
| os.makedirs(save_dir + 'groups/', exist_ok=True) | |||
| for ds_name in ds_name_list: | |||
| print() | |||
| print('Dataset:', ds_name) | |||
| mnum_solutions_list, ratio_list = get_param_lists(ds_name) | |||
| results_for_a_dataset(ds_name) | |||
| @@ -0,0 +1,137 @@ | |||
| #!/usr/bin/env python3 | |||
| # -*- coding: utf-8 -*- | |||
| """ | |||
| Created on Wed Oct 20 11:48:02 2020 | |||
| @author: ljia | |||
| """ | |||
| # This script tests the influence of the number of solutions and the number of graphs N on the stability of GED computation with the IPFP heuristic, where the base edit costs are [1, 1, 1, 1, 1, 1]. | |||
| import os | |||
| import multiprocessing | |||
| import pickle | |||
| import logging | |||
| from gklearn.ged.util import compute_geds | |||
| import time | |||
| import sys | |||
| from group_results import group_trials | |||
| def generate_graphs(): | |||
| from gklearn.utils.graph_synthesizer import GraphSynthesizer | |||
| gsyzer = GraphSynthesizer() | |||
| graphs = gsyzer.unified_graphs(num_graphs=100, num_nodes=20, num_edges=20, num_node_labels=0, num_edge_labels=0, seed=None, directed=False) | |||
| return graphs | |||
| def xp_compute_ged_matrix(graphs, N, num_solutions, ratio, trial): | |||
| save_file_suffix = '.' + str(N) + '.num_sols_' + str(num_solutions) + '.ratio_' + "{:.2f}".format(ratio) + '.trial_' + str(trial) | |||
| # Return if the file exists. | |||
| if os.path.isfile(save_dir + 'ged_matrix' + save_file_suffix + '.pkl'): | |||
| return None, None | |||
| """**2. Set parameters.**""" | |||
| # Parameters for GED computation. | |||
| ged_options = {'method': 'IPFP', # use the IPFP heuristic. | |||
| 'initialization_method': 'RANDOM', # or 'NODE', etc. | |||
| # when greater than 1, the method is considered mIPFP. | |||
| 'initial_solutions': int(num_solutions * 4), | |||
| 'edit_cost': 'CONSTANT', # use CONSTANT cost. | |||
| # the distance between non-symbolic node/edge labels is computed as the Euclidean distance. | |||
| 'attr_distance': 'euclidean', | |||
| 'ratio_runs_from_initial_solutions': 0.25, | |||
| # number of parallel threads; has no effect if mpg_options['parallel'] is False. | |||
| 'threads': multiprocessing.cpu_count(), | |||
| 'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES' | |||
| } | |||
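| # The first three constants (node edit costs) are scaled by `ratio`; the last three (edge edit costs) stay at 1. | |||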
| edit_cost_constants = [i * ratio for i in [1, 1, 1]] + [1, 1, 1] | |||
| # edit_cost_constants = [item * 0.01 for item in edit_cost_constants] | |||
| # pickle.dump(edit_cost_constants, open(save_dir + "edit_costs" + save_file_suffix + ".pkl", "wb")) | |||
| options = ged_options.copy() | |||
| options['edit_cost_constants'] = edit_cost_constants | |||
| options['node_labels'] = [] | |||
| options['edge_labels'] = [] | |||
| options['node_attrs'] = [] | |||
| options['edge_attrs'] = [] | |||
| parallel = True # if num_solutions == 1 else False | |||
| """**5. Compute GED matrix.**""" | |||
| ged_mat = 'error' | |||
| runtime = 0 | |||
| try: | |||
| time0 = time.time() | |||
| ged_vec_init, ged_mat, n_edit_operations = compute_geds(graphs, options=options, repeats=1, parallel=parallel, verbose=True) | |||
| runtime = time.time() - time0 | |||
| except Exception as exp: | |||
| print('An exception occurred when running this experiment:') | |||
| LOG_FILENAME = save_dir + 'error.txt' | |||
| logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG) | |||
| logging.exception(save_file_suffix) | |||
| print(repr(exp)) | |||
| """**6. Get results.**""" | |||
| with open(save_dir + 'ged_matrix' + save_file_suffix + '.pkl', 'wb') as f: | |||
| pickle.dump(ged_mat, f) | |||
| with open(save_dir + 'runtime' + save_file_suffix + '.pkl', 'wb') as f: | |||
| pickle.dump(runtime, f) | |||
| return ged_mat, runtime | |||
| def save_trials_as_group(graphs, N, num_solutions, ratio): | |||
| # Return if the group file exists. | |||
| name_middle = '.' + str(N) + '.num_sols_' + str(num_solutions) + '.ratio_' + "{:.2f}".format(ratio) + '.' | |||
| name_group = save_dir + 'groups/ged_mats' + name_middle + 'npy' | |||
| if os.path.isfile(name_group): | |||
| return | |||
| ged_mats = [] | |||
| runtimes = [] | |||
| for trial in range(1, 101): | |||
| print() | |||
| print('Trial:', trial) | |||
| ged_mat, runtime = xp_compute_ged_matrix(graphs, N, num_solutions, ratio, trial) | |||
| ged_mats.append(ged_mat) | |||
| runtimes.append(runtime) | |||
| # Group the trial files into a single file and remove the per-trial files (override=True, clear=True, backup=False). | |||
| name_prefix = 'ged_matrix' + name_middle | |||
| group_trials(save_dir, name_prefix, True, True, False) | |||
| name_prefix = 'runtime' + name_middle | |||
| group_trials(save_dir, name_prefix, True, True, False) | |||
| def results_for_a_ratio(ratio): | |||
| for N in N_list: | |||
| print() | |||
| print('# of graphs:', N) | |||
| for num_solutions in [1, 20, 40, 60, 80, 100]: | |||
| print() | |||
| print('# of solutions:', num_solutions) | |||
| save_trials_as_group(graphs[:N], N, num_solutions, ratio) | |||
| if __name__ == '__main__': | |||
| if len(sys.argv) > 1: | |||
| N_list = [int(i) for i in sys.argv[1:]] | |||
| else: | |||
| N_list = [10, 50, 100] | |||
| # Generate graphs. | |||
| graphs = generate_graphs() | |||
| save_dir = 'outputs/edit_costs.num_sols.N.IPFP/' | |||
| os.makedirs(save_dir, exist_ok=True) | |||
| os.makedirs(save_dir + 'groups/', exist_ok=True) | |||
| for ratio in [10, 1, 0.1]: | |||
| print() | |||
| print('Ratio:', ratio) | |||
| results_for_a_ratio(ratio) | |||
| @@ -12,15 +12,19 @@ import multiprocessing | |||
| import pickle | |||
| import logging | |||
| from gklearn.ged.util import compute_geds | |||
| import numpy as np | |||
| import time | |||
| from utils import get_dataset | |||
| import sys | |||
| from group_results import group_trials | |||
| def xp_compute_ged_matrix(dataset, ds_name, num_solutions, ratio, trial): | |||
| save_file_suffix = '.' + ds_name + '.num_sols_' + str(num_solutions) + '.ratio_' + "{:.2f}".format(ratio) + '.trial_' + str(trial) | |||
| # Return if the file exists. | |||
| if os.path.isfile(save_dir + 'ged_matrix' + save_file_suffix + '.pkl'): | |||
| return None, None | |||
| """**2. Set parameters.**""" | |||
| @@ -39,8 +43,8 @@ def xp_compute_ged_matrix(dataset, ds_name, num_solutions, ratio, trial): | |||
| } | |||
| edit_cost_constants = [i * ratio for i in [1, 1, 1]] + [1, 1, 1] | |||
| # edit_cost_constants = [item * 0.01 for item in edit_cost_constants] | |||
| # pickle.dump(edit_cost_constants, open(save_dir + "edit_costs" + save_file_suffix + ".pkl", "wb")) | |||
| # edit_cost_constants = [item * 0.01 for item in edit_cost_constants] | |||
| # pickle.dump(edit_cost_constants, open(save_dir + "edit_costs" + save_file_suffix + ".pkl", "wb")) | |||
| options = ged_options.copy() | |||
| options['edit_cost_constants'] = edit_cost_constants | |||
| @@ -55,7 +59,7 @@ def xp_compute_ged_matrix(dataset, ds_name, num_solutions, ratio, trial): | |||
| runtime = 0 | |||
| try: | |||
| time0 = time.time() | |||
| ged_vec_init, ged_mat, n_edit_operations = compute_geds(dataset.graphs, options=options, parallel=parallel, verbose=True) | |||
| ged_vec_init, ged_mat, n_edit_operations = compute_geds(dataset.graphs, options=options, repeats=1, parallel=parallel, verbose=True) | |||
| runtime = time.time() - time0 | |||
| except Exception as exp: | |||
| print('An exception occured when running this experiment:') | |||
| @@ -70,11 +74,17 @@ def xp_compute_ged_matrix(dataset, ds_name, num_solutions, ratio, trial): | |||
| pickle.dump(ged_mat, f) | |||
| with open(save_dir + 'runtime' + save_file_suffix + '.pkl', 'wb') as f: | |||
| pickle.dump(runtime, f) | |||
| return ged_mat, runtime | |||
| def save_trials_as_group(dataset, ds_name, num_solutions, ratio): | |||
| # Return if the group file exists. | |||
| name_middle = '.' + ds_name + '.num_sols_' + str(num_solutions) + '.ratio_' + "{:.2f}".format(ratio) + '.' | |||
| name_group = save_dir + 'groups/ged_mats' + name_middle + 'npy' | |||
| if os.path.isfile(name_group): | |||
| return | |||
| ged_mats = [] | |||
| runtimes = [] | |||
| for trial in range(1, 101): | |||
| @@ -84,24 +94,35 @@ def save_trials_as_group(dataset, ds_name, num_solutions, ratio): | |||
| ged_mats.append(ged_mat) | |||
| runtimes.append(runtime) | |||
| save_file_suffix = '.' + ds_name + '.num_sols_' + str(num_solutions) + '.ratio_' + "{:.2f}".format(ratio) | |||
| with open(save_dir + 'groups/ged_mats' + save_file_suffix + '.npy', 'wb') as f: | |||
| np.save(f, np.array(ged_mats)) | |||
| with open(save_dir + 'groups/runtimes' + save_file_suffix + '.pkl', 'wb') as f: | |||
| pickle.dump(runtime, f) | |||
| # Group the trial files into a single file and remove the per-trial files (override=True, clear=True, backup=False). | |||
| name_prefix = 'ged_matrix' + name_middle | |||
| group_trials(save_dir, name_prefix, True, True, False) | |||
| name_prefix = 'runtime' + name_middle | |||
| group_trials(save_dir, name_prefix, True, True, False) | |||
| def results_for_a_dataset(ds_name): | |||
| """**1. Get dataset.**""" | |||
| dataset = get_dataset(ds_name) | |||
| for num_solutions in [1, 20, 40, 60, 80, 100]: | |||
| for num_solutions in num_solutions_list: | |||
| print() | |||
| print('# of solutions:', num_solutions) | |||
| for ratio in [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9]: | |||
| for ratio in ratio_list: | |||
| print() | |||
| print('Ratio:', ratio) | |||
| save_trials_as_group(dataset, ds_name, num_solutions, ratio) | |||
| def get_param_lists(ds_name): | |||
| if ds_name == 'AIDS_symb': | |||
| num_solutions_list = [1, 20, 40, 60, 80, 100] | |||
| ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9] | |||
| else: | |||
| num_solutions_list = [1, 20, 40, 60, 80, 100] | |||
| ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9] | |||
| return num_solutions_list, ratio_list | |||
| if __name__ == '__main__': | |||
| @@ -111,12 +132,11 @@ if __name__ == '__main__': | |||
| ds_name_list = ['MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb'] | |||
| save_dir = 'outputs/edit_costs.num_sols.ratios.IPFP/' | |||
| if not os.path.exists(save_dir): | |||
| os.makedirs(save_dir) | |||
| if not os.path.exists(save_dir + 'groups/'): | |||
| os.makedirs(save_dir + 'groups/') | |||
| os.makedirs(save_dir, exist_ok=True) | |||
| os.makedirs(save_dir + 'groups/', exist_ok=True) | |||
| for ds_name in ds_name_list: | |||
| print() | |||
| print('Dataset:', ds_name) | |||
| num_solutions_list, ratio_list = get_param_lists(ds_name) | |||
| results_for_a_dataset(ds_name) | |||
| @@ -0,0 +1,137 @@ | |||
| #!/usr/bin/env python3 | |||
| # -*- coding: utf-8 -*- | |||
| """ | |||
| Created on Wed Oct 20 11:48:02 2020 | |||
| @author: ljia | |||
| """ | |||
| # This script tests the influence of the number of repeats and the number of graphs N on the stability of GED computation with the IPFP heuristic, where the base edit costs are [1, 1, 1, 1, 1, 1]. | |||
| import os | |||
| import multiprocessing | |||
| import pickle | |||
| import logging | |||
| from gklearn.ged.util import compute_geds | |||
| import time | |||
| import sys | |||
| from group_results import group_trials | |||
| def generate_graphs(): | |||
| from gklearn.utils.graph_synthesizer import GraphSynthesizer | |||
| gsyzer = GraphSynthesizer() | |||
| graphs = gsyzer.unified_graphs(num_graphs=100, num_nodes=20, num_edges=20, num_node_labels=0, num_edge_labels=0, seed=None, directed=False) | |||
| return graphs | |||
| def xp_compute_ged_matrix(graphs, N, repeats, ratio, trial): | |||
| save_file_suffix = '.' + str(N) + '.repeats_' + str(repeats) + '.ratio_' + "{:.2f}".format(ratio) + '.trial_' + str(trial) | |||
| # Return if the file exists. | |||
| if os.path.isfile(save_dir + 'ged_matrix' + save_file_suffix + '.pkl'): | |||
| return None, None | |||
| """**2. Set parameters.**""" | |||
| # Parameters for GED computation. | |||
| ged_options = {'method': 'IPFP', # use the IPFP heuristic. | |||
| 'initialization_method': 'RANDOM', # or 'NODE', etc. | |||
| # when greater than 1, the method is considered mIPFP. | |||
| 'initial_solutions': 1, | |||
| 'edit_cost': 'CONSTANT', # use CONSTANT cost. | |||
| # the distance between non-symbolic node/edge labels is computed as the Euclidean distance. | |||
| 'attr_distance': 'euclidean', | |||
| 'ratio_runs_from_initial_solutions': 1, | |||
| # number of parallel threads; has no effect if mpg_options['parallel'] is False. | |||
| 'threads': multiprocessing.cpu_count(), | |||
| 'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES' | |||
| } | |||
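| # The first three constants (node edit costs) are scaled by `ratio`; the last three (edge edit costs) stay at 1. | |||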
| edit_cost_constants = [i * ratio for i in [1, 1, 1]] + [1, 1, 1] | |||
| # edit_cost_constants = [item * 0.01 for item in edit_cost_constants] | |||
| # pickle.dump(edit_cost_constants, open(save_dir + "edit_costs" + save_file_suffix + ".pkl", "wb")) | |||
| options = ged_options.copy() | |||
| options['edit_cost_constants'] = edit_cost_constants | |||
| options['node_labels'] = [] | |||
| options['edge_labels'] = [] | |||
| options['node_attrs'] = [] | |||
| options['edge_attrs'] = [] | |||
| parallel = True # if num_solutions == 1 else False | |||
| """**5. Compute GED matrix.**""" | |||
| ged_mat = 'error' | |||
| runtime = 0 | |||
| try: | |||
| time0 = time.time() | |||
| ged_vec_init, ged_mat, n_edit_operations = compute_geds(graphs, options=options, repeats=repeats, parallel=parallel, verbose=True) | |||
| runtime = time.time() - time0 | |||
| except Exception as exp: | |||
| print('An exception occurred when running this experiment:') | |||
| LOG_FILENAME = save_dir + 'error.txt' | |||
| logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG) | |||
| logging.exception(save_file_suffix) | |||
| print(repr(exp)) | |||
| """**6. Get results.**""" | |||
| with open(save_dir + 'ged_matrix' + save_file_suffix + '.pkl', 'wb') as f: | |||
| pickle.dump(ged_mat, f) | |||
| with open(save_dir + 'runtime' + save_file_suffix + '.pkl', 'wb') as f: | |||
| pickle.dump(runtime, f) | |||
| return ged_mat, runtime | |||
| def save_trials_as_group(graphs, N, repeats, ratio): | |||
| # Return if the group file exists. | |||
| name_middle = '.' + str(N) + '.repeats_' + str(repeats) + '.ratio_' + "{:.2f}".format(ratio) + '.' | |||
| name_group = save_dir + 'groups/ged_mats' + name_middle + 'npy' | |||
| if os.path.isfile(name_group): | |||
| return | |||
| ged_mats = [] | |||
| runtimes = [] | |||
| for trial in range(1, 101): | |||
| print() | |||
| print('Trial:', trial) | |||
| ged_mat, runtime = xp_compute_ged_matrix(graphs, N, repeats, ratio, trial) | |||
| ged_mats.append(ged_mat) | |||
| runtimes.append(runtime) | |||
| # Group the trial files into a single file and remove the per-trial files (override=True, clear=True, backup=False). | |||
| name_prefix = 'ged_matrix' + name_middle | |||
| group_trials(save_dir, name_prefix, True, True, False) | |||
| name_prefix = 'runtime' + name_middle | |||
| group_trials(save_dir, name_prefix, True, True, False) | |||
| def results_for_a_ratio(ratio): | |||
| for N in N_list: | |||
| print() | |||
| print('# of graphs:', N) | |||
| for repeats in [1, 20, 40, 60, 80, 100]: | |||
| print() | |||
| print('Repeats:', repeats) | |||
| save_trials_as_group(graphs[:N], N, repeats, ratio) | |||
| if __name__ == '__main__': | |||
| if len(sys.argv) > 1: | |||
| N_list = [int(i) for i in sys.argv[1:]] | |||
| else: | |||
| N_list = [10, 50, 100] | |||
| # Generate graphs. | |||
| graphs = generate_graphs() | |||
| save_dir = 'outputs/edit_costs.repeats.N.IPFP/' | |||
| os.makedirs(save_dir, exist_ok=True) | |||
| os.makedirs(save_dir + 'groups/', exist_ok=True) | |||
| for ratio in [10, 1, 0.1]: | |||
| print() | |||
| print('Ratio:', ratio) | |||
| results_for_a_ratio(ratio) | |||
| @@ -0,0 +1,142 @@ | |||
| #!/usr/bin/env python3 | |||
| # -*- coding: utf-8 -*- | |||
| """ | |||
| Created on Wed Oct 20 11:48:02 2020 | |||
| @author: ljia | |||
| """ | |||
| # This script tests the influence of the number of repeats and the number of graphs N on the stability of GED computation with the BIPARTITE heuristic, where the base edit costs are [1, 1, 1, 1, 1, 1]. | |||
| import os | |||
| import multiprocessing | |||
| import pickle | |||
| import logging | |||
| from gklearn.ged.util import compute_geds | |||
| import time | |||
| import sys | |||
| from group_results import group_trials | |||
| def generate_graphs(): | |||
| from gklearn.utils.graph_synthesizer import GraphSynthesizer | |||
| gsyzer = GraphSynthesizer() | |||
| graphs = gsyzer.unified_graphs(num_graphs=100, num_nodes=20, num_edges=20, num_node_labels=0, num_edge_labels=0, seed=None, directed=False) | |||
| return graphs | |||
| def xp_compute_ged_matrix(graphs, N, repeats, ratio, trial): | |||
| save_file_suffix = '.' + str(N) + '.repeats_' + str(repeats) + '.ratio_' + "{:.2f}".format(ratio) + '.trial_' + str(trial) | |||
| # Return if the file exists. | |||
| if os.path.isfile(save_dir + 'ged_matrix' + save_file_suffix + '.pkl'): | |||
| return None, None | |||
| """**2. Set parameters.**""" | |||
| # Parameters for GED computation. | |||
| ged_options = {'method': 'BIPARTITE', # use the BIPARTITE heuristic. | |||
| # 'initialization_method': 'RANDOM', # or 'NODE', etc. (for GEDEnv) | |||
| 'lsape_model': 'ECBP', | |||
| # @todo: check: when greater than 1, the method is considered mIPFP. | |||
| # the actual number of computed solutions might be smaller than the specified value. | |||
| 'max_num_solutions': 1, | |||
| 'edit_cost': 'CONSTANT', # use CONSTANT cost. | |||
| 'greedy_method': 'BASIC', | |||
| # the distance between non-symbolic node/edge labels is computed as the Euclidean distance. | |||
| 'attr_distance': 'euclidean', | |||
| 'optimal': True, # if TRUE, the option --greedy-method has no effect. | |||
| # number of parallel threads; has no effect if mpg_options['parallel'] is False. | |||
| 'threads': multiprocessing.cpu_count(), | |||
| 'centrality_method': 'NONE', | |||
| 'centrality_weight': 0.7, | |||
| 'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES' | |||
| } | |||
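| # The first three constants (node edit costs) are scaled by `ratio`; the last three (edge edit costs) stay at 1. | |||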
| edit_cost_constants = [i * ratio for i in [1, 1, 1]] + [1, 1, 1] | |||
| # edit_cost_constants = [item * 0.01 for item in edit_cost_constants] | |||
| # pickle.dump(edit_cost_constants, open(save_dir + "edit_costs" + save_file_suffix + ".pkl", "wb")) | |||
| options = ged_options.copy() | |||
| options['edit_cost_constants'] = edit_cost_constants | |||
| options['node_labels'] = [] | |||
| options['edge_labels'] = [] | |||
| options['node_attrs'] = [] | |||
| options['edge_attrs'] = [] | |||
| parallel = True # if num_solutions == 1 else False | |||
| """**5. Compute GED matrix.**""" | |||
| ged_mat = 'error' | |||
| runtime = 0 | |||
| try: | |||
| time0 = time.time() | |||
| ged_vec_init, ged_mat, n_edit_operations = compute_geds(graphs, options=options, repeats=repeats, parallel=parallel, verbose=True) | |||
| runtime = time.time() - time0 | |||
| except Exception as exp: | |||
| print('An exception occurred when running this experiment:') | |||
| LOG_FILENAME = save_dir + 'error.txt' | |||
| logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG) | |||
| logging.exception(save_file_suffix) | |||
| print(repr(exp)) | |||
| """**6. Get results.**""" | |||
| with open(save_dir + 'ged_matrix' + save_file_suffix + '.pkl', 'wb') as f: | |||
| pickle.dump(ged_mat, f) | |||
| with open(save_dir + 'runtime' + save_file_suffix + '.pkl', 'wb') as f: | |||
| pickle.dump(runtime, f) | |||
| return ged_mat, runtime | |||
| def save_trials_as_group(graphs, N, repeats, ratio): | |||
| # Return if the group file exists. | |||
| name_middle = '.' + str(N) + '.repeats_' + str(repeats) + '.ratio_' + "{:.2f}".format(ratio) + '.' | |||
| name_group = save_dir + 'groups/ged_mats' + name_middle + 'npy' | |||
| if os.path.isfile(name_group): | |||
| return | |||
| ged_mats = [] | |||
| runtimes = [] | |||
| for trial in range(1, 101): | |||
| print() | |||
| print('Trial:', trial) | |||
| ged_mat, runtime = xp_compute_ged_matrix(graphs, N, repeats, ratio, trial) | |||
| ged_mats.append(ged_mat) | |||
| runtimes.append(runtime) | |||
| # Group the trial files into a single file and remove the per-trial files (override=True, clear=True, backup=False). | |||
| name_prefix = 'ged_matrix' + name_middle | |||
| group_trials(save_dir, name_prefix, True, True, False) | |||
| name_prefix = 'runtime' + name_middle | |||
| group_trials(save_dir, name_prefix, True, True, False) | |||
| def results_for_a_ratio(ratio): | |||
| for N in N_list: | |||
| print() | |||
| print('# of graphs:', N) | |||
| for repeats in [1, 20, 40, 60, 80, 100]: | |||
| print() | |||
| print('Repeats:', repeats) | |||
| save_trials_as_group(graphs[:N], N, repeats, ratio) | |||
| if __name__ == '__main__': | |||
| if len(sys.argv) > 1: | |||
| N_list = [int(i) for i in sys.argv[1:]] | |||
| else: | |||
| N_list = [10, 50, 100] | |||
| # Generate graphs. | |||
| graphs = generate_graphs() | |||
| save_dir = 'outputs/edit_costs.repeats.N.bipartite/' | |||
| os.makedirs(save_dir, exist_ok=True) | |||
| os.makedirs(save_dir + 'groups/', exist_ok=True) | |||
| for ratio in [10, 1, 0.1]: | |||
| print() | |||
| print('Ratio:', ratio) | |||
| results_for_a_ratio(ratio) | |||
| @@ -12,18 +12,19 @@ import multiprocessing | |||
| import pickle | |||
| import logging | |||
| from gklearn.ged.util import compute_geds | |||
| import numpy as np | |||
| import time | |||
| from utils import get_dataset | |||
| import sys | |||
| from group_results import group_trials | |||
| def xp_compute_ged_matrix(dataset, ds_name, repeats, ratio, trial): | |||
| save_file_suffix = '.' + ds_name + '.repeats_' + str(repeats) + '.ratio_' + "{:.2f}".format(ratio) + '.trial_' + str(trial) | |||
| """**1. Get dataset.**""" | |||
| dataset = get_dataset(ds_name) | |||
| # Return if the file exists. | |||
| if os.path.isfile(save_dir + 'ged_matrix' + save_file_suffix + '.pkl'): | |||
| return None, None | |||
| """**2. Set parameters.**""" | |||
| @@ -78,6 +79,12 @@ def xp_compute_ged_matrix(dataset, ds_name, repeats, ratio, trial): | |||
| def save_trials_as_group(dataset, ds_name, repeats, ratio): | |||
| # Return if the group file exists. | |||
| name_middle = '.' + ds_name + '.repeats_' + str(repeats) + '.ratio_' + "{:.2f}".format(ratio) + '.' | |||
| name_group = save_dir + 'groups/ged_mats' + name_middle + 'npy' | |||
| if os.path.isfile(name_group): | |||
| return | |||
| ged_mats = [] | |||
| runtimes = [] | |||
| for trial in range(1, 101): | |||
| @@ -87,25 +94,36 @@ def save_trials_as_group(dataset, ds_name, repeats, ratio): | |||
| ged_mats.append(ged_mat) | |||
| runtimes.append(runtime) | |||
| save_file_suffix = '.' + ds_name + '.repeats_' + str(repeats) + '.ratio_' + "{:.2f}".format(ratio) | |||
| with open(save_dir + 'groups/ged_mats' + save_file_suffix + '.npy', 'wb') as f: | |||
| np.save(f, np.array(ged_mats)) | |||
| with open(save_dir + 'groups/runtimes' + save_file_suffix + '.pkl', 'wb') as f: | |||
| pickle.dump(runtime, f) | |||
| # Group the trial files into a single file and remove the per-trial files (override=True, clear=True, backup=False). | |||
| name_prefix = 'ged_matrix' + name_middle | |||
| group_trials(save_dir, name_prefix, True, True, False) | |||
| name_prefix = 'runtime' + name_middle | |||
| group_trials(save_dir, name_prefix, True, True, False) | |||
| def results_for_a_dataset(ds_name): | |||
| """**1. Get dataset.**""" | |||
| dataset = get_dataset(ds_name) | |||
| for repeats in [1, 20, 40, 60, 80, 100]: | |||
| for repeats in repeats_list: | |||
| print() | |||
| print('Repeats:', repeats) | |||
| for ratio in [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9]: | |||
| for ratio in ratio_list: | |||
| print() | |||
| print('Ratio:', ratio) | |||
| save_trials_as_group(dataset, ds_name, repeats, ratio) | |||
| def get_param_lists(ds_name): | |||
| if ds_name == 'AIDS_symb': | |||
| repeats_list = [1, 20, 40, 60, 80, 100] | |||
| ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9] | |||
| else: | |||
| repeats_list = [1, 20, 40, 60, 80, 100] | |||
| ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9] | |||
| return repeats_list, ratio_list | |||
| if __name__ == '__main__': | |||
| if len(sys.argv) > 1: | |||
| @@ -114,12 +132,11 @@ if __name__ == '__main__': | |||
| ds_name_list = ['MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb'] | |||
| save_dir = 'outputs/edit_costs.repeats.ratios.IPFP/' | |||
| if not os.path.exists(save_dir): | |||
| os.makedirs(save_dir) | |||
| if not os.path.exists(save_dir + 'groups/'): | |||
| os.makedirs(save_dir + 'groups/') | |||
| os.makedirs(save_dir, exist_ok=True) | |||
| os.makedirs(save_dir + 'groups/', exist_ok=True) | |||
| for ds_name in ds_name_list: | |||
| print() | |||
| print('Dataset:', ds_name) | |||
| repeats_list, ratio_list = get_param_lists(ds_name) | |||
| results_for_a_dataset(ds_name) | |||
| @@ -12,18 +12,19 @@ import multiprocessing | |||
| import pickle | |||
| import logging | |||
| from gklearn.ged.util import compute_geds | |||
| import numpy as np | |||
| import time | |||
| from utils import get_dataset | |||
| import sys | |||
| from group_results import group_trials | |||
| def xp_compute_ged_matrix(dataset, ds_name, repeats, ratio, trial): | |||
| save_file_suffix = '.' + ds_name + '.repeats_' + str(repeats) + '.ratio_' + "{:.2f}".format(ratio) + '.trial_' + str(trial) | |||
| """**1. Get dataset.**""" | |||
| dataset = get_dataset(ds_name) | |||
| # Return if the file exists. | |||
| if os.path.isfile(save_dir + 'ged_matrix' + save_file_suffix + '.pkl'): | |||
| return None, None | |||
| """**2. Set parameters.**""" | |||
| @@ -83,6 +84,12 @@ def xp_compute_ged_matrix(dataset, ds_name, repeats, ratio, trial): | |||
| def save_trials_as_group(dataset, ds_name, repeats, ratio): | |||
| # Return if the group file exists. | |||
| name_middle = '.' + ds_name + '.repeats_' + str(repeats) + '.ratio_' + "{:.2f}".format(ratio) + '.' | |||
| name_group = save_dir + 'groups/ged_mats' + name_middle + 'npy' | |||
| if os.path.isfile(name_group): | |||
| return | |||
| ged_mats = [] | |||
| runtimes = [] | |||
| for trial in range(1, 101): | |||
| @@ -92,25 +99,36 @@ def save_trials_as_group(dataset, ds_name, repeats, ratio): | |||
| ged_mats.append(ged_mat) | |||
| runtimes.append(runtime) | |||
| save_file_suffix = '.' + ds_name + '.repeats_' + str(repeats) + '.ratio_' + "{:.2f}".format(ratio) | |||
| with open(save_dir + 'groups/ged_mats' + save_file_suffix + '.npy', 'wb') as f: | |||
| np.save(f, np.array(ged_mats)) | |||
| with open(save_dir + 'groups/runtimes' + save_file_suffix + '.pkl', 'wb') as f: | |||
| pickle.dump(runtime, f) | |||
| # Group the trial files into a single file and remove the per-trial files (override=True, clear=True, backup=False). | |||
| name_prefix = 'ged_matrix' + name_middle | |||
| group_trials(save_dir, name_prefix, True, True, False) | |||
| name_prefix = 'runtime' + name_middle | |||
| group_trials(save_dir, name_prefix, True, True, False) | |||
| def results_for_a_dataset(ds_name): | |||
| """**1. Get dataset.**""" | |||
| dataset = get_dataset(ds_name) | |||
| for repeats in [1, 20, 40, 60, 80, 100]: | |||
| for repeats in repeats_list: | |||
| print() | |||
| print('Repeats:', repeats) | |||
| for ratio in [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9]: | |||
| for ratio in ratio_list: | |||
| print() | |||
| print('Ratio:', ratio) | |||
| save_trials_as_group(dataset, ds_name, repeats, ratio) | |||
| def get_param_lists(ds_name): | |||
| if ds_name == 'AIDS_symb': | |||
| repeats_list = [1, 20, 40, 60, 80, 100] | |||
| ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9] | |||
| else: | |||
| repeats_list = [1, 20, 40, 60, 80, 100] | |||
| ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9] | |||
| return repeats_list, ratio_list | |||
| if __name__ == '__main__': | |||
| if len(sys.argv) > 1: | |||
| @@ -119,12 +137,11 @@ if __name__ == '__main__': | |||
| ds_name_list = ['MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb'] | |||
| save_dir = 'outputs/edit_costs.repeats.ratios.bipartite/' | |||
| if not os.path.exists(save_dir): | |||
| os.makedirs(save_dir) | |||
| if not os.path.exists(save_dir + 'groups/'): | |||
| os.makedirs(save_dir + 'groups/') | |||
| os.makedirs(save_dir, exist_ok=True) | |||
| os.makedirs(save_dir + 'groups/', exist_ok=True) | |||
| for ds_name in ds_name_list: | |||
| print() | |||
| print('Dataset:', ds_name) | |||
| repeats_list, ratio_list = get_param_lists(ds_name) | |||
| results_for_a_dataset(ds_name) | |||
| @@ -16,6 +16,7 @@ from tqdm import tqdm | |||
| import sys | |||
| # This function is used by other scripts. Modify it carefully. | |||
| def group_trials(dir_folder, name_prefix, override, clear, backup): | |||
| # Get group name. | |||
| @@ -47,8 +48,20 @@ def group_trials(dir_folder, name_prefix, override, clear, backup): | |||
| file_name = dir_folder + name_prefix + 'trial_' + str(trial) + '.pkl' | |||
| if os.path.isfile(file_name): | |||
| with open(file_name, 'rb') as f: | |||
| data = pickle.load(f) | |||
| try: | |||
| data = pickle.load(f) | |||
| except EOFError: | |||
| print('EOF Error occurred.') | |||
| return | |||
| data_group.append(data) | |||
| # unpickler = pickle.Unpickler(f) | |||
| # data = unpickler.load() | |||
| # if not isinstance(data, np.array): | |||
| # return | |||
| # else: | |||
| # data_group.append(data) | |||
| else: # Not all trials are completed. | |||
| return | |||
| @@ -81,11 +94,9 @@ def group_trials(dir_folder, name_prefix, override, clear, backup): | |||
| def group_all_in_folder(dir_folder, override=False, clear=True, backup=True): | |||
| # Create folders. | |||
| if not os.path.exists(dir_folder + 'groups/'): | |||
| os.makedirs(dir_folder + 'groups/') | |||
| os.makedirs(dir_folder + 'groups/', exist_ok=True) | |||
| if backup: | |||
| if not os.path.exists(dir_folder + 'backups'): | |||
| os.makedirs(dir_folder + 'backups') | |||
| os.makedirs(dir_folder + 'backups', exist_ok=True) | |||
| # Iterate all files. | |||
| cur_file_prefix = '' | |||
| @@ -105,4 +116,10 @@ if __name__ == '__main__': | |||
| group_all_in_folder(dir_folder) | |||
| dir_folder = 'outputs/CRIANN/edit_costs.repeats.ratios.IPFP/' | |||
| group_all_in_folder(dir_folder) | |||
| dir_folder = 'outputs/CRIANN/edit_costs.max_num_sols.ratios.bipartite/' | |||
| group_all_in_folder(dir_folder) | |||
| dir_folder = 'outputs/CRIANN/edit_costs.repeats.ratios.bipartite/' | |||
| group_all_in_folder(dir_folder) | |||
| @@ -0,0 +1,56 @@ | |||
| #!/usr/bin/env python3 | |||
| # -*- coding: utf-8 -*- | |||
| """ | |||
| Created on Tue Nov 3 20:23:25 2020 | |||
| @author: ljia | |||
| """ | |||
| import os | |||
| import re | |||
| def get_job_script(arg, params): | |||
| ged_method = params[0] | |||
| multi_method = params[1] | |||
| job_name_label = r"rep." if multi_method == 'repeats' else r"" | |||
| script = r""" | |||
| #!/bin/bash | |||
| #SBATCH --exclusive | |||
| #SBATCH --job-name="st.""" + job_name_label + r"N" + arg + r"." + ged_method + r"""" | |||
| #SBATCH --partition=tlong | |||
| #SBATCH --mail-type=ALL | |||
| #SBATCH --mail-user=jajupmochi@gmail.com | |||
| #SBATCH --output="outputs/output_edit_costs.""" + multi_method + r".N." + ged_method + r"." + arg + r""".txt" | |||
| #SBATCH --error="errors/error_edit_costs.""" + multi_method + r".N." + ged_method + r"." + arg + r""".txt" | |||
| # | |||
| #SBATCH --ntasks=1 | |||
| #SBATCH --nodes=1 | |||
| #SBATCH --cpus-per-task=1 | |||
| #SBATCH --time=300:00:00 | |||
| #SBATCH --mem-per-cpu=4000 | |||
| srun hostname | |||
| srun cd /home/2019015/ljia02/graphkit-learn/gklearn/experiments/ged/stability | |||
| srun python3 edit_costs.""" + multi_method + r".N." + ged_method + r".py " + arg | |||
| script = script.strip() | |||
| script = re.sub('\n\t+', '\n', script) | |||
| script = re.sub('\n +', '\n', script) | |||
| return script | |||
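| # e.g. for params = ('IPFP', 'repeats') and arg = '10', the generated job is named "st.rep.N10.IPFP" | |||
| # and runs `python3 edit_costs.repeats.N.IPFP.py 10`. | |||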
| if __name__ == '__main__': | |||
| params_list = [('IPFP', 'nums_sols'), | |||
| ('IPFP', 'repeats'), | |||
| ('bipartite', 'max_num_sols'), | |||
| ('bipartite', 'repeats')] | |||
| N_list = [10, 50, 100] | |||
| for params in params_list[1:]: | |||
| for N in [N_list[i] for i in [0, 1, 2]]: | |||
| job_script = get_job_script(str(N), params) | |||
| command = 'sbatch <<EOF\n' + job_script + '\nEOF' | |||
| # print(command) | |||
| os.system(command) | |||
| # os.popen(command) | |||
| # output = stream.readlines() | |||
| @@ -0,0 +1,47 @@ | |||
| #!/usr/bin/env python3 | |||
| # -*- coding: utf-8 -*- | |||
| """ | |||
| Created on Tue Nov 3 20:23:25 2020 | |||
| @author: ljia | |||
| """ | |||
| import os | |||
| import re | |||
| def get_job_script(arg): | |||
| script = r""" | |||
| #!/bin/bash | |||
| #SBATCH --exclusive | |||
| #SBATCH --job-name="st.""" + arg + r""".bp" | |||
| #SBATCH --partition=tlong | |||
| #SBATCH --mail-type=ALL | |||
| #SBATCH --mail-user=jajupmochi@gmail.com | |||
| #SBATCH --output="outputs/output_edit_costs.max_num_sols.ratios.bipartite.""" + arg + """.txt" | |||
| #SBATCH --error="errors/error_edit_costs.max_num_sols.ratios.bipartite.""" + arg + """.txt" | |||
| # | |||
| #SBATCH --ntasks=1 | |||
| #SBATCH --nodes=1 | |||
| #SBATCH --cpus-per-task=1 | |||
| #SBATCH --time=300:00:00 | |||
| #SBATCH --mem-per-cpu=4000 | |||
| srun hostname | |||
| srun cd /home/2019015/ljia02/graphkit-learn/gklearn/experiments/ged/stability | |||
| srun python3 edit_costs.max_nums_sols.ratios.bipartite.py """ + arg | |||
| script = script.strip() | |||
| script = re.sub('\n\t+', '\n', script) | |||
| script = re.sub('\n +', '\n', script) | |||
| return script | |||
| if __name__ == '__main__': | |||
| ds_list = ['MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb'] | |||
| for ds_name in [ds_list[i] for i in [0, 1, 2, 3]]: | |||
| job_script = get_job_script(ds_name) | |||
| command = 'sbatch <<EOF\n' + job_script + '\nEOF' | |||
| # print(command) | |||
| os.system(command) | |||
| # os.popen(command) | |||
| # output = stream.readlines() | |||
| @@ -0,0 +1,47 @@ | |||
| #!/usr/bin/env python3 | |||
| # -*- coding: utf-8 -*- | |||
| """ | |||
| Created on Tue Nov 3 20:23:25 2020 | |||
| @author: ljia | |||
| """ | |||
| import os | |||
| import re | |||
| def get_job_script(arg): | |||
| script = r""" | |||
| #!/bin/bash | |||
| #SBATCH --exclusive | |||
| #SBATCH --job-name="st.""" + arg + r""".IPFP" | |||
| #SBATCH --partition=tlong | |||
| #SBATCH --mail-type=ALL | |||
| #SBATCH --mail-user=jajupmochi@gmail.com | |||
| #SBATCH --output="outputs/output_edit_costs.nums_sols.ratios.IPFP.""" + arg + """.txt" | |||
| #SBATCH --error="errors/error_edit_costs.nums_sols.ratios.IPFP.""" + arg + """.txt" | |||
| # | |||
| #SBATCH --ntasks=1 | |||
| #SBATCH --nodes=1 | |||
| #SBATCH --cpus-per-task=1 | |||
| #SBATCH --time=300:00:00 | |||
| #SBATCH --mem-per-cpu=4000 | |||
| srun hostname | |||
| srun cd /home/2019015/ljia02/graphkit-learn/gklearn/experiments/ged/stability | |||
| srun python3 edit_costs.nums_sols.ratios.IPFP.py """ + arg | |||
| script = script.strip() | |||
| script = re.sub('\n\t+', '\n', script) | |||
| script = re.sub('\n +', '\n', script) | |||
| return script | |||
| if __name__ == '__main__': | |||
| ds_list = ['MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb'] | |||
| for ds_name in [ds_list[i] for i in [0, 3]]: | |||
| job_script = get_job_script(ds_name) | |||
| command = 'sbatch <<EOF\n' + job_script + '\nEOF' | |||
| # print(command) | |||
| os.system(command) | |||
| # os.popen(command) | |||
| # output = stream.readlines() | |||
| @@ -0,0 +1,47 @@ | |||
| #!/usr/bin/env python3 | |||
| # -*- coding: utf-8 -*- | |||
| """ | |||
| Created on Tue Nov 3 20:23:25 2020 | |||
| @author: ljia | |||
| """ | |||
| import os | |||
| import re | |||
| def get_job_script(arg): | |||
| script = r""" | |||
| #!/bin/bash | |||
| #SBATCH --exclusive | |||
| #SBATCH --job-name="st.rep.""" + arg + r""".IPFP" | |||
| #SBATCH --partition=tlong | |||
| #SBATCH --mail-type=ALL | |||
| #SBATCH --mail-user=jajupmochi@gmail.com | |||
| #SBATCH --output="outputs/output_edit_costs.repeats.ratios.IPFP.""" + arg + """.txt" | |||
| #SBATCH --error="errors/error_edit_costs.repeats.ratios.IPFP.""" + arg + """.txt" | |||
| # | |||
| #SBATCH --ntasks=1 | |||
| #SBATCH --nodes=1 | |||
| #SBATCH --cpus-per-task=1 | |||
| #SBATCH --time=300:00:00 | |||
| #SBATCH --mem-per-cpu=4000 | |||
| srun hostname | |||
| srun cd /home/2019015/ljia02/graphkit-learn/gklearn/experiments/ged/stability | |||
| srun python3 edit_costs.repeats.ratios.IPFP.py """ + arg | |||
| script = script.strip() | |||
| script = re.sub('\n\t+', '\n', script) | |||
| script = re.sub('\n +', '\n', script) | |||
| return script | |||
| if __name__ == '__main__': | |||
| ds_list = ['MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb'] | |||
| for ds_name in [ds_list[i] for i in [0, 3]]: | |||
| job_script = get_job_script(ds_name) | |||
| command = 'sbatch <<EOF\n' + job_script + '\nEOF' | |||
| # print(command) | |||
| os.system(command) | |||
| # os.popen(command) | |||
| # output = stream.readlines() | |||
| @@ -0,0 +1,47 @@ | |||
| #!/usr/bin/env python3 | |||
| # -*- coding: utf-8 -*- | |||
| """ | |||
| Created on Tue Nov 3 20:23:25 2020 | |||
| @author: ljia | |||
| """ | |||
| import os | |||
| import re | |||
| def get_job_script(arg): | |||
| script = r""" | |||
| #!/bin/bash | |||
| #SBATCH --exclusive | |||
| #SBATCH --job-name="st.rep.""" + arg + r""".bp" | |||
| #SBATCH --partition=tlong | |||
| #SBATCH --mail-type=ALL | |||
| #SBATCH --mail-user=jajupmochi@gmail.com | |||
| #SBATCH --output="outputs/output_edit_costs.repeats.ratios.bipartite.""" + arg + """.txt" | |||
| #SBATCH --error="errors/error_edit_costs.repeats.ratios.bipartite.""" + arg + """.txt" | |||
| # | |||
| #SBATCH --ntasks=1 | |||
| #SBATCH --nodes=1 | |||
| #SBATCH --cpus-per-task=1 | |||
| #SBATCH --time=300:00:00 | |||
| #SBATCH --mem-per-cpu=4000 | |||
| srun hostname | |||
| srun cd /home/2019015/ljia02/graphkit-learn/gklearn/experiments/ged/stability | |||
| srun python3 edit_costs.repeats.ratios.bipartite.py """ + arg | |||
| script = script.strip() | |||
| script = re.sub('\n\t+', '\n', script) | |||
| script = re.sub('\n +', '\n', script) | |||
| return script | |||
| if __name__ == '__main__': | |||
| ds_list = ['MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb'] | |||
| for ds_name in [ds_list[i] for i in [0, 1, 2, 3]]: | |||
| job_script = get_job_script(ds_name) | |||
| command = 'sbatch <<EOF\n' + job_script + '\nEOF' | |||
| # print(command) | |||
| os.system(command) | |||
| # os.popen(command) | |||
| # output = stream.readlines() | |||
| @@ -150,8 +150,7 @@ def xp_accuracy_diff_entropy(): | |||
| import pickle | |||
| import os | |||
| save_dir = 'outputs/accuracy_diff_entropy/' | |||
| if not os.path.exists(save_dir): | |||
| os.makedirs(save_dir) | |||
| os.makedirs(save_dir, exist_ok=True) | |||
| accuracies = {} | |||
| confidences = {} | |||
| @@ -16,8 +16,7 @@ def xp_runtimes_of_all_28cores(): | |||
| import pickle | |||
| import os | |||
| save_dir = 'outputs/runtimes_of_all_28cores/' | |||
| if not os.path.exists(save_dir): | |||
| os.makedirs(save_dir) | |||
| os.makedirs(save_dir, exist_ok=True) | |||
| run_times = {} | |||
| @@ -16,8 +16,7 @@ def xp_runtimes_diff_chunksizes(): | |||
| import pickle | |||
| import os | |||
| save_dir = 'outputs/runtimes_diff_chunksizes/' | |||
| if not os.path.exists(save_dir): | |||
| os.makedirs(save_dir) | |||
| os.makedirs(save_dir, exist_ok=True) | |||
| run_times = {} | |||
| @@ -25,8 +25,7 @@ def xp_synthesized_graphs_dataset_size(): | |||
| import pickle | |||
| import os | |||
| save_dir = 'outputs/synthesized_graphs_N/' | |||
| if not os.path.exists(save_dir): | |||
| os.makedirs(save_dir) | |||
| os.makedirs(save_dir, exist_ok=True) | |||
| run_times = {} | |||
| @@ -22,8 +22,7 @@ def xp_synthesized_graphs_degrees(): | |||
| import pickle | |||
| import os | |||
| save_dir = 'outputs/synthesized_graphs_degrees/' | |||
| -if not os.path.exists(save_dir): | |||
| -os.makedirs(save_dir) | |||
| +os.makedirs(save_dir, exist_ok=True) | |||
| run_times = {} | |||
| @@ -22,8 +22,7 @@ def xp_synthesized_graphs_num_node_label_alphabet(): | |||
| import pickle | |||
| import os | |||
| save_dir = 'outputs/synthesized_graphs_num_node_label_alphabet/' | |||
| -if not os.path.exists(save_dir): | |||
| -os.makedirs(save_dir) | |||
| +os.makedirs(save_dir, exist_ok=True) | |||
| run_times = {} | |||
| @@ -22,8 +22,7 @@ def xp_synthesized_graphs_num_nodes(): | |||
| import pickle | |||
| import os | |||
| save_dir = 'outputs/synthesized_graphs_num_nodes/' | |||
| -if not os.path.exists(save_dir): | |||
| -os.makedirs(save_dir) | |||
| +os.makedirs(save_dir, exist_ok=True) | |||
| run_times = {} | |||
| @@ -154,6 +154,6 @@ def test_median_graph_estimator_symb(): | |||
| return set_median, gen_median | |||
| -if _name_ == '_main_': | |||
| +if __name__ == '__main__': | |||
| # set_median, gen_median = test_median_graph_estimator() | |||
| set_median, gen_median = test_median_graph_estimator_symb() | |||
| @@ -7,6 +7,8 @@ __version__ = "0.1" | |||
| __author__ = "Linlin Jia" | |||
| __date__ = "November 2018" | |||
| +from gklearn.kernels.metadata import GRAPH_KERNELS, list_of_graph_kernels | |||
| from gklearn.kernels.graph_kernel import GraphKernel | |||
| from gklearn.kernels.common_walk import CommonWalk | |||
| from gklearn.kernels.marginalized import Marginalized | |||
| @@ -0,0 +1,36 @@ | |||
| #!/usr/bin/env python3 | |||
| # -*- coding: utf-8 -*- | |||
| """ | |||
| Created on Fri Nov 6 10:11:08 2020 | |||
| @author: ljia | |||
| """ | |||
| # The metadata of all graph kernels. | |||
| GRAPH_KERNELS = { | |||
| ### based on walks. | |||
| 'common walk': '', | |||
| 'marginalized': '', | |||
| 'sylvester equation': '', | |||
| 'fixed point': '', | |||
| 'conjugate gradient': '', | |||
| 'spectral decomposition': '', | |||
| ### based on paths. | |||
| 'shortest path': '', | |||
| 'structural shortest path': '', | |||
| 'path up to length h': '', | |||
| ### based on non-linear patterns. | |||
| 'weisfeiler-lehman subtree': '', | |||
| 'treelet': '', | |||
| } | |||
| def list_of_graph_kernels(): | |||
| """List names of all graph kernels. | |||
| Returns | |||
| ------- | |||
| list | |||
| The names of all graph kernels, in the order they are registered in GRAPH_KERNELS. | |||
| """ | |||
| return list(GRAPH_KERNELS) | |||
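| A quick usage sketch; the top-level import works because the package __init__ re-exports the metadata (see the import added in the @@ -7,6 +7,8 @@ hunk above): | |||
| from gklearn.kernels import list_of_graph_kernels | |||
| # Prints the kernel names in the order they appear in GRAPH_KERNELS. | |||
| for name in list_of_graph_kernels(): | |||
| print(name) | |||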
| @@ -126,8 +126,7 @@ def generate_random_preimages_by_class(ds_name, rpg_options, kernel_options, sav | |||
| # save median graphs. | |||
| if save_preimages: | |||
| -if not os.path.exists(dir_save + 'preimages/'): | |||
| -os.makedirs(dir_save + 'preimages/') | |||
| +os.makedirs(dir_save + 'preimages/', exist_ok=True) | |||
| print('Saving preimages to files...') | |||
| fn_best_dataset = dir_save + 'preimages/g_best_dataset.' + 'nbg' + str(num_graphs) + '.y' + str(target) + '.repeat' + str(1) | |||
| saveGXL(rpg.best_from_dataset, fn_best_dataset + '.gxl', method='default', | |||
| @@ -167,8 +166,7 @@ def generate_random_preimages_by_class(ds_name, rpg_options, kernel_options, sav | |||
| def _init_output_file_preimage(ds_name, gkernel, dir_output): | |||
| -if not os.path.exists(dir_output): | |||
| -os.makedirs(dir_output) | |||
| +os.makedirs(dir_output, exist_ok=True) | |||
| fn_output_detail = 'results_detail.' + ds_name + '.' + gkernel + '.csv' | |||
| f_detail = open(dir_output + fn_output_detail, 'a') | |||
| csv.writer(f_detail).writerow(['dataset', 'graph kernel', 'num graphs', | |||
| @@ -218,8 +218,7 @@ def remove_best_graph(ds_name, mpg_options, kernel_options, ged_options, mge_opt | |||
| # save median graphs. | |||
| if save_medians: | |||
| -if not os.path.exists(dir_save + 'medians/'): | |||
| -os.makedirs(dir_save + 'medians/') | |||
| +os.makedirs(dir_save + 'medians/', exist_ok=True) | |||
| print('Saving median graphs to files...') | |||
| fn_pre_sm = dir_save + 'medians/set_median.' + mpg_options['fit_method'] + '.nbg' + str(num_graphs) + '.y' + str(target) + '.repeat' + str(1) | |||
| saveGXL(mpg.set_median, fn_pre_sm + '.gxl', method='default', | |||
| @@ -375,8 +374,7 @@ def _compute_gram_matrix_unnorm(dataset, kernel_options): | |||
| def _init_output_file(ds_name, gkernel, fit_method, dir_output): | |||
| -if not os.path.exists(dir_output): | |||
| -os.makedirs(dir_output) | |||
| +os.makedirs(dir_output, exist_ok=True) | |||
| fn_output_detail = 'results_detail.' + ds_name + '.' + gkernel + '.csv' | |||
| f_detail = open(dir_output + fn_output_detail, 'a') | |||
| csv.writer(f_detail).writerow(['dataset', 'graph kernel', 'edit cost', | |||
| @@ -230,8 +230,7 @@ def generate_median_preimages_by_class(ds_name, mpg_options, kernel_options, ged | |||
| # save median graphs. | |||
| if save_medians: | |||
| -if not os.path.exists(dir_save + 'medians/'): | |||
| -os.makedirs(dir_save + 'medians/') | |||
| +os.makedirs(dir_save + 'medians/', exist_ok=True) | |||
| print('Saving median graphs to files...') | |||
| fn_pre_sm = dir_save + 'medians/set_median.' + mpg_options['fit_method'] + '.nbg' + str(num_graphs) + '.y' + str(target) + '.repeat' + str(1) | |||
| saveGXL(mpg.set_median, fn_pre_sm + '.gxl', method='default', | |||
| @@ -308,8 +307,7 @@ def generate_median_preimages_by_class(ds_name, mpg_options, kernel_options, ged | |||
| def _init_output_file_preimage(ds_name, gkernel, fit_method, dir_output): | |||
| -if not os.path.exists(dir_output): | |||
| -os.makedirs(dir_output) | |||
| +os.makedirs(dir_output, exist_ok=True) | |||
| # fn_output_detail = 'results_detail.' + ds_name + '.' + gkernel + '.' + fit_method + '.csv' | |||
| fn_output_detail = 'results_detail.' + ds_name + '.' + gkernel + '.csv' | |||
| f_detail = open(dir_output + fn_output_detail, 'a') | |||
| @@ -52,6 +52,14 @@ def chooseDataset(ds_name): | |||
| return dataset | |||
| +def test_list_graph_kernels(): | |||
| +"""Test that list_of_graph_kernels() matches the keys of GRAPH_KERNELS. | |||
| +""" | |||
| +from gklearn.kernels import GRAPH_KERNELS, list_of_graph_kernels | |||
| +assert list_of_graph_kernels() == [i for i in GRAPH_KERNELS] | |||
| @pytest.mark.parametrize('ds_name', ['Alkane', 'AIDS']) | |||
| @pytest.mark.parametrize('weight,compute_method', [(0.01, 'geo'), (1, 'exp')]) | |||
| @pytest.mark.parametrize('parallel', ['imap_unordered', None]) | |||
| @@ -433,10 +441,11 @@ def test_WLSubtree(ds_name, parallel): | |||
| if __name__ == "__main__": | |||
| +test_list_graph_kernels() | |||
| # test_spkernel('Alkane', 'imap_unordered') | |||
| # test_StructuralSP('Fingerprint_edge', 'imap_unordered') | |||
| -test_WLSubtree('Acyclic', 'imap_unordered') | |||
| +# test_WLSubtree('Acyclic', 'imap_unordered') | |||
| # test_RandomWalk('Acyclic', 'sylvester', None, 'imap_unordered') | |||
| # test_RandomWalk('Acyclic', 'conjugate', None, 'imap_unordered') | |||
| # test_RandomWalk('Acyclic', 'fp', None, None) | |||
| # test_RandomWalk('Acyclic', 'spectral', 'exp', 'imap_unordered') | |||
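| Besides the __main__ block, individual tests can be run through pytest's own API; a small sketch: | |||
| import pytest | |||
| # Run only the new metadata test, verbosely. | |||
| pytest.main(['-v', '-k', 'test_list_graph_kernels']) | |||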
| @@ -13,6 +13,10 @@ import os | |||
| class Dataset(object): | |||
| +import warnings | |||
| +warnings.simplefilter('always', DeprecationWarning) | |||
| +warnings.warn('This class has been moved to the "gklearn.dataset" module. The class "gklearn.utils.dataset.Dataset" has not been maintained since Nov 12th, 2020 (version 0.2.1) and will be removed in version 0.4.0.', DeprecationWarning) | |||
| def __init__(self, filename=None, filename_targets=None, **kwargs): | |||
| if filename is None: | |||
| @@ -803,6 +807,10 @@ class Dataset(object): | |||
| def split_dataset_by_target(dataset): | |||
| +import warnings | |||
| +warnings.simplefilter('always', DeprecationWarning) | |||
| +warnings.warn('This function has been moved to the "gklearn.dataset" module. The function "gklearn.utils.dataset.split_dataset_by_target" has not been maintained since Nov 12th, 2020 (version 0.2.1) and will be removed in version 0.4.0.', DeprecationWarning) | |||
| from gklearn.preimage.utils import get_same_item_indices | |||
| graphs = dataset.graphs | |||
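| Per the warning text, the maintained implementations now live in gklearn.dataset; a minimal migration sketch using only the paths the warnings name: | |||
| # Deprecated since 0.2.1, removal planned for 0.4.0: | |||
| # from gklearn.utils.dataset import Dataset, split_dataset_by_target | |||
| # Maintained location: | |||
| from gklearn.dataset import Dataset, split_dataset_by_target | |||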
| @@ -1,5 +1,9 @@ | |||
| """ Utilities function to manage graph files | |||
| """ | |||
| +import warnings | |||
| +warnings.simplefilter('always', DeprecationWarning) | |||
| +warnings.warn('The functions in the module "gklearn.utils.graph_files" are deprecated and will be removed in version 0.4.0. Use the corresponding classes in the module "gklearn.dataset" instead.', DeprecationWarning) | |||
| from os.path import dirname, splitext | |||
| @@ -45,6 +49,10 @@ def load_dataset(filename, filename_targets=None, gformat=None, **kwargs): | |||
| for details. Note here filename is the name of either .txt file in | |||
| the dataset directory. | |||
| """ | |||
| +import warnings | |||
| +warnings.simplefilter('always', DeprecationWarning) | |||
| +warnings.warn('The function "gklearn.utils.load_dataset" is deprecated and will be removed in version 0.4.0. Use the class "gklearn.dataset.DataLoader" instead.', DeprecationWarning) | |||
| extension = splitext(filename)[1][1:] | |||
| if extension == "ds": | |||
| data, y, label_names = load_from_ds(filename, filename_targets) | |||
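| The warning above names gklearn.dataset.DataLoader as the replacement for load_dataset. A hedged migration sketch; the constructor arguments and the .graphs/.targets attributes are assumptions to verify against gklearn.dataset.file_managers, not facts from this diff: | |||
| from gklearn.dataset import DataLoader | |||
| # Assumed interface: mirrors load_dataset(filename, filename_targets=None, ...); the path is illustrative. | |||
| loader = DataLoader('path/to/dataset.ds') | |||
| graphs, targets = loader.graphs, loader.targets | |||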
| @@ -66,17 +74,19 @@ def load_dataset(filename, filename_targets=None, gformat=None, **kwargs): | |||
| def save_dataset(Gn, y, gformat='gxl', group=None, filename='gfile', **kwargs): | |||
| """Save list of graphs. | |||
| """ | |||
| +import warnings | |||
| +warnings.simplefilter('always', DeprecationWarning) | |||
| +warnings.warn('The function "gklearn.utils.save_dataset" is deprecated and will be removed in version 0.4.0. Use the class "gklearn.dataset.DataSaver" instead.', DeprecationWarning) | |||
| import os | |||
| dirname_ds = os.path.dirname(filename) | |||
| if dirname_ds != '': | |||
| dirname_ds += '/' | |||
| -if not os.path.exists(dirname_ds) : | |||
| -os.makedirs(dirname_ds) | |||
| +os.makedirs(dirname_ds, exist_ok=True) | |||
| if 'graph_dir' in kwargs: | |||
| graph_dir = kwargs['graph_dir'] + '/' | |||
| -if not os.path.exists(graph_dir): | |||
| -os.makedirs(graph_dir) | |||
| +os.makedirs(graph_dir, exist_ok=True) | |||
| del kwargs['graph_dir'] | |||
| else: | |||
| graph_dir = dirname_ds | |||
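| Symmetrically, save_dataset points to gklearn.dataset.DataSaver. A hedged sketch; the argument list is assumed to mirror save_dataset's and should be checked against the class itself: | |||
| from gklearn.dataset import DataSaver | |||
| # Assumption: graphs and targets first, then the same gformat/group/filename options. | |||
| DataSaver(graphs, targets, filename='gfile', gformat='gxl') | |||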
| @@ -13,6 +13,11 @@ import random | |||
| class GraphSynthesizer(object): | |||
| +import warnings | |||
| +warnings.simplefilter('always', DeprecationWarning) | |||
| +warnings.warn('This class has been moved to the "gklearn.dataset" module. The class "gklearn.utils.graph_synthesizer.GraphSynthesizer" has not been maintained since Nov 12th, 2020 (version 0.2.1) and will be removed in version 0.2.2.', DeprecationWarning) | |||
| def __init__(self): | |||
| pass | |||
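| The same migration applies here; gklearn.dataset re-exports the maintained synthesizer: | |||
| # Deprecated path (removal planned, per the warning above): | |||
| # from gklearn.utils.graph_synthesizer import GraphSynthesizer | |||
| # Maintained path: | |||
| from gklearn.dataset import GraphSynthesizer | |||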
| @@ -671,13 +671,11 @@ def saveDataset(Gn, y, gformat='gxl', group=None, filename='gfile', xparams=None | |||
| dirname_ds = os.path.dirname(filename) | |||
| if dirname_ds != '': | |||
| dirname_ds += '/' | |||
| -if not os.path.exists(dirname_ds) : | |||
| -os.makedirs(dirname_ds) | |||
| +os.makedirs(dirname_ds, exist_ok=True) | |||
| if xparams is not None and 'graph_dir' in xparams: | |||
| graph_dir = xparams['graph_dir'] + '/' | |||
| -if not os.path.exists(graph_dir): | |||
| -os.makedirs(graph_dir) | |||
| +os.makedirs(graph_dir, exist_ok=True) | |||
| else: | |||
| graph_dir = dirname_ds | |||
| @@ -91,8 +91,7 @@ def model_selection_for_precomputed_kernel(datafile, | |||
| tqdm.monitor_interval = 0 | |||
| output_dir += estimator.__name__ | |||
| -if not os.path.exists(output_dir): | |||
| -os.makedirs(output_dir) | |||
| +os.makedirs(output_dir, exist_ok=True) | |||
| # a string to save all the results. | |||
| str_fw = '###################### log time: ' + datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + '. ######################\n\n' | |||
| str_fw += '# This file contains results of ' + estimator.__name__ + ' on dataset ' + ds_name + ',\n# including gram matrices, serial numbers for gram matrix figures and performance.\n\n' | |||
| @@ -604,8 +603,7 @@ def model_selection_for_precomputed_kernel(datafile, | |||
| str_fw += 'training time with hyper-param choices who did not participate in calculation of gram matrices: {:.2f}s\n\n'.format(tt_poster) | |||
| # open file to save all results for this dataset. | |||
| -if not os.path.exists(output_dir): | |||
| -os.makedirs(output_dir) | |||
| +os.makedirs(output_dir, exist_ok=True) | |||
| # print out results as table. | |||
| str_fw += printResultsInTable(param_list, param_list_pre_revised, average_val_scores, | |||
| @@ -458,8 +458,7 @@ def compute_gram_matrices_by_class(ds_name, kernel_options, save_results=True, d | |||
| print() | |||
| print('4. saving results...') | |||
| if save_results: | |||
| -if not os.path.exists(dir_save): | |||
| -os.makedirs(dir_save) | |||
| +os.makedirs(dir_save, exist_ok=True) | |||
| np.savez(dir_save + 'gram_matrix_unnorm.' + ds_name + '.' + kernel_options['name'] + '.gm', gram_matrix_unnorm_list=gram_matrix_unnorm_list, run_time_list=run_time_list) | |||
| print('\ncomplete.') | |||
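| One note on the np.savez call above: NumPy appends '.npz' when the given filename lacks it, so the file written to disk ends in '.gm.npz'. A small loading sketch, reusing the variables from the function above: | |||
| import numpy as np | |||
| # allow_pickle=True is needed if the saved list was stored as an object array | |||
| # (e.g. gram matrices of differing sizes across classes). | |||
| data = np.load(dir_save + 'gram_matrix_unnorm.' + ds_name + '.' + kernel_options['name'] + '.gm.npz', allow_pickle=True) | |||
| gram_matrix_unnorm_list = data['gram_matrix_unnorm_list'] | |||
| run_time_list = data['run_time_list'] | |||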