From dd9d7b742d858381dbfcc1a91e1d23248bc69896 Mon Sep 17 00:00:00 2001 From: Frozenmad Date: Wed, 16 Dec 2020 23:28:50 +0800 Subject: [PATCH 01/11] change data to dataset in solver and trainer --- autogl/module/train/node_classification.py | 36 +++++++++++---------- autogl/solver/classifier/node_classifier.py | 25 +++++++------- 2 files changed, 30 insertions(+), 31 deletions(-) diff --git a/autogl/module/train/node_classification.py b/autogl/module/train/node_classification.py index 85b3a50..5c4631e 100644 --- a/autogl/module/train/node_classification.py +++ b/autogl/module/train/node_classification.py @@ -265,12 +265,13 @@ class NodeClassificationTrainer(BaseTrainer): A reference of current trainer. """ - self.train_only(dataset) + data = dataset[0] + self.train_only(data) if keep_valid_result: - self.valid_result = self.predict_only(dataset)[dataset.val_mask].max(1)[1] - self.valid_result_prob = self.predict_only(dataset)[dataset.val_mask] + self.valid_result = self.predict_only(data)[data.val_mask].max(1)[1] + self.valid_result_prob = self.predict_only(data)[data.val_mask] self.valid_score = self.evaluate( - dataset, mask=dataset.val_mask, feval=self.feval + dataset, mask=data.val_mask, feval=self.feval ) def predict(self, dataset, mask=None): @@ -288,7 +289,6 @@ class NodeClassificationTrainer(BaseTrainer): ------- The prediction result of ``predict_proba``. """ - dataset = dataset.to(self.device) return self.predict_proba(dataset, mask=mask, in_log_format=True).max(1)[1] def predict_proba(self, dataset, mask=None, in_log_format=False): @@ -309,17 +309,18 @@ class NodeClassificationTrainer(BaseTrainer): ------- The prediction result. """ - dataset = dataset.to(self.device) + data = dataset[0] + data = data.to(self.device) if mask is not None: if mask == "val": - mask = dataset.val_mask + mask = data.val_mask elif mask == "test": - mask = dataset.test_mask + mask = data.test_mask elif mask == "train": - mask = dataset.train_mask + mask = data.train_mask else: - mask = dataset.test_mask - ret = self.predict_only(dataset, mask)[mask] + mask = data.test_mask + ret = self.predict_only(data, mask)[mask] if in_log_format is True: return ret else: @@ -398,23 +399,24 @@ class NodeClassificationTrainer(BaseTrainer): res: The evaluation result on the given dataset. """ - dataset = dataset.to(self.device) + data = dataset[0] + data = data.to(self.device) test_mask = mask if feval is None: feval = self.feval else: feval = get_feval(feval) if test_mask is None: - test_mask = dataset.test_mask + test_mask = data.test_mask elif test_mask == "test": - test_mask = dataset.test_mask + test_mask = data.test_mask elif test_mask == "val": - test_mask = dataset.val_mask + test_mask = data.val_mask elif test_mask == "train": - test_mask = dataset.train_mask + test_mask = data.train_mask y_pred_prob = self.predict_proba(dataset, mask) y_pred = y_pred_prob.max(1)[1] - y_true = dataset.y[test_mask] + y_true = data.y[test_mask] if not isinstance(feval, list): feval = [feval] diff --git a/autogl/solver/classifier/node_classifier.py b/autogl/solver/classifier/node_classifier.py index 7d89dee..368d1cf 100644 --- a/autogl/solver/classifier/node_classifier.py +++ b/autogl/solver/classifier/node_classifier.py @@ -288,18 +288,17 @@ class AutoNodeClassifier(BaseClassifier): if self.feature_module is not None: dataset = self.feature_module.fit_transform(dataset, inplace=inplace) - data = dataset[0] - assert data.x is not None, ( + self.dataset = dataset + assert self.dataset[0].x is not None, ( "Does not support fit on non node-feature dataset!" " Please add node features to dataset or specify feature engineers that generate" " node features." ) - self.data = data # initialize graph networks self._init_graph_module( self.gml, - num_features=data.x.shape[1], + num_features=self.dataset[0].x.shape[1], num_classes=dataset.num_classes, feval=evaluator_list, device=self.runtime_device, @@ -319,7 +318,7 @@ class AutoNodeClassifier(BaseClassifier): optimized = model else: optimized, _ = self.hpo_module.optimize( - trainer=model, dataset=data, time_limit=time_for_each_model + trainer=model, dataset=self.dataset, time_limit=time_for_each_model ) # to save memory, all the trainer derived will be mapped to cpu optimized.to(torch.device("cpu")) @@ -342,7 +341,7 @@ class AutoNodeClassifier(BaseClassifier): if self.ensemble_module is not None: performance = self.ensemble_module.fit( result_valid, - data.y[data.val_mask].cpu().numpy(), + self.dataset[0].y[self.dataset[0].val_mask].cpu().numpy(), names, evaluator_list, n_classes=dataset.num_classes, @@ -489,14 +488,12 @@ class AutoNodeClassifier(BaseClassifier): the number of classes. The prediction on given dataset. """ if dataset is None: - data = self.data - assert data is not None, ( + dataset = self.dataset + assert dataset is not None, ( "Please execute fit() first before" " predicting on remembered dataset" ) elif not inplaced and self.feature_module is not None: - data = self.feature_module.transform(dataset, inplace=inplace)[0] - else: - data = dataset[0] + dataset = self.feature_module.transform(dataset, inplace=inplace) if use_ensemble: LOGGER.info("Ensemble argument on, will try using ensemble model.") @@ -514,7 +511,7 @@ class AutoNodeClassifier(BaseClassifier): names = [] for model_name in self.trained_models: predict_result.append( - self._predict_proba_by_name(data, model_name, mask) + self._predict_proba_by_name(dataset, model_name, mask) ) names.append(model_name) return self.ensemble_module.ensemble(predict_result, names) @@ -528,11 +525,11 @@ class AutoNodeClassifier(BaseClassifier): if use_best or (use_ensemble and self.ensemble_module is None): # just return the best model we have found name = self.leaderboard.get_best_model() - return self._predict_proba_by_name(data, name, mask) + return self._predict_proba_by_name(dataset, name, mask) if name is not None: # return model performance by name - return self._predict_proba_by_name(data, name, mask) + return self._predict_proba_by_name(dataset, name, mask) LOGGER.error( "No model name is given while ensemble and best arguments are off." From 90ac5f92c60809203e6f62c4e5a2e825ee5107e8 Mon Sep 17 00:00:00 2001 From: lihy96 Date: Mon, 21 Dec 2020 09:52:30 +0800 Subject: [PATCH 02/11] fix evaluate in train_only --- autogl/module/train/node_classification.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autogl/module/train/node_classification.py b/autogl/module/train/node_classification.py index 5c4631e..9ed195c 100644 --- a/autogl/module/train/node_classification.py +++ b/autogl/module/train/node_classification.py @@ -218,7 +218,7 @@ class NodeClassificationTrainer(BaseTrainer): feval = self.feval[0] else: feval = self.feval - val_loss = self.evaluate(data, mask=data.val_mask, feval=feval) + val_loss = self.evaluate([data], mask=data.val_mask, feval=feval) if feval.is_higher_better() is True: val_loss = -val_loss self.early_stopping(val_loss, self.model.model) From 22d50f4a08e100691735bfcdc50502f380a028c3 Mon Sep 17 00:00:00 2001 From: Frozenmad Date: Mon, 21 Dec 2020 13:49:28 +0800 Subject: [PATCH 03/11] fix dataset support --- autogl/datasets/__init__.py | 13 ++- autogl/datasets/ogb.py | 118 ++++++++++++++++---- autogl/datasets/utils.py | 26 +++-- autogl/solver/classifier/node_classifier.py | 6 +- examples/node_classification.py | 2 +- 5 files changed, 126 insertions(+), 39 deletions(-) diff --git a/autogl/datasets/__init__.py b/autogl/datasets/__init__.py index b2f047b..aaa5eac 100644 --- a/autogl/datasets/__init__.py +++ b/autogl/datasets/__init__.py @@ -1,6 +1,7 @@ import os.path as osp import importlib import os +import torch from ..data.dataset import Dataset @@ -122,10 +123,16 @@ def build_dataset(args, path="~/.cache-autogl/"): return DATASET_DICT[args.dataset](path) -def build_dataset_from_name(dataset, path="~/.cache-autogl/"): - path = osp.join(path, "data", dataset) +def build_dataset_from_name(dataset_name, path="~/.cache-autogl/"): + path = osp.join(path, "data", dataset_name) path = os.path.expanduser(path) - return DATASET_DICT[dataset](path) + dataset = DATASET_DICT[dataset_name](path) + if 'ogbn' in dataset_name: + #dataset.data, dataset.slices = dataset.collate([dataset.data]) + #dataset.data.num_nodes = dataset.data.num_nodes[0] + if dataset.data.y.shape[-1] == 1: + dataset.data.y = torch.squeeze(dataset.data.y) + return dataset __all__ = [ diff --git a/autogl/datasets/ogb.py b/autogl/datasets/ogb.py index 633a234..547d433 100644 --- a/autogl/datasets/ogb.py +++ b/autogl/datasets/ogb.py @@ -3,7 +3,8 @@ from ogb.nodeproppred import PygNodePropPredDataset from ogb.graphproppred import PygGraphPropPredDataset from ogb.linkproppred import PygLinkPropPredDataset from . import register_dataset - +from .utils import index_to_mask +from torch_geometric.data import Data # OGBN @@ -12,25 +13,55 @@ class OGBNproductsDataset(PygNodePropPredDataset): def __init__(self, path): dataset = "ogbn-products" # path = osp.join(osp.dirname(osp.realpath(__file__)), "../..", "data", dataset) - PygNodePropPredDataset(name=dataset, root=path, transform=T.ToSparseTensor()) + PygNodePropPredDataset(name=dataset, root=path) super(OGBNproductsDataset, self).__init__( - dataset, path, transform=T.ToSparseTensor() + dataset, path ) + # Pre-compute GCN normalization. + #adj_t = self.data.adj_t.set_diag() + #deg = adj_t.sum(dim=1).to(torch.float) + #deg_inv_sqrt = deg.pow(-0.5) + #deg_inv_sqrt[deg_inv_sqrt == float('inf')] = 0 + #adj_t = deg_inv_sqrt.view(-1, 1) * adj_t * deg_inv_sqrt.view(1, -1) + #self.data.adj_t = adj_t + setattr(OGBNproductsDataset, "metric", "Accuracy") setattr(OGBNproductsDataset, "loss", "nll_loss") - + split_idx = self.get_idx_split() + datalist = [] + for d in self: + setattr(d, "train_mask", index_to_mask(split_idx['train'], d.y.shape[0])) + setattr(d, "val_mask", index_to_mask(split_idx['valid'], d.y.shape[0])) + setattr(d, "test_mask", index_to_mask(split_idx['test'], d.y.shape[0])) + datalist.append(d) + self.data, self.slices = self.collate(datalist) @register_dataset("ogbn-proteins") class OGBNproteinsDataset(PygNodePropPredDataset): def __init__(self, path): dataset = "ogbn-proteins" # path = osp.join(osp.dirname(osp.realpath(__file__)), "../..", "data", dataset) - PygNodePropPredDataset(name=dataset, root=path, transform=T.ToSparseTensor()) + PygNodePropPredDataset(name=dataset, root=path) super(OGBNproteinsDataset, self).__init__( - dataset, path, transform=T.ToSparseTensor() + dataset, path ) + dataset_t = PygNodePropPredDataset(name=dataset, root = path, transform=T.ToSparseTensor()) + + # Move edge features to node features. + self.data.x = dataset_t[0].adj_t.mean(dim=1) + #dataset_t[0].adj_t.set_value_(None) + del dataset_t + setattr(OGBNproteinsDataset, "metric", "ROC-AUC") - setattr(OGBNproteinsDataset, "loss", "BCEWithLogitsLoss") + setattr(OGBNproteinsDataset, "loss", "binary_cross_entropy_with_logits") + split_idx = self.get_idx_split() + datalist = [] + for d in self: + setattr(d, "train_mask", index_to_mask(split_idx['train'], d.y.shape[0])) + setattr(d, "val_mask", index_to_mask(split_idx['valid'], d.y.shape[0])) + setattr(d, "test_mask", index_to_mask(split_idx['test'], d.y.shape[0])) + datalist.append(d) + self.data, self.slices = self.collate(datalist) @register_dataset("ogbn-arxiv") @@ -38,38 +69,77 @@ class OGBNarxivDataset(PygNodePropPredDataset): def __init__(self, path): dataset = "ogbn-arxiv" # path = osp.join(osp.dirname(osp.realpath(__file__)), "../..", "data", dataset) - PygNodePropPredDataset(name=dataset, root=path, transform=T.ToSparseTensor()) + PygNodePropPredDataset(name=dataset, root=path) super(OGBNarxivDataset, self).__init__( - dataset, path, transform=T.ToSparseTensor() + dataset, path ) + + #self[0].adj_t = self[0].adj_t.to_symmetric() + setattr(OGBNarxivDataset, "metric", "Accuracy") setattr(OGBNarxivDataset, "loss", "nll_loss") + split_idx = self.get_idx_split() + datalist = [] + for d in self: + setattr(d, "train_mask", index_to_mask(split_idx['train'], d.y.shape[0])) + setattr(d, "val_mask", index_to_mask(split_idx['valid'], d.y.shape[0])) + setattr(d, "test_mask", index_to_mask(split_idx['test'], d.y.shape[0])) + datalist.append(d) + self.data, self.slices = self.collate(datalist) @register_dataset("ogbn-papers100M") class OGBNpapers100MDataset(PygNodePropPredDataset): def __init__(self, path): dataset = "ogbn-papers100M" # path = osp.join(osp.dirname(osp.realpath(__file__)), "../..", "data", dataset) - PygNodePropPredDataset(name=dataset, root=path, transform=T.ToSparseTensor()) + PygNodePropPredDataset(name=dataset, root=path) super(OGBNpapers100MDataset, self).__init__( - dataset, path, transform=T.ToSparseTensor() + dataset, path ) setattr(OGBNpapers100MDataset, "metric", "Accuracy") setattr(OGBNpapers100MDataset, "loss", "nll_loss") - + split_idx = self.get_idx_split() + datalist = [] + for d in self: + setattr(d, "train_mask", index_to_mask(split_idx['train'], d.y.shape[0])) + setattr(d, "val_mask", index_to_mask(split_idx['valid'], d.y.shape[0])) + setattr(d, "test_mask", index_to_mask(split_idx['test'], d.y.shape[0])) + datalist.append(d) + self.data, self.slices = self.collate(datalist) @register_dataset("ogbn-mag") class OGBNmagDataset(PygNodePropPredDataset): def __init__(self, path): dataset = "ogbn-mag" # path = osp.join(osp.dirname(osp.realpath(__file__)), "../..", "data", dataset) - PygNodePropPredDataset(name=dataset, root=path, transform=T.ToSparseTensor()) + PygNodePropPredDataset(name=dataset, root=path) super(OGBNmagDataset, self).__init__( - dataset, path, transform=T.ToSparseTensor() + dataset, path ) + + # Preprocessing + rel_data = self[0] + # We are only interested in paper <-> paper relations. + self.data = Data( + x=rel_data.x_dict['paper'], + edge_index=rel_data.edge_index_dict[('paper', 'cites', 'paper')], + y=rel_data.y_dict['paper']) + + #self.data = T.ToSparseTensor()(data) + #self[0].adj_t = self[0].adj_t.to_symmetric() + setattr(OGBNmagDataset, "metric", "Accuracy") setattr(OGBNmagDataset, "loss", "nll_loss") + split_idx = self.get_idx_split() + + datalist = [] + for d in self: + setattr(d, "train_mask", index_to_mask(split_idx['train'], d.y.shape[0])) + setattr(d, "val_mask", index_to_mask(split_idx['valid'], d.y.shape[0])) + setattr(d, "test_mask", index_to_mask(split_idx['test'], d.y.shape[0])) + datalist.append(d) + self.data, self.slices = self.collate(datalist) # OGBG @@ -83,7 +153,7 @@ class OGBGmolhivDataset(PygGraphPropPredDataset): PygGraphPropPredDataset(name=dataset, root=path) super(OGBGmolhivDataset, self).__init__(dataset, path) setattr(OGBGmolhivDataset, "metric", "ROC-AUC") - setattr(OGBGmolhivDataset, "loss", "BCEWithLogitsLoss") + setattr(OGBGmolhivDataset, "loss", "binary_cross_entropy_with_logits") @register_dataset("ogbg-molpcba") @@ -94,7 +164,7 @@ class OGBGmolpcbaDataset(PygGraphPropPredDataset): PygGraphPropPredDataset(name=dataset, root=path) super(OGBGmolpcbaDataset, self).__init__(dataset, path) setattr(OGBGmolpcbaDataset, "metric", "AP") - setattr(OGBGmolpcbaDataset, "loss", "BCEWithLogitsLoss") + setattr(OGBGmolpcbaDataset, "loss", "binary_cross_entropy_with_logits") @register_dataset("ogbg-ppa") @@ -105,7 +175,7 @@ class OGBGppaDataset(PygGraphPropPredDataset): PygGraphPropPredDataset(name=dataset, root=path) super(OGBGppaDataset, self).__init__(dataset, path) setattr(OGBGppaDataset, "metric", "Accuracy") - setattr(OGBGppaDataset, "loss", "CrossEntropyLoss") + setattr(OGBGppaDataset, "loss", "cross_entropy") @register_dataset("ogbg-code") @@ -116,7 +186,7 @@ class OGBGcodeDataset(PygGraphPropPredDataset): PygGraphPropPredDataset(name=dataset, root=path) super(OGBGcodeDataset, self).__init__(dataset, path) setattr(OGBGcodeDataset, "metric", "F1 score") - setattr(OGBGcodeDataset, "loss", "CrossEntropyLoss") + setattr(OGBGcodeDataset, "loss", "cross_entropy") # OGBL @@ -127,7 +197,7 @@ class OGBLppaDataset(PygLinkPropPredDataset): def __init__(self, path): dataset = "ogbl-ppa" # path = osp.join(osp.dirname(osp.realpath(__file__)), "../..", "data", dataset) - PygLinkPropPredDataset(name=dataset, root=path, transform=T.ToSparseTensor()) + PygLinkPropPredDataset(name=dataset, root=path) super(OGBLppaDataset, self).__init__(dataset, path) setattr(OGBLppaDataset, "metric", "Hits@100") setattr(OGBLppaDataset, "loss", "pos_neg_loss") @@ -138,7 +208,7 @@ class OGBLcollabDataset(PygLinkPropPredDataset): def __init__(self, path): dataset = "ogbl-collab" # path = osp.join(osp.dirname(osp.realpath(__file__)), "../..", "data", dataset) - PygLinkPropPredDataset(name=dataset, root=path, transform=T.ToSparseTensor()) + PygLinkPropPredDataset(name=dataset, root=path) super(OGBLcollabDataset, self).__init__(dataset, path) setattr(OGBLcollabDataset, "metric", "Hits@50") setattr(OGBLcollabDataset, "loss", "pos_neg_loss") @@ -149,7 +219,7 @@ class OGBLddiDataset(PygLinkPropPredDataset): def __init__(self, path): dataset = "ogbl-ddi" # path = osp.join(osp.dirname(osp.realpath(__file__)), "../..", "data", dataset) - PygLinkPropPredDataset(name=dataset, root=path, transform=T.ToSparseTensor()) + PygLinkPropPredDataset(name=dataset, root=path) super(OGBLddiDataset, self).__init__(dataset, path) setattr(OGBLddiDataset, "metric", "Hits@20") setattr(OGBLddiDataset, "loss", "pos_neg_loss") @@ -160,7 +230,7 @@ class OGBLcitationDataset(PygLinkPropPredDataset): def __init__(self, path): dataset = "ogbl-citation" # path = osp.join(osp.dirname(osp.realpath(__file__)), "../..", "data", dataset) - PygLinkPropPredDataset(name=dataset, root=path, transform=T.ToSparseTensor()) + PygLinkPropPredDataset(name=dataset, root=path) super(OGBLcitationDataset, self).__init__(dataset, path) setattr(OGBLcitationDataset, "metric", "MRR") setattr(OGBLcitationDataset, "loss", "pos_neg_loss") @@ -171,7 +241,7 @@ class OGBLwikikgDataset(PygLinkPropPredDataset): def __init__(self, path): dataset = "ogbl-wikikg" # path = osp.join(osp.dirname(osp.realpath(__file__)), "../..", "data", dataset) - PygLinkPropPredDataset(name=dataset, root=path, transform=T.ToSparseTensor()) + PygLinkPropPredDataset(name=dataset, root=path) super(OGBLwikikgDataset, self).__init__(dataset, path) setattr(OGBLwikikgDataset, "metric", "MRR") setattr(OGBLwikikgDataset, "loss", "pos_neg_loss") @@ -182,7 +252,7 @@ class OGBLbiokgDataset(PygLinkPropPredDataset): def __init__(self, path): dataset = "ogbl-biokg" # path = osp.join(osp.dirname(osp.realpath(__file__)), "../..", "data", dataset) - PygLinkPropPredDataset(name=dataset, root=path, transform=T.ToSparseTensor()) + PygLinkPropPredDataset(name=dataset, root=path) super(OGBLbiokgDataset, self).__init__(dataset, path) setattr(OGBLbiokgDataset, "metric", "MRR") setattr(OGBLbiokgDataset, "loss", "pos_neg_loss") diff --git a/autogl/datasets/utils.py b/autogl/datasets/utils.py index 7b9e38e..23d98d4 100644 --- a/autogl/datasets/utils.py +++ b/autogl/datasets/utils.py @@ -4,12 +4,12 @@ from torch_geometric.data import DataLoader from sklearn.model_selection import StratifiedKFold -def get_label_number(data): +def get_label_number(dataset): r"""Get the number of labels in this dataset as dict.""" label_num = {} - labels = data.y.unique().cpu().detach().numpy().tolist() + labels = dataset.data.y.unique().cpu().detach().numpy().tolist() for label in labels: - label_num[label] = (data.y == label).sum().item() + label_num[label] = (dataset.data.y == label).sum().item() return label_num @@ -19,7 +19,7 @@ def index_to_mask(index, size): return mask -def random_splits_mask(data, train_ratio=0.2, val_ratio=0.4, seed=None): +def random_splits_mask(dataset, train_ratio=0.2, val_ratio=0.4, seed=None): r"""If the data has masks for train/val/test, return the splits with specific ratio. Parameters @@ -37,6 +37,7 @@ def random_splits_mask(data, train_ratio=0.2, val_ratio=0.4, seed=None): assert ( train_ratio + val_ratio <= 1 ), "the sum of train_ratio and val_ratio is larger than 1" + data = dataset[0] r_s = torch.get_rng_state() if torch.cuda.is_available(): r_s_cuda = torch.cuda.get_rng_state() @@ -61,11 +62,15 @@ def random_splits_mask(data, train_ratio=0.2, val_ratio=0.4, seed=None): if torch.cuda.is_available(): torch.cuda.set_rng_state(r_s_cuda) - return data + dataset.data, dataset.slices = dataset.collate([d for d in dataset]) + # while type(dataset.data.num_nodes) == list: + # dataset.data.num_nodes = dataset.data.num_nodes[0] + # dataset.data.num_nodes = dataset.data.num_nodes[0] + return dataset def random_splits_mask_class( - data, + dataset, num_train_per_class=20, num_val_per_class=30, num_val=None, @@ -97,6 +102,8 @@ def random_splits_mask_class( seed : int random seed for splitting dataset. """ + data = dataset[0] + r_s = torch.get_rng_state() if torch.cuda.is_available(): r_s_cuda = torch.cuda.get_rng_state() @@ -152,7 +159,12 @@ def random_splits_mask_class( torch.set_rng_state(r_s) if torch.cuda.is_available(): torch.cuda.set_rng_state(r_s_cuda) - return data + + dataset.data, dataset.slices = dataset.collate([d for d in dataset]) + # while type(dataset.data.num_nodes) == list: + # dataset.data.num_nodes = dataset.data.num_nodes[0] + # dataset.data.num_nodes = dataset.data.num_nodes[0] + return dataset def graph_cross_validation(dataset, n_splits=10, shuffle=True, random_seed=42): diff --git a/autogl/solver/classifier/node_classifier.py b/autogl/solver/classifier/node_classifier.py index 368d1cf..9274e1b 100644 --- a/autogl/solver/classifier/node_classifier.py +++ b/autogl/solver/classifier/node_classifier.py @@ -262,19 +262,17 @@ class AutoNodeClassifier(BaseClassifier): ) val_split = val_split if val_split > 1 else int(val_split * size) utils.random_splits_mask_class( - dataset.data, + dataset, num_train_per_class=train_split // dataset.num_classes, num_val_per_class=val_split // dataset.num_classes, seed=seed, ) - dataset.data, dataset.slices = dataset.collate([dataset.data]) else: train_split = train_split if train_split < 1 else train_split / size val_split = val_split if val_split < 1 else val_split / size utils.random_splits_mask( - dataset.data, train_ratio=train_split, val_ratio=val_split + dataset, train_ratio=train_split, val_ratio=val_split ) - dataset.data, dataset.slices = dataset.collate([dataset.data]) else: assert hasattr(dataset.data, "train_mask") and hasattr( dataset.data, "val_mask" diff --git a/examples/node_classification.py b/examples/node_classification.py index 8b9fdb7..1e7a61e 100644 --- a/examples/node_classification.py +++ b/examples/node_classification.py @@ -46,7 +46,7 @@ if __name__ == '__main__': if args.dataset in ['cora', 'citeseer', 'pubmed']: autoClassifier.fit(dataset, time_limit=3600, evaluation_method=[Acc]) else: - autoClassifier.fit(dataset, time_limit=3600, evaluation_method=[Acc], seed=seed, train_split=20*dataset.num_classes, val_split=30*dataset.num_classes) + autoClassifier.fit(dataset, time_limit=3600, evaluation_method=[Acc], seed=seed, train_split=20*dataset.num_classes, val_split=30*dataset.num_classes, balanced=False) val = autoClassifier.get_model_by_performance(0)[0].get_valid_score()[0] print('val acc: ', val) From 4754e47ef8cd0dff14d500479ff4f573131b6630 Mon Sep 17 00:00:00 2001 From: cluster32 Date: Mon, 21 Dec 2020 14:29:50 +0800 Subject: [PATCH 04/11] fix AutoNE --- autogl/module/hpo/autone.py | 31 +++++++++++++++++++++---------- autogl/module/hpo/base.py | 2 +- 2 files changed, 22 insertions(+), 11 deletions(-) diff --git a/autogl/module/hpo/autone.py b/autogl/module/hpo/autone.py index a527619..7889fa0 100644 --- a/autogl/module/hpo/autone.py +++ b/autogl/module/hpo/autone.py @@ -17,6 +17,12 @@ from torch_geometric.data import GraphSAINTRandomWalkSampler from ..feature.subgraph.nx import NxSubgraph, NxLargeCliqueSize from ..feature.subgraph import nx, SgNetLSD +from torch_geometric.data import InMemoryDataset + +class _MyDataset(InMemoryDataset): + def __init__(self, datalist) -> None: + super().__init__() + self.data, self.slices = self.collate(datalist) @register_hpo("autone") class AutoNE(BaseHPOptimizer): @@ -39,9 +45,9 @@ class AutoNE(BaseHPOptimizer): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - self.max_evals = kwargs.get("max_evals", 100) - self.subgraphs = kwargs.get("subgraphs", 5) - self.sub_evals = kwargs.get("sub_evals", 5) + self.max_evals = kwargs.get("max_evals", 3) + self.subgraphs = kwargs.get("subgraphs", 2) + self.sub_evals = kwargs.get("sub_evals", 2) self.sample_batch_size = kwargs.get("sample_batch_size", 150) self.sample_walk_length = kwargs.get("sample_walk_length", 2) @@ -51,6 +57,8 @@ class AutoNE(BaseHPOptimizer): See .base.BaseHPOptimizer.optimize """ + self.feval_name = trainer.get_feval(return_major=True).get_eval_name() + self.is_higher_better = trainer.get_feval(return_major=True).is_higher_better() space = trainer.hyper_parameter_space + trainer.model.hyper_parameter_space current_space = self._encode_para(space) @@ -65,15 +73,18 @@ class AutoNE(BaseHPOptimizer): ) results = [] for data in loader: - results.append(data) + in_dataset= _MyDataset([data]) + results.append(in_dataset) return results func = SgNetLSD() def get_wne(graph): - func.fit_transform(graph) - transform = nx.NxSubgraph.compose(map(lambda x: x(), nx.NX_EXTRACTORS)) - gf = transform.fit_transform(graph).gf + graph=func.fit_transform(graph) + # transform = nx.NxSubgraph.compose(map(lambda x: x(), nx.NX_EXTRACTORS)) + # print(type(graph)) + #gf = transform.fit_transform(graph).data.gf + gf = graph.data.gf fin = list(gf[0]) + list(map(lambda x: float(x), gf[1:])) return fin @@ -117,7 +128,7 @@ class AutoNE(BaseHPOptimizer): best_res = None best_trainer = None best_para = None - wne = get_wne(dataset.data) + wne = get_wne(dataset) for t in range(s): if time.time() - start_time > time_limit: self.logger.info("Time out of limit, Epoch: {}".format(str(i))) @@ -129,7 +140,7 @@ class AutoNE(BaseHPOptimizer): para = params.x2dict(X_temp) externel_para, trial_para = self._decode_para(para) current_trainer, res_temp = fn(dataset, externel_para) - self._print_info(externel_para, res_temp, trainer) + self._print_info(externel_para, res_temp) X_reg = params.dict2x(trial_para) X.append(np.hstack((X_reg, wne))) @@ -150,7 +161,7 @@ class AutoNE(BaseHPOptimizer): decoded_json, _ = self._decode_para(best_para) self.logger.info("Best Parameter:") - self._print_info(decoded_json, best_res, trainer) + self._print_info(decoded_json, best_res) return best_trainer, decoded_json diff --git a/autogl/module/hpo/base.py b/autogl/module/hpo/base.py index 94b8f09..1e666dd 100644 --- a/autogl/module/hpo/base.py +++ b/autogl/module/hpo/base.py @@ -174,7 +174,7 @@ class BaseHPOptimizer: elif val > old_para["maxValue"]: val = old_para["maxValue"] if old_para["type"] == "INTEGER": - val = round(val) + val = int(round(val)) externel_para[name] = val trial_para[name] = ( val if old_para["scalingType"] != "LOG" else math.log(val) From b74264e169322e0de3ee86044c077b4baa1fd23c Mon Sep 17 00:00:00 2001 From: cluster32 Date: Mon, 21 Dec 2020 14:45:38 +0800 Subject: [PATCH 05/11] add refer to advisor --- autogl/module/hpo/suggestion/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/autogl/module/hpo/suggestion/__init__.py b/autogl/module/hpo/suggestion/__init__.py index e69de29..6dfafe0 100644 --- a/autogl/module/hpo/suggestion/__init__.py +++ b/autogl/module/hpo/suggestion/__init__.py @@ -0,0 +1 @@ +# Files in this folder are reproduced from https://github.com/tobegit3hub/advisor with some changes. \ No newline at end of file From d356ebdbd630914145787f4eb312be6c04af607b Mon Sep 17 00:00:00 2001 From: cluster32 Date: Mon, 21 Dec 2020 15:14:03 +0800 Subject: [PATCH 06/11] update default AutoNE para --- autogl/module/hpo/autone.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/autogl/module/hpo/autone.py b/autogl/module/hpo/autone.py index 7889fa0..d254a19 100644 --- a/autogl/module/hpo/autone.py +++ b/autogl/module/hpo/autone.py @@ -45,9 +45,9 @@ class AutoNE(BaseHPOptimizer): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - self.max_evals = kwargs.get("max_evals", 3) - self.subgraphs = kwargs.get("subgraphs", 2) - self.sub_evals = kwargs.get("sub_evals", 2) + self.max_evals = kwargs.get("max_evals", 100) + self.subgraphs = kwargs.get("subgraphs", 5) + self.sub_evals = kwargs.get("sub_evals", 5) self.sample_batch_size = kwargs.get("sample_batch_size", 150) self.sample_walk_length = kwargs.get("sample_walk_length", 2) From 65762071f2eb6c86984217b8e513b04d23eb0453 Mon Sep 17 00:00:00 2001 From: Frozenmad Date: Mon, 21 Dec 2020 20:58:59 +0800 Subject: [PATCH 07/11] rename autograph to autogl --- autogl/datasets/__init__.py | 15 ++------------- autogl/datasets/utils.py | 8 ++++---- autogl/module/model/__init__.py | 7 ------- autogl/module/train/__init__.py | 10 ---------- autogl/module/train/graph_classification.py | 6 +++--- autogl/module/train/node_classification.py | 6 +++--- 6 files changed, 12 insertions(+), 40 deletions(-) diff --git a/autogl/datasets/__init__.py b/autogl/datasets/__init__.py index aaa5eac..826b9b2 100644 --- a/autogl/datasets/__init__.py +++ b/autogl/datasets/__init__.py @@ -17,7 +17,7 @@ DATASET_DICT = {} def register_dataset(name): """ - New dataset types can be added to autograph with the :func:`register_dataset` + New dataset types can be added to autogl with the :func:`register_dataset` function decorator. For example:: @@ -37,7 +37,7 @@ def register_dataset(name): pyg and not issubclass(cls, torch_geometric.data.Dataset) ): raise ValueError( - "Dataset ({}: {}) must extend autograph.data.Dataset".format( + "Dataset ({}: {}) must extend autogl.data.Dataset".format( name, cls.__name__ ) ) @@ -106,17 +106,6 @@ from .utils import ( graph_get_split, ) -""" -# automatically import any Python files in the datasets/ directory -for file in os.listdir(os.path.dirname(__file__)): - if file.endswith(".py") and not file.startswith("_"): - dataset_name = file[: file.find(".py")] - if not pyg and dataset_name.startswith("pyg"): - continue - module = importlib.import_module("autograph.datasets." + dataset_name) -""" - - def build_dataset(args, path="~/.cache-autogl/"): path = osp.join(path, "data", args.dataset) path = os.path.expanduser(path) diff --git a/autogl/datasets/utils.py b/autogl/datasets/utils.py index 23d98d4..8b677ee 100644 --- a/autogl/datasets/utils.py +++ b/autogl/datasets/utils.py @@ -168,7 +168,7 @@ def random_splits_mask_class( def graph_cross_validation(dataset, n_splits=10, shuffle=True, random_seed=42): - r"""Cross validation for graph classification data, returning one fold with specific idx in autograph.datasets or pyg.Dataloader(default) + r"""Cross validation for graph classification data, returning one fold with specific idx in autogl.datasets or pyg.Dataloader(default) Parameters ---------- @@ -315,7 +315,7 @@ def graph_get_split(dataset, mask="train", is_loader=True, batch_size=128): return with which dataset/dataloader is_loader : bool - return with autograph.datasets or pyg.Dataloader + return with autogl.datasets or pyg.Dataloader batch_size : int batch_size for generateing Dataloader @@ -332,7 +332,7 @@ def graph_get_split(dataset, mask="train", is_loader=True, batch_size=128): ''' def graph_cross_validation(dataset, n_splits = 10, shuffle = True, random_seed = 42, fold_idx = 0, batch_size = 32, dataloader = True): - r"""Cross validation for graph classification data, returning one fold with specific idx in autograph.datasets or pyg.Dataloader(default) + r"""Cross validation for graph classification data, returning one fold with specific idx in autogl.datasets or pyg.Dataloader(default) Parameters ---------- @@ -355,7 +355,7 @@ def graph_cross_validation(dataset, n_splits = 10, shuffle = True, random_seed = batch_size for generateing Dataloader dataloader : bool - return with autograph.datasets or pyg.Dataloader + return with autogl.datasets or pyg.Dataloader """ skf = StratifiedKFold(n_splits=n_splits, shuffle = shuffle, random_state = random_seed) idx_list = [] diff --git a/autogl/module/model/__init__.py b/autogl/module/model/__init__.py index 69c0272..9eb8495 100644 --- a/autogl/module/model/__init__.py +++ b/autogl/module/model/__init__.py @@ -17,13 +17,6 @@ def register_model(name): return register_model_cls - -# automatically import any Python files in this directory -# for file in os.listdir(os.path.dirname(__file__)): -# if file.endswith(".py") and not file.startswith("_"): -# file_name = file[: file.find(".py")] -# module = importlib.import_module("autograph.module.model." + file_name) - from .base import BaseModel from .topkpool import AutoTopkpool from .graphsage import AutoSAGE diff --git a/autogl/module/train/__init__.py b/autogl/module/train/__init__.py index fb8975c..36fd434 100644 --- a/autogl/module/train/__init__.py +++ b/autogl/module/train/__init__.py @@ -36,16 +36,6 @@ def register_evaluate(*name): return register_evaluate_cls - -""" -# automatically import any Python files in this directory -for file in os.listdir(os.path.dirname(__file__)): - if file.endswith(".py") and not file.startswith("_"): - file_name = file[: file.find(".py")] - module = importlib.import_module("autograph.module.train." + file_name) -""" - - def get_feval(feval): if isinstance(feval, str): return EVALUATE_DICT[feval] diff --git a/autogl/module/train/graph_classification.py b/autogl/module/train/graph_classification.py index b285b79..e365021 100644 --- a/autogl/module/train/graph_classification.py +++ b/autogl/module/train/graph_classification.py @@ -214,7 +214,7 @@ class GraphClassificationTrainer(BaseTrainer): Returns ------- - self: ``autograph.train.GraphClassificationTrainer`` + self: ``autogl.train.GraphClassificationTrainer`` A reference of current trainer. """ @@ -289,7 +289,7 @@ class GraphClassificationTrainer(BaseTrainer): Returns ------- - self: ``autograph.train.GraphClassificationTrainer`` + self: ``autogl.train.GraphClassificationTrainer`` A reference of current trainer. """ @@ -507,7 +507,7 @@ class GraphClassificationTrainer(BaseTrainer): Returns ------- - self: ``autograph.train.GraphClassificationTrainer`` + self: ``autogl.train.GraphClassificationTrainer`` A new instance of trainer. """ diff --git a/autogl/module/train/node_classification.py b/autogl/module/train/node_classification.py index 9ed195c..14970f7 100644 --- a/autogl/module/train/node_classification.py +++ b/autogl/module/train/node_classification.py @@ -191,7 +191,7 @@ class NodeClassificationTrainer(BaseTrainer): Returns ------- - self: ``autograph.train.NodeClassificationTrainer`` + self: ``autogl.train.NodeClassificationTrainer`` A reference of current trainer. """ @@ -261,7 +261,7 @@ class NodeClassificationTrainer(BaseTrainer): Returns ------- - self: ``autograph.train.NodeClassificationTrainer`` + self: ``autogl.train.NodeClassificationTrainer`` A reference of current trainer. """ @@ -456,7 +456,7 @@ class NodeClassificationTrainer(BaseTrainer): Returns ------- - self: ``autograph.train.NodeClassificationTrainer`` + self: ``autogl.train.NodeClassificationTrainer`` A new instance of trainer. """ From 05f09c5f7b461d1822f4e8043d371aaa232e21e0 Mon Sep 17 00:00:00 2001 From: Frozenmad Date: Mon, 21 Dec 2020 21:07:44 +0800 Subject: [PATCH 08/11] change default config to gcn small --- examples/node_classification.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/node_classification.py b/examples/node_classification.py index 1e7a61e..af7334d 100644 --- a/examples/node_classification.py +++ b/examples/node_classification.py @@ -16,7 +16,7 @@ if __name__ == '__main__': from argparse import ArgumentParser parser = ArgumentParser() parser.add_argument('--dataset', default='cora', type=str) - parser.add_argument('--configs', type=str, default='../configs/node_classification.yaml') + parser.add_argument('--configs', type=str, default='../configs/nodeclf_gat_benchmark_small.yml') # following arguments will override parameters in the config file parser.add_argument('--hpo', type=str, default='random') parser.add_argument('--max_eval', type=int, default=5) From 0e2acedb68aef6be1063e0ea4db644e24db4be50 Mon Sep 17 00:00:00 2001 From: SwiftieH Date: Tue, 22 Dec 2020 11:39:46 +0000 Subject: [PATCH 09/11] Update README in datasets --- autogl/datasets/README.md | 50 +++++++++++++++++++-------------------- 1 file changed, 24 insertions(+), 26 deletions(-) diff --git a/autogl/datasets/README.md b/autogl/datasets/README.md index 7764fd5..416bcb5 100644 --- a/autogl/datasets/README.md +++ b/autogl/datasets/README.md @@ -1,7 +1,7 @@ -Datasets are derived from CogDL +Datasets are derived from PyG, OGB and CogDL. ================= -Autograph now supports the following benchmarks for different tasks: +AutoGL now supports the following benchmarks for different tasks: - semi-supervised node classification: Cora, Citeseer, Pubmed, Amazon Computers\*, Amazon Photo\*, Coauthor CS\*, Coauthor Physics\*, Reddit (\*: using `utils.random_splits_mask_class` for splitting dataset is recommended.) @@ -16,19 +16,32 @@ Autograph now supports the following benchmarks for different tasks: | Coauthor Physics | ✓ | | ✓ | ✓ | ✓ | ✓ | | | | Reddit | ✓ | | ✓ | ✓ | ✓ | ✓ | | ✓ | + +- supervised graph classification: MUTAG, IMDB-B, IMDB-M, PROTEINS, COLLAB + +| Dataset | PyG | CogDL | x | y | edge_index | edge_attr | train/val/test node | train/val/test mask | adj| +| ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | +| Mutag | ✓ | | ✓ | ✓ | ✓ | ✓ | | | | +| IMDB-B | ✓ | | | ✓ | ✓ | | | | | +| IMDB-M | ✓ | | | ✓ | ✓ | | | | | +| PROTEINS | ✓ | | ✓ | ✓ | ✓ | | | | | +| COLLAB | ✓ | | | ✓ | ✓ | | | | | + +- node classification datasets from OGB: ogbn-products, ogbn-proteins, ogbn-arxiv, ogbn-papers100M and ogbn-mag. + +- graph classification datasets from OGB: ogbg-molhiv, ogbg-molpcba, ogbg-ppa and ogbg-code. + --- TODO: -Autograph now supports the following benchmarks for different tasks: +In future version, AutoGL will support the following benchmarks for different tasks: - unsupervised node classification: PPI, Blogcatalog, Wikipedia -- semi-supervised node classification: Cora, Citeseer, Pubmed - heterogeneous node classification: DBLP, ACM, IMDB - link prediction: PPI, Wikipedia, Blogcatalog - multiplex link prediction: Amazon, YouTube, Twitter -- unsupervised graph classification: MUTAG, IMDB-B, IMDB-M, PROTEINS, COLLAB -- supervised graph classification: MUTAG, IMDB-B, IMDB-M, PROTEINS, COLLAB - +- link prediction datasets from OGB: ogbl-ppa, ogbl-collab, ogbl-ddi, ogbl-citation, ogbl-wikikg and ogbl-biokg. + From f2e96bfa44e3e80a111089d38d37b3fdcf1ff1c5 Mon Sep 17 00:00:00 2001 From: Frozenmad Date: Wed, 23 Dec 2020 15:52:28 +0800 Subject: [PATCH 10/11] change version to 0.1.1 --- autogl/__init__.py | 2 +- docs/conf.py | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/autogl/__init__.py b/autogl/__init__.py index 3dc1f76..485f44a 100644 --- a/autogl/__init__.py +++ b/autogl/__init__.py @@ -1 +1 @@ -__version__ = "0.1.0" +__version__ = "0.1.1" diff --git a/docs/conf.py b/docs/conf.py index e6a59b4..2af4e14 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -23,7 +23,7 @@ copyright = '2020, THUMNLab/aglteam' author = 'THUMNLab/aglteam' # The full version, including alpha/beta/rc tags -release = 'v0.1.0' +release = 'v0.1.1' # -- General configuration --------------------------------------------------- diff --git a/setup.py b/setup.py index 20c0688..db1b30a 100644 --- a/setup.py +++ b/setup.py @@ -28,7 +28,7 @@ with open("README.md", 'r') as fh: ''' https://setuptools.readthedocs.io/en/latest/ ''' setup( name='auto-graph-learning', - version='0.1.0-20201207185314', + version='0.1.1', author='THUMNLab/aglteam', maintainer='THUMNLab/aglteam', author_email='xin_wang@tsinghua.edu.cn', From bdad1a32063911a4ddc666628b83a97d6ba9c500 Mon Sep 17 00:00:00 2001 From: Frozenmad Date: Wed, 23 Dec 2020 15:57:35 +0800 Subject: [PATCH 11/11] fix typo in example/nodeclf --- examples/node_classification.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/node_classification.py b/examples/node_classification.py index af7334d..939b555 100644 --- a/examples/node_classification.py +++ b/examples/node_classification.py @@ -16,7 +16,7 @@ if __name__ == '__main__': from argparse import ArgumentParser parser = ArgumentParser() parser.add_argument('--dataset', default='cora', type=str) - parser.add_argument('--configs', type=str, default='../configs/nodeclf_gat_benchmark_small.yml') + parser.add_argument('--configs', type=str, default='../configs/nodeclf_gcn_benchmark_small.yml') # following arguments will override parameters in the config file parser.add_argument('--hpo', type=str, default='random') parser.add_argument('--max_eval', type=int, default=5)