Fix bugs and update to v0.1.1 * fix bugs in ogb dataset support * change data to dataset in solver and trainer * fix AutoNE for node clf * fix some typos (tags/v0.3.1)
| @@ -1 +1 @@ | |||
| __version__ = "0.1.0" | |||
| __version__ = "0.1.1" | |||
| @@ -1,7 +1,7 @@ | |||
| Datasets are derived from CogDL | |||
| Datasets are derived from PyG, OGB and CogDL. | |||
| ================= | |||
| Autograph now supports the following benchmarks for different tasks: | |||
| AutoGL now supports the following benchmarks for different tasks: | |||
| - semi-supervised node classification: Cora, Citeseer, Pubmed, Amazon Computers\*, Amazon Photo\*, Coauthor CS\*, Coauthor Physics\*, Reddit (\*: using `utils.random_splits_mask_class` for splitting dataset is recommended.) | |||
| @@ -16,19 +16,32 @@ Autograph now supports the following benchmarks for different tasks: | |||
| | Coauthor Physics | ✓ | | ✓ | ✓ | ✓ | ✓ | | | | |||
| | Reddit | ✓ | | ✓ | ✓ | ✓ | ✓ | | ✓ | | |||
| - supervised graph classification: MUTAG, IMDB-B, IMDB-M, PROTEINS, COLLAB | |||
| | Dataset | PyG | CogDL | x | y | edge_index | edge_attr | train/val/test node | train/val/test mask | adj| | |||
| | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | | |||
| | Mutag | ✓ | | ✓ | ✓ | ✓ | ✓ | | | | | |||
| | IMDB-B | ✓ | | | ✓ | ✓ | | | | | | |||
| | IMDB-M | ✓ | | | ✓ | ✓ | | | | | | |||
| | PROTEINS | ✓ | | ✓ | ✓ | ✓ | | | | | | |||
| | COLLAB | ✓ | | | ✓ | ✓ | | | | | | |||
| - node classification datasets from OGB: ogbn-products, ogbn-proteins, ogbn-arxiv, ogbn-papers100M and ogbn-mag. | |||
| - graph classification datasets from OGB: ogbg-molhiv, ogbg-molpcba, ogbg-ppa and ogbg-code. | |||
| --- | |||
| TODO: | |||
| Autograph now supports the following benchmarks for different tasks: | |||
| In a future version, AutoGL will support the following benchmarks for different tasks: | |||
| - unsupervised node classification: PPI, Blogcatalog, Wikipedia | |||
| - semi-supervised node classification: Cora, Citeseer, Pubmed | |||
| - heterogeneous node classification: DBLP, ACM, IMDB | |||
| - link prediction: PPI, Wikipedia, Blogcatalog | |||
| - multiplex link prediction: Amazon, YouTube, Twitter | |||
| - unsupervised graph classification: MUTAG, IMDB-B, IMDB-M, PROTEINS, COLLAB | |||
| - supervised graph classification: MUTAG, IMDB-B, IMDB-M, PROTEINS, COLLAB | |||
| - link prediction datasets from OGB: ogbl-ppa, ogbl-collab, ogbl-ddi, ogbl-citation, ogbl-wikikg and ogbl-biokg. | |||
| <!-- | |||
| | Dataset | PyG | CogDL | x | y | edge_index | edge_attr | train/val/test node | train/val/test mask | adj| | |||
| | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | | |||
| | ACM | | ✓ | ✓ | ✓ | ✓ | | ✓ | | ✓ list | | |||
| @@ -41,27 +54,12 @@ Autograph now supports the following benchmarks for different tasks: | |||
| | Amazon | | ✓ | | | | | ✓ data | | | | |||
| | Twitter | | ✓ | | | | | ✓ data | | | | |||
| | Youtube | | ✓ | | | | | ✓ data | | | | |||
| | Cora | ✓ | | ✓ | ✓ | ✓ | ✓ | | ✓ | | | |||
| | Citeseer | ✓ | | ✓ | ✓ | ✓ | ✓ | | ✓ | | | |||
| | Pubmed | ✓ | | ✓ | ✓ | ✓ | ✓ | | ✓ | | | |||
| | Reddit | ✓ | | ✓ | ✓ | ✓ | ✓ | | ✓ | | | |||
| | Mutag | ✓ | | ✓ | ✓ | ✓ | ✓ | | | | | |||
| | IMDB-B | ✓ | | | ✓ | ✓ | | | | | | |||
| | IMDB-M | ✓ | | | ✓ | ✓ | | | | | | |||
| | PROTEINS | ✓ | | ✓ | ✓ | ✓ | | | | | | |||
| | COLLAB | ✓ | | | ✓ | ✓ | | | | | | |||
| | NCI1 | ✓ | | ✓ | ✓ | ✓ | | | | | | |||
| | NCI109 | ✓ | | ✓ | ✓ | ✓ | | | | | | |||
| | Enzyme | ✓ | | ✓ | ✓ | ✓ | | | | | | |||
| | Reddit-B | ✓ | | | ✓ | ✓ | | | | | | |||
| | Reddit-Multi-5k | ✓ | | | ✓ | ✓ | | | | | | |||
| | Reddit-Multi-12k | ✓ | | | ✓ | ✓ | | | | | | |||
| | PTC-MR | ✓ | | ✓ | ✓ | ✓ | ✓ | | | | | |||
| | NCI1 | ✓ | | ✓ | ✓ | ✓ | | | | | | |||
| | NCI109 | ✓ | | ✓ | ✓ | ✓ | | | | | | |||
| | Enzyme | ✓ | | ✓ | ✓ | ✓ | | | | | | |||
| --> | |||
| @@ -1,6 +1,7 @@ | |||
| import os.path as osp | |||
| import importlib | |||
| import os | |||
| import torch | |||
| from ..data.dataset import Dataset | |||
| @@ -16,7 +17,7 @@ DATASET_DICT = {} | |||
| def register_dataset(name): | |||
| """ | |||
| New dataset types can be added to autograph with the :func:`register_dataset` | |||
| New dataset types can be added to autogl with the :func:`register_dataset` | |||
| function decorator. | |||
| For example:: | |||
| @@ -36,7 +37,7 @@ def register_dataset(name): | |||
| pyg and not issubclass(cls, torch_geometric.data.Dataset) | |||
| ): | |||
| raise ValueError( | |||
| "Dataset ({}: {}) must extend autograph.data.Dataset".format( | |||
| "Dataset ({}: {}) must extend autogl.data.Dataset".format( | |||
| name, cls.__name__ | |||
| ) | |||
| ) | |||
| @@ -105,27 +106,22 @@ from .utils import ( | |||
| graph_get_split, | |||
| ) | |||
| """ | |||
| # automatically import any Python files in the datasets/ directory | |||
| for file in os.listdir(os.path.dirname(__file__)): | |||
| if file.endswith(".py") and not file.startswith("_"): | |||
| dataset_name = file[: file.find(".py")] | |||
| if not pyg and dataset_name.startswith("pyg"): | |||
| continue | |||
| module = importlib.import_module("autograph.datasets." + dataset_name) | |||
| """ | |||
| def build_dataset(args, path="~/.cache-autogl/"): | |||
| path = osp.join(path, "data", args.dataset) | |||
| path = os.path.expanduser(path) | |||
| return DATASET_DICT[args.dataset](path) | |||
| def build_dataset_from_name(dataset, path="~/.cache-autogl/"): | |||
| path = osp.join(path, "data", dataset) | |||
| def build_dataset_from_name(dataset_name, path="~/.cache-autogl/"): | |||
| path = osp.join(path, "data", dataset_name) | |||
| path = os.path.expanduser(path) | |||
| return DATASET_DICT[dataset](path) | |||
| dataset = DATASET_DICT[dataset_name](path) | |||
| if 'ogbn' in dataset_name: | |||
| #dataset.data, dataset.slices = dataset.collate([dataset.data]) | |||
| #dataset.data.num_nodes = dataset.data.num_nodes[0] | |||
| if dataset.data.y.shape[-1] == 1: | |||
| dataset.data.y = torch.squeeze(dataset.data.y) | |||
| return dataset | |||
| __all__ = [ | |||
| @@ -3,7 +3,8 @@ from ogb.nodeproppred import PygNodePropPredDataset | |||
| from ogb.graphproppred import PygGraphPropPredDataset | |||
| from ogb.linkproppred import PygLinkPropPredDataset | |||
| from . import register_dataset | |||
| from .utils import index_to_mask | |||
| from torch_geometric.data import Data | |||
| # OGBN | |||
| @@ -12,25 +13,55 @@ class OGBNproductsDataset(PygNodePropPredDataset): | |||
| def __init__(self, path): | |||
| dataset = "ogbn-products" | |||
| # path = osp.join(osp.dirname(osp.realpath(__file__)), "../..", "data", dataset) | |||
| PygNodePropPredDataset(name=dataset, root=path, transform=T.ToSparseTensor()) | |||
| PygNodePropPredDataset(name=dataset, root=path) | |||
| super(OGBNproductsDataset, self).__init__( | |||
| dataset, path, transform=T.ToSparseTensor() | |||
| dataset, path | |||
| ) | |||
| # Pre-compute GCN normalization. | |||
| #adj_t = self.data.adj_t.set_diag() | |||
| #deg = adj_t.sum(dim=1).to(torch.float) | |||
| #deg_inv_sqrt = deg.pow(-0.5) | |||
| #deg_inv_sqrt[deg_inv_sqrt == float('inf')] = 0 | |||
| #adj_t = deg_inv_sqrt.view(-1, 1) * adj_t * deg_inv_sqrt.view(1, -1) | |||
| #self.data.adj_t = adj_t | |||
| setattr(OGBNproductsDataset, "metric", "Accuracy") | |||
| setattr(OGBNproductsDataset, "loss", "nll_loss") | |||
| split_idx = self.get_idx_split() | |||
| datalist = [] | |||
| for d in self: | |||
| setattr(d, "train_mask", index_to_mask(split_idx['train'], d.y.shape[0])) | |||
| setattr(d, "val_mask", index_to_mask(split_idx['valid'], d.y.shape[0])) | |||
| setattr(d, "test_mask", index_to_mask(split_idx['test'], d.y.shape[0])) | |||
| datalist.append(d) | |||
| self.data, self.slices = self.collate(datalist) | |||
| @register_dataset("ogbn-proteins") | |||
| class OGBNproteinsDataset(PygNodePropPredDataset): | |||
| def __init__(self, path): | |||
| dataset = "ogbn-proteins" | |||
| # path = osp.join(osp.dirname(osp.realpath(__file__)), "../..", "data", dataset) | |||
| PygNodePropPredDataset(name=dataset, root=path, transform=T.ToSparseTensor()) | |||
| PygNodePropPredDataset(name=dataset, root=path) | |||
| super(OGBNproteinsDataset, self).__init__( | |||
| dataset, path, transform=T.ToSparseTensor() | |||
| dataset, path | |||
| ) | |||
| dataset_t = PygNodePropPredDataset(name=dataset, root = path, transform=T.ToSparseTensor()) | |||
| # Move edge features to node features. | |||
| self.data.x = dataset_t[0].adj_t.mean(dim=1) | |||
| #dataset_t[0].adj_t.set_value_(None) | |||
| del dataset_t | |||
| setattr(OGBNproteinsDataset, "metric", "ROC-AUC") | |||
| setattr(OGBNproteinsDataset, "loss", "BCEWithLogitsLoss") | |||
| setattr(OGBNproteinsDataset, "loss", "binary_cross_entropy_with_logits") | |||
| split_idx = self.get_idx_split() | |||
| datalist = [] | |||
| for d in self: | |||
| setattr(d, "train_mask", index_to_mask(split_idx['train'], d.y.shape[0])) | |||
| setattr(d, "val_mask", index_to_mask(split_idx['valid'], d.y.shape[0])) | |||
| setattr(d, "test_mask", index_to_mask(split_idx['test'], d.y.shape[0])) | |||
| datalist.append(d) | |||
| self.data, self.slices = self.collate(datalist) | |||
| @register_dataset("ogbn-arxiv") | |||
| @@ -38,38 +69,77 @@ class OGBNarxivDataset(PygNodePropPredDataset): | |||
| def __init__(self, path): | |||
| dataset = "ogbn-arxiv" | |||
| # path = osp.join(osp.dirname(osp.realpath(__file__)), "../..", "data", dataset) | |||
| PygNodePropPredDataset(name=dataset, root=path, transform=T.ToSparseTensor()) | |||
| PygNodePropPredDataset(name=dataset, root=path) | |||
| super(OGBNarxivDataset, self).__init__( | |||
| dataset, path, transform=T.ToSparseTensor() | |||
| dataset, path | |||
| ) | |||
| #self[0].adj_t = self[0].adj_t.to_symmetric() | |||
| setattr(OGBNarxivDataset, "metric", "Accuracy") | |||
| setattr(OGBNarxivDataset, "loss", "nll_loss") | |||
| split_idx = self.get_idx_split() | |||
| datalist = [] | |||
| for d in self: | |||
| setattr(d, "train_mask", index_to_mask(split_idx['train'], d.y.shape[0])) | |||
| setattr(d, "val_mask", index_to_mask(split_idx['valid'], d.y.shape[0])) | |||
| setattr(d, "test_mask", index_to_mask(split_idx['test'], d.y.shape[0])) | |||
| datalist.append(d) | |||
| self.data, self.slices = self.collate(datalist) | |||
| @register_dataset("ogbn-papers100M") | |||
| class OGBNpapers100MDataset(PygNodePropPredDataset): | |||
| def __init__(self, path): | |||
| dataset = "ogbn-papers100M" | |||
| # path = osp.join(osp.dirname(osp.realpath(__file__)), "../..", "data", dataset) | |||
| PygNodePropPredDataset(name=dataset, root=path, transform=T.ToSparseTensor()) | |||
| PygNodePropPredDataset(name=dataset, root=path) | |||
| super(OGBNpapers100MDataset, self).__init__( | |||
| dataset, path, transform=T.ToSparseTensor() | |||
| dataset, path | |||
| ) | |||
| setattr(OGBNpapers100MDataset, "metric", "Accuracy") | |||
| setattr(OGBNpapers100MDataset, "loss", "nll_loss") | |||
| split_idx = self.get_idx_split() | |||
| datalist = [] | |||
| for d in self: | |||
| setattr(d, "train_mask", index_to_mask(split_idx['train'], d.y.shape[0])) | |||
| setattr(d, "val_mask", index_to_mask(split_idx['valid'], d.y.shape[0])) | |||
| setattr(d, "test_mask", index_to_mask(split_idx['test'], d.y.shape[0])) | |||
| datalist.append(d) | |||
| self.data, self.slices = self.collate(datalist) | |||
| @register_dataset("ogbn-mag") | |||
| class OGBNmagDataset(PygNodePropPredDataset): | |||
| def __init__(self, path): | |||
| dataset = "ogbn-mag" | |||
| # path = osp.join(osp.dirname(osp.realpath(__file__)), "../..", "data", dataset) | |||
| PygNodePropPredDataset(name=dataset, root=path, transform=T.ToSparseTensor()) | |||
| PygNodePropPredDataset(name=dataset, root=path) | |||
| super(OGBNmagDataset, self).__init__( | |||
| dataset, path, transform=T.ToSparseTensor() | |||
| dataset, path | |||
| ) | |||
| # Preprocessing | |||
| rel_data = self[0] | |||
| # We are only interested in paper <-> paper relations. | |||
| self.data = Data( | |||
| x=rel_data.x_dict['paper'], | |||
| edge_index=rel_data.edge_index_dict[('paper', 'cites', 'paper')], | |||
| y=rel_data.y_dict['paper']) | |||
| #self.data = T.ToSparseTensor()(data) | |||
| #self[0].adj_t = self[0].adj_t.to_symmetric() | |||
| setattr(OGBNmagDataset, "metric", "Accuracy") | |||
| setattr(OGBNmagDataset, "loss", "nll_loss") | |||
| split_idx = self.get_idx_split() | |||
| datalist = [] | |||
| for d in self: | |||
| setattr(d, "train_mask", index_to_mask(split_idx['train'], d.y.shape[0])) | |||
| setattr(d, "val_mask", index_to_mask(split_idx['valid'], d.y.shape[0])) | |||
| setattr(d, "test_mask", index_to_mask(split_idx['test'], d.y.shape[0])) | |||
| datalist.append(d) | |||
| self.data, self.slices = self.collate(datalist) | |||
| # OGBG | |||
| @@ -83,7 +153,7 @@ class OGBGmolhivDataset(PygGraphPropPredDataset): | |||
| PygGraphPropPredDataset(name=dataset, root=path) | |||
| super(OGBGmolhivDataset, self).__init__(dataset, path) | |||
| setattr(OGBGmolhivDataset, "metric", "ROC-AUC") | |||
| setattr(OGBGmolhivDataset, "loss", "BCEWithLogitsLoss") | |||
| setattr(OGBGmolhivDataset, "loss", "binary_cross_entropy_with_logits") | |||
| @register_dataset("ogbg-molpcba") | |||
| @@ -94,7 +164,7 @@ class OGBGmolpcbaDataset(PygGraphPropPredDataset): | |||
| PygGraphPropPredDataset(name=dataset, root=path) | |||
| super(OGBGmolpcbaDataset, self).__init__(dataset, path) | |||
| setattr(OGBGmolpcbaDataset, "metric", "AP") | |||
| setattr(OGBGmolpcbaDataset, "loss", "BCEWithLogitsLoss") | |||
| setattr(OGBGmolpcbaDataset, "loss", "binary_cross_entropy_with_logits") | |||
| @register_dataset("ogbg-ppa") | |||
| @@ -105,7 +175,7 @@ class OGBGppaDataset(PygGraphPropPredDataset): | |||
| PygGraphPropPredDataset(name=dataset, root=path) | |||
| super(OGBGppaDataset, self).__init__(dataset, path) | |||
| setattr(OGBGppaDataset, "metric", "Accuracy") | |||
| setattr(OGBGppaDataset, "loss", "CrossEntropyLoss") | |||
| setattr(OGBGppaDataset, "loss", "cross_entropy") | |||
| @register_dataset("ogbg-code") | |||
| @@ -116,7 +186,7 @@ class OGBGcodeDataset(PygGraphPropPredDataset): | |||
| PygGraphPropPredDataset(name=dataset, root=path) | |||
| super(OGBGcodeDataset, self).__init__(dataset, path) | |||
| setattr(OGBGcodeDataset, "metric", "F1 score") | |||
| setattr(OGBGcodeDataset, "loss", "CrossEntropyLoss") | |||
| setattr(OGBGcodeDataset, "loss", "cross_entropy") | |||
| # OGBL | |||
| @@ -127,7 +197,7 @@ class OGBLppaDataset(PygLinkPropPredDataset): | |||
| def __init__(self, path): | |||
| dataset = "ogbl-ppa" | |||
| # path = osp.join(osp.dirname(osp.realpath(__file__)), "../..", "data", dataset) | |||
| PygLinkPropPredDataset(name=dataset, root=path, transform=T.ToSparseTensor()) | |||
| PygLinkPropPredDataset(name=dataset, root=path) | |||
| super(OGBLppaDataset, self).__init__(dataset, path) | |||
| setattr(OGBLppaDataset, "metric", "Hits@100") | |||
| setattr(OGBLppaDataset, "loss", "pos_neg_loss") | |||
| @@ -138,7 +208,7 @@ class OGBLcollabDataset(PygLinkPropPredDataset): | |||
| def __init__(self, path): | |||
| dataset = "ogbl-collab" | |||
| # path = osp.join(osp.dirname(osp.realpath(__file__)), "../..", "data", dataset) | |||
| PygLinkPropPredDataset(name=dataset, root=path, transform=T.ToSparseTensor()) | |||
| PygLinkPropPredDataset(name=dataset, root=path) | |||
| super(OGBLcollabDataset, self).__init__(dataset, path) | |||
| setattr(OGBLcollabDataset, "metric", "Hits@50") | |||
| setattr(OGBLcollabDataset, "loss", "pos_neg_loss") | |||
| @@ -149,7 +219,7 @@ class OGBLddiDataset(PygLinkPropPredDataset): | |||
| def __init__(self, path): | |||
| dataset = "ogbl-ddi" | |||
| # path = osp.join(osp.dirname(osp.realpath(__file__)), "../..", "data", dataset) | |||
| PygLinkPropPredDataset(name=dataset, root=path, transform=T.ToSparseTensor()) | |||
| PygLinkPropPredDataset(name=dataset, root=path) | |||
| super(OGBLddiDataset, self).__init__(dataset, path) | |||
| setattr(OGBLddiDataset, "metric", "Hits@20") | |||
| setattr(OGBLddiDataset, "loss", "pos_neg_loss") | |||
| @@ -160,7 +230,7 @@ class OGBLcitationDataset(PygLinkPropPredDataset): | |||
| def __init__(self, path): | |||
| dataset = "ogbl-citation" | |||
| # path = osp.join(osp.dirname(osp.realpath(__file__)), "../..", "data", dataset) | |||
| PygLinkPropPredDataset(name=dataset, root=path, transform=T.ToSparseTensor()) | |||
| PygLinkPropPredDataset(name=dataset, root=path) | |||
| super(OGBLcitationDataset, self).__init__(dataset, path) | |||
| setattr(OGBLcitationDataset, "metric", "MRR") | |||
| setattr(OGBLcitationDataset, "loss", "pos_neg_loss") | |||
| @@ -171,7 +241,7 @@ class OGBLwikikgDataset(PygLinkPropPredDataset): | |||
| def __init__(self, path): | |||
| dataset = "ogbl-wikikg" | |||
| # path = osp.join(osp.dirname(osp.realpath(__file__)), "../..", "data", dataset) | |||
| PygLinkPropPredDataset(name=dataset, root=path, transform=T.ToSparseTensor()) | |||
| PygLinkPropPredDataset(name=dataset, root=path) | |||
| super(OGBLwikikgDataset, self).__init__(dataset, path) | |||
| setattr(OGBLwikikgDataset, "metric", "MRR") | |||
| setattr(OGBLwikikgDataset, "loss", "pos_neg_loss") | |||
| @@ -182,7 +252,7 @@ class OGBLbiokgDataset(PygLinkPropPredDataset): | |||
| def __init__(self, path): | |||
| dataset = "ogbl-biokg" | |||
| # path = osp.join(osp.dirname(osp.realpath(__file__)), "../..", "data", dataset) | |||
| PygLinkPropPredDataset(name=dataset, root=path, transform=T.ToSparseTensor()) | |||
| PygLinkPropPredDataset(name=dataset, root=path) | |||
| super(OGBLbiokgDataset, self).__init__(dataset, path) | |||
| setattr(OGBLbiokgDataset, "metric", "MRR") | |||
| setattr(OGBLbiokgDataset, "loss", "pos_neg_loss") | |||
| @@ -4,12 +4,12 @@ from torch_geometric.data import DataLoader | |||
| from sklearn.model_selection import StratifiedKFold | |||
| def get_label_number(data): | |||
| def get_label_number(dataset): | |||
| r"""Get the number of labels in this dataset as dict.""" | |||
| label_num = {} | |||
| labels = data.y.unique().cpu().detach().numpy().tolist() | |||
| labels = dataset.data.y.unique().cpu().detach().numpy().tolist() | |||
| for label in labels: | |||
| label_num[label] = (data.y == label).sum().item() | |||
| label_num[label] = (dataset.data.y == label).sum().item() | |||
| return label_num | |||
| @@ -19,7 +19,7 @@ def index_to_mask(index, size): | |||
| return mask | |||
| def random_splits_mask(data, train_ratio=0.2, val_ratio=0.4, seed=None): | |||
| def random_splits_mask(dataset, train_ratio=0.2, val_ratio=0.4, seed=None): | |||
| r"""If the data has masks for train/val/test, return the splits with specific ratio. | |||
| Parameters | |||
| @@ -37,6 +37,7 @@ def random_splits_mask(data, train_ratio=0.2, val_ratio=0.4, seed=None): | |||
| assert ( | |||
| train_ratio + val_ratio <= 1 | |||
| ), "the sum of train_ratio and val_ratio is larger than 1" | |||
| data = dataset[0] | |||
| r_s = torch.get_rng_state() | |||
| if torch.cuda.is_available(): | |||
| r_s_cuda = torch.cuda.get_rng_state() | |||
| @@ -61,11 +62,15 @@ def random_splits_mask(data, train_ratio=0.2, val_ratio=0.4, seed=None): | |||
| if torch.cuda.is_available(): | |||
| torch.cuda.set_rng_state(r_s_cuda) | |||
| return data | |||
| dataset.data, dataset.slices = dataset.collate([d for d in dataset]) | |||
| # while type(dataset.data.num_nodes) == list: | |||
| # dataset.data.num_nodes = dataset.data.num_nodes[0] | |||
| # dataset.data.num_nodes = dataset.data.num_nodes[0] | |||
| return dataset | |||
| def random_splits_mask_class( | |||
| data, | |||
| dataset, | |||
| num_train_per_class=20, | |||
| num_val_per_class=30, | |||
| num_val=None, | |||
| @@ -97,6 +102,8 @@ def random_splits_mask_class( | |||
| seed : int | |||
| random seed for splitting dataset. | |||
| """ | |||
| data = dataset[0] | |||
| r_s = torch.get_rng_state() | |||
| if torch.cuda.is_available(): | |||
| r_s_cuda = torch.cuda.get_rng_state() | |||
| @@ -152,11 +159,16 @@ def random_splits_mask_class( | |||
| torch.set_rng_state(r_s) | |||
| if torch.cuda.is_available(): | |||
| torch.cuda.set_rng_state(r_s_cuda) | |||
| return data | |||
| dataset.data, dataset.slices = dataset.collate([d for d in dataset]) | |||
| # while type(dataset.data.num_nodes) == list: | |||
| # dataset.data.num_nodes = dataset.data.num_nodes[0] | |||
| # dataset.data.num_nodes = dataset.data.num_nodes[0] | |||
| return dataset | |||
| def graph_cross_validation(dataset, n_splits=10, shuffle=True, random_seed=42): | |||
| r"""Cross validation for graph classification data, returning one fold with specific idx in autograph.datasets or pyg.Dataloader(default) | |||
| r"""Cross validation for graph classification data, returning one fold with specific idx in autogl.datasets or pyg.Dataloader(default) | |||
| Parameters | |||
| ---------- | |||
| @@ -303,7 +315,7 @@ def graph_get_split(dataset, mask="train", is_loader=True, batch_size=128): | |||
| return with which dataset/dataloader | |||
| is_loader : bool | |||
| return with autograph.datasets or pyg.Dataloader | |||
| return with autogl.datasets or pyg.Dataloader | |||
| batch_size : int | |||
| batch_size for generating Dataloader | |||
| @@ -320,7 +332,7 @@ def graph_get_split(dataset, mask="train", is_loader=True, batch_size=128): | |||
| ''' | |||
| def graph_cross_validation(dataset, n_splits = 10, shuffle = True, random_seed = 42, fold_idx = 0, batch_size = 32, dataloader = True): | |||
| r"""Cross validation for graph classification data, returning one fold with specific idx in autograph.datasets or pyg.Dataloader(default) | |||
| r"""Cross validation for graph classification data, returning one fold with specific idx in autogl.datasets or pyg.Dataloader(default) | |||
| Parameters | |||
| ---------- | |||
| @@ -343,7 +355,7 @@ def graph_cross_validation(dataset, n_splits = 10, shuffle = True, random_seed = | |||
| batch_size for generating Dataloader | |||
| dataloader : bool | |||
| return with autograph.datasets or pyg.Dataloader | |||
| return with autogl.datasets or pyg.Dataloader | |||
| """ | |||
| skf = StratifiedKFold(n_splits=n_splits, shuffle = shuffle, random_state = random_seed) | |||
| idx_list = [] | |||
| @@ -17,6 +17,12 @@ from torch_geometric.data import GraphSAINTRandomWalkSampler | |||
| from ..feature.subgraph.nx import NxSubgraph, NxLargeCliqueSize | |||
| from ..feature.subgraph import nx, SgNetLSD | |||
| from torch_geometric.data import InMemoryDataset | |||
| class _MyDataset(InMemoryDataset): | |||
| def __init__(self, datalist) -> None: | |||
| super().__init__() | |||
| self.data, self.slices = self.collate(datalist) | |||
| @register_hpo("autone") | |||
| class AutoNE(BaseHPOptimizer): | |||
| @@ -51,6 +57,8 @@ class AutoNE(BaseHPOptimizer): | |||
| See .base.BaseHPOptimizer.optimize | |||
| """ | |||
| self.feval_name = trainer.get_feval(return_major=True).get_eval_name() | |||
| self.is_higher_better = trainer.get_feval(return_major=True).is_higher_better() | |||
| space = trainer.hyper_parameter_space + trainer.model.hyper_parameter_space | |||
| current_space = self._encode_para(space) | |||
| @@ -65,15 +73,18 @@ class AutoNE(BaseHPOptimizer): | |||
| ) | |||
| results = [] | |||
| for data in loader: | |||
| results.append(data) | |||
| in_dataset= _MyDataset([data]) | |||
| results.append(in_dataset) | |||
| return results | |||
| func = SgNetLSD() | |||
| def get_wne(graph): | |||
| func.fit_transform(graph) | |||
| transform = nx.NxSubgraph.compose(map(lambda x: x(), nx.NX_EXTRACTORS)) | |||
| gf = transform.fit_transform(graph).gf | |||
| graph=func.fit_transform(graph) | |||
| # transform = nx.NxSubgraph.compose(map(lambda x: x(), nx.NX_EXTRACTORS)) | |||
| # print(type(graph)) | |||
| #gf = transform.fit_transform(graph).data.gf | |||
| gf = graph.data.gf | |||
| fin = list(gf[0]) + list(map(lambda x: float(x), gf[1:])) | |||
| return fin | |||
| @@ -117,7 +128,7 @@ class AutoNE(BaseHPOptimizer): | |||
| best_res = None | |||
| best_trainer = None | |||
| best_para = None | |||
| wne = get_wne(dataset.data) | |||
| wne = get_wne(dataset) | |||
| for t in range(s): | |||
| if time.time() - start_time > time_limit: | |||
| self.logger.info("Time out of limit, Epoch: {}".format(str(i))) | |||
| @@ -129,7 +140,7 @@ class AutoNE(BaseHPOptimizer): | |||
| para = params.x2dict(X_temp) | |||
| externel_para, trial_para = self._decode_para(para) | |||
| current_trainer, res_temp = fn(dataset, externel_para) | |||
| self._print_info(externel_para, res_temp, trainer) | |||
| self._print_info(externel_para, res_temp) | |||
| X_reg = params.dict2x(trial_para) | |||
| X.append(np.hstack((X_reg, wne))) | |||
| @@ -150,7 +161,7 @@ class AutoNE(BaseHPOptimizer): | |||
| decoded_json, _ = self._decode_para(best_para) | |||
| self.logger.info("Best Parameter:") | |||
| self._print_info(decoded_json, best_res, trainer) | |||
| self._print_info(decoded_json, best_res) | |||
| return best_trainer, decoded_json | |||
| @@ -174,7 +174,7 @@ class BaseHPOptimizer: | |||
| elif val > old_para["maxValue"]: | |||
| val = old_para["maxValue"] | |||
| if old_para["type"] == "INTEGER": | |||
| val = round(val) | |||
| val = int(round(val)) | |||
| externel_para[name] = val | |||
| trial_para[name] = ( | |||
| val if old_para["scalingType"] != "LOG" else math.log(val) | |||
| @@ -0,0 +1 @@ | |||
| # Files in this folder are reproduced from https://github.com/tobegit3hub/advisor with some changes. | |||
| @@ -17,13 +17,6 @@ def register_model(name): | |||
| return register_model_cls | |||
| # automatically import any Python files in this directory | |||
| # for file in os.listdir(os.path.dirname(__file__)): | |||
| # if file.endswith(".py") and not file.startswith("_"): | |||
| # file_name = file[: file.find(".py")] | |||
| # module = importlib.import_module("autograph.module.model." + file_name) | |||
| from .base import BaseModel | |||
| from .topkpool import AutoTopkpool | |||
| from .graphsage import AutoSAGE | |||
| @@ -36,16 +36,6 @@ def register_evaluate(*name): | |||
| return register_evaluate_cls | |||
| """ | |||
| # automatically import any Python files in this directory | |||
| for file in os.listdir(os.path.dirname(__file__)): | |||
| if file.endswith(".py") and not file.startswith("_"): | |||
| file_name = file[: file.find(".py")] | |||
| module = importlib.import_module("autograph.module.train." + file_name) | |||
| """ | |||
| def get_feval(feval): | |||
| if isinstance(feval, str): | |||
| return EVALUATE_DICT[feval] | |||
| @@ -214,7 +214,7 @@ class GraphClassificationTrainer(BaseTrainer): | |||
| Returns | |||
| ------- | |||
| self: ``autograph.train.GraphClassificationTrainer`` | |||
| self: ``autogl.train.GraphClassificationTrainer`` | |||
| A reference of current trainer. | |||
| """ | |||
| @@ -289,7 +289,7 @@ class GraphClassificationTrainer(BaseTrainer): | |||
| Returns | |||
| ------- | |||
| self: ``autograph.train.GraphClassificationTrainer`` | |||
| self: ``autogl.train.GraphClassificationTrainer`` | |||
| A reference of current trainer. | |||
| """ | |||
| @@ -507,7 +507,7 @@ class GraphClassificationTrainer(BaseTrainer): | |||
| Returns | |||
| ------- | |||
| self: ``autograph.train.GraphClassificationTrainer`` | |||
| self: ``autogl.train.GraphClassificationTrainer`` | |||
| A new instance of trainer. | |||
| """ | |||
| @@ -191,7 +191,7 @@ class NodeClassificationTrainer(BaseTrainer): | |||
| Returns | |||
| ------- | |||
| self: ``autograph.train.NodeClassificationTrainer`` | |||
| self: ``autogl.train.NodeClassificationTrainer`` | |||
| A reference of current trainer. | |||
| """ | |||
| @@ -218,7 +218,7 @@ class NodeClassificationTrainer(BaseTrainer): | |||
| feval = self.feval[0] | |||
| else: | |||
| feval = self.feval | |||
| val_loss = self.evaluate(data, mask=data.val_mask, feval=feval) | |||
| val_loss = self.evaluate([data], mask=data.val_mask, feval=feval) | |||
| if feval.is_higher_better() is True: | |||
| val_loss = -val_loss | |||
| self.early_stopping(val_loss, self.model.model) | |||
| @@ -261,16 +261,17 @@ class NodeClassificationTrainer(BaseTrainer): | |||
| Returns | |||
| ------- | |||
| self: ``autograph.train.NodeClassificationTrainer`` | |||
| self: ``autogl.train.NodeClassificationTrainer`` | |||
| A reference of current trainer. | |||
| """ | |||
| self.train_only(dataset) | |||
| data = dataset[0] | |||
| self.train_only(data) | |||
| if keep_valid_result: | |||
| self.valid_result = self.predict_only(dataset)[dataset.val_mask].max(1)[1] | |||
| self.valid_result_prob = self.predict_only(dataset)[dataset.val_mask] | |||
| self.valid_result = self.predict_only(data)[data.val_mask].max(1)[1] | |||
| self.valid_result_prob = self.predict_only(data)[data.val_mask] | |||
| self.valid_score = self.evaluate( | |||
| dataset, mask=dataset.val_mask, feval=self.feval | |||
| dataset, mask=data.val_mask, feval=self.feval | |||
| ) | |||
| def predict(self, dataset, mask=None): | |||
| @@ -288,7 +289,6 @@ class NodeClassificationTrainer(BaseTrainer): | |||
| ------- | |||
| The prediction result of ``predict_proba``. | |||
| """ | |||
| dataset = dataset.to(self.device) | |||
| return self.predict_proba(dataset, mask=mask, in_log_format=True).max(1)[1] | |||
| def predict_proba(self, dataset, mask=None, in_log_format=False): | |||
| @@ -309,17 +309,18 @@ class NodeClassificationTrainer(BaseTrainer): | |||
| ------- | |||
| The prediction result. | |||
| """ | |||
| dataset = dataset.to(self.device) | |||
| data = dataset[0] | |||
| data = data.to(self.device) | |||
| if mask is not None: | |||
| if mask == "val": | |||
| mask = dataset.val_mask | |||
| mask = data.val_mask | |||
| elif mask == "test": | |||
| mask = dataset.test_mask | |||
| mask = data.test_mask | |||
| elif mask == "train": | |||
| mask = dataset.train_mask | |||
| mask = data.train_mask | |||
| else: | |||
| mask = dataset.test_mask | |||
| ret = self.predict_only(dataset, mask)[mask] | |||
| mask = data.test_mask | |||
| ret = self.predict_only(data, mask)[mask] | |||
| if in_log_format is True: | |||
| return ret | |||
| else: | |||
| @@ -398,23 +399,24 @@ class NodeClassificationTrainer(BaseTrainer): | |||
| res: The evaluation result on the given dataset. | |||
| """ | |||
| dataset = dataset.to(self.device) | |||
| data = dataset[0] | |||
| data = data.to(self.device) | |||
| test_mask = mask | |||
| if feval is None: | |||
| feval = self.feval | |||
| else: | |||
| feval = get_feval(feval) | |||
| if test_mask is None: | |||
| test_mask = dataset.test_mask | |||
| test_mask = data.test_mask | |||
| elif test_mask == "test": | |||
| test_mask = dataset.test_mask | |||
| test_mask = data.test_mask | |||
| elif test_mask == "val": | |||
| test_mask = dataset.val_mask | |||
| test_mask = data.val_mask | |||
| elif test_mask == "train": | |||
| test_mask = dataset.train_mask | |||
| test_mask = data.train_mask | |||
| y_pred_prob = self.predict_proba(dataset, mask) | |||
| y_pred = y_pred_prob.max(1)[1] | |||
| y_true = dataset.y[test_mask] | |||
| y_true = data.y[test_mask] | |||
| if not isinstance(feval, list): | |||
| feval = [feval] | |||
| @@ -454,7 +456,7 @@ class NodeClassificationTrainer(BaseTrainer): | |||
| Returns | |||
| ------- | |||
| self: ``autograph.train.NodeClassificationTrainer`` | |||
| self: ``autogl.train.NodeClassificationTrainer`` | |||
| A new instance of trainer. | |||
| """ | |||
| @@ -262,19 +262,17 @@ class AutoNodeClassifier(BaseClassifier): | |||
| ) | |||
| val_split = val_split if val_split > 1 else int(val_split * size) | |||
| utils.random_splits_mask_class( | |||
| dataset.data, | |||
| dataset, | |||
| num_train_per_class=train_split // dataset.num_classes, | |||
| num_val_per_class=val_split // dataset.num_classes, | |||
| seed=seed, | |||
| ) | |||
| dataset.data, dataset.slices = dataset.collate([dataset.data]) | |||
| else: | |||
| train_split = train_split if train_split < 1 else train_split / size | |||
| val_split = val_split if val_split < 1 else val_split / size | |||
| utils.random_splits_mask( | |||
| dataset.data, train_ratio=train_split, val_ratio=val_split | |||
| dataset, train_ratio=train_split, val_ratio=val_split | |||
| ) | |||
| dataset.data, dataset.slices = dataset.collate([dataset.data]) | |||
| else: | |||
| assert hasattr(dataset.data, "train_mask") and hasattr( | |||
| dataset.data, "val_mask" | |||
| @@ -288,18 +286,17 @@ class AutoNodeClassifier(BaseClassifier): | |||
| if self.feature_module is not None: | |||
| dataset = self.feature_module.fit_transform(dataset, inplace=inplace) | |||
| data = dataset[0] | |||
| assert data.x is not None, ( | |||
| self.dataset = dataset | |||
| assert self.dataset[0].x is not None, ( | |||
| "Does not support fit on non node-feature dataset!" | |||
| " Please add node features to dataset or specify feature engineers that generate" | |||
| " node features." | |||
| ) | |||
| self.data = data | |||
| # initialize graph networks | |||
| self._init_graph_module( | |||
| self.gml, | |||
| num_features=data.x.shape[1], | |||
| num_features=self.dataset[0].x.shape[1], | |||
| num_classes=dataset.num_classes, | |||
| feval=evaluator_list, | |||
| device=self.runtime_device, | |||
| @@ -319,7 +316,7 @@ class AutoNodeClassifier(BaseClassifier): | |||
| optimized = model | |||
| else: | |||
| optimized, _ = self.hpo_module.optimize( | |||
| trainer=model, dataset=data, time_limit=time_for_each_model | |||
| trainer=model, dataset=self.dataset, time_limit=time_for_each_model | |||
| ) | |||
| # to save memory, every derived trainer is moved to the cpu | |||
| optimized.to(torch.device("cpu")) | |||
| @@ -342,7 +339,7 @@ class AutoNodeClassifier(BaseClassifier): | |||
| if self.ensemble_module is not None: | |||
| performance = self.ensemble_module.fit( | |||
| result_valid, | |||
| data.y[data.val_mask].cpu().numpy(), | |||
| self.dataset[0].y[self.dataset[0].val_mask].cpu().numpy(), | |||
| names, | |||
| evaluator_list, | |||
| n_classes=dataset.num_classes, | |||
| @@ -489,14 +486,12 @@ class AutoNodeClassifier(BaseClassifier): | |||
| the number of classes. The prediction on given dataset. | |||
| """ | |||
| if dataset is None: | |||
| data = self.data | |||
| assert data is not None, ( | |||
| dataset = self.dataset | |||
| assert dataset is not None, ( | |||
| "Please execute fit() first before" " predicting on remembered dataset" | |||
| ) | |||
| elif not inplaced and self.feature_module is not None: | |||
| data = self.feature_module.transform(dataset, inplace=inplace)[0] | |||
| else: | |||
| data = dataset[0] | |||
| dataset = self.feature_module.transform(dataset, inplace=inplace) | |||
| if use_ensemble: | |||
| LOGGER.info("Ensemble argument on, will try using ensemble model.") | |||
| @@ -514,7 +509,7 @@ class AutoNodeClassifier(BaseClassifier): | |||
| names = [] | |||
| for model_name in self.trained_models: | |||
| predict_result.append( | |||
| self._predict_proba_by_name(data, model_name, mask) | |||
| self._predict_proba_by_name(dataset, model_name, mask) | |||
| ) | |||
| names.append(model_name) | |||
| return self.ensemble_module.ensemble(predict_result, names) | |||
| @@ -528,11 +523,11 @@ class AutoNodeClassifier(BaseClassifier): | |||
| if use_best or (use_ensemble and self.ensemble_module is None): | |||
| # just return the best model we have found | |||
| name = self.leaderboard.get_best_model() | |||
| return self._predict_proba_by_name(data, name, mask) | |||
| return self._predict_proba_by_name(dataset, name, mask) | |||
| if name is not None: | |||
| # return model performance by name | |||
| return self._predict_proba_by_name(data, name, mask) | |||
| return self._predict_proba_by_name(dataset, name, mask) | |||
| LOGGER.error( | |||
| "No model name is given while ensemble and best arguments are off." | |||
| @@ -23,7 +23,7 @@ copyright = '2020, THUMNLab/aglteam' | |||
| author = 'THUMNLab/aglteam' | |||
| # The full version, including alpha/beta/rc tags | |||
| release = 'v0.1.0' | |||
| release = 'v0.1.1' | |||
| # -- General configuration --------------------------------------------------- | |||
| @@ -16,7 +16,7 @@ if __name__ == '__main__': | |||
| from argparse import ArgumentParser | |||
| parser = ArgumentParser() | |||
| parser.add_argument('--dataset', default='cora', type=str) | |||
| parser.add_argument('--configs', type=str, default='../configs/node_classification.yaml') | |||
| parser.add_argument('--configs', type=str, default='../configs/nodeclf_gcn_benchmark_small.yml') | |||
| # following arguments will override parameters in the config file | |||
| parser.add_argument('--hpo', type=str, default='random') | |||
| parser.add_argument('--max_eval', type=int, default=5) | |||
| @@ -46,7 +46,7 @@ if __name__ == '__main__': | |||
| if args.dataset in ['cora', 'citeseer', 'pubmed']: | |||
| autoClassifier.fit(dataset, time_limit=3600, evaluation_method=[Acc]) | |||
| else: | |||
| autoClassifier.fit(dataset, time_limit=3600, evaluation_method=[Acc], seed=seed, train_split=20*dataset.num_classes, val_split=30*dataset.num_classes) | |||
| autoClassifier.fit(dataset, time_limit=3600, evaluation_method=[Acc], seed=seed, train_split=20*dataset.num_classes, val_split=30*dataset.num_classes, balanced=False) | |||
| val = autoClassifier.get_model_by_performance(0)[0].get_valid_score()[0] | |||
| print('val acc: ', val) | |||
| @@ -28,7 +28,7 @@ with open("README.md", 'r') as fh: | |||
| ''' https://setuptools.readthedocs.io/en/latest/ ''' | |||
| setup( | |||
| name='auto-graph-learning', | |||
| version='0.1.0-20201207185314', | |||
| version='0.1.1', | |||
| author='THUMNLab/aglteam', | |||
| maintainer='THUMNLab/aglteam', | |||
| author_email='xin_wang@tsinghua.edu.cn', | |||