From 22c8e9798d67ebbdde4f03bfccda5904deb8b9c9 Mon Sep 17 00:00:00 2001 From: caij <626820136@qq.com> Date: Mon, 27 Dec 2021 19:41:53 +0800 Subject: [PATCH] tutoral het --- .../_dgl_heterogeneous_datasets.py | 1 + autogl/module/model/encoders/_dgl/_gat.py | 11 +- autogl/module/model/encoders/_dgl/_gcn.py | 4 +- docs/docfile/tutorial/t_hetero_node_clf.rst | 117 ++++++++++++++ .../performance/link_prediction/pyg/helper.py | 47 ++++++ .../pyg/link_prediction_base.py | 6 +- .../pyg/link_prediction_model.py | 9 +- .../pyg/link_prediction_trainer.py | 153 ++++++++++++++++++ 8 files changed, 327 insertions(+), 21 deletions(-) create mode 100644 docs/docfile/tutorial/t_hetero_node_clf.rst create mode 100644 test/performance/link_prediction/pyg/helper.py create mode 100644 test/performance/link_prediction/pyg/link_prediction_trainer.py diff --git a/autogl/datasets/_heterogeneous_datasets/_dgl_heterogeneous_datasets.py b/autogl/datasets/_heterogeneous_datasets/_dgl_heterogeneous_datasets.py index 133f75b..69c8d34 100644 --- a/autogl/datasets/_heterogeneous_datasets/_dgl_heterogeneous_datasets.py +++ b/autogl/datasets/_heterogeneous_datasets/_dgl_heterogeneous_datasets.py @@ -17,6 +17,7 @@ def get_binary_mask(total_size, indices): class ACMHANDataset(InMemoryStaticGraphSet): def __init__(self, path: str): data_path: str = os.path.join(path, 'raw', 'ACM.mat') + print(os.path.join(path, 'raw', 'ACM.mat')) _url: str = "https://data.dgl.ai/dataset/ACM.mat" if os.path.exists(data_path) and os.path.isfile(data_path): print(f"Using cached file {data_path}") diff --git a/autogl/module/model/encoders/_dgl/_gat.py b/autogl/module/model/encoders/_dgl/_gat.py index 58916c6..b5d8c43 100644 --- a/autogl/module/model/encoders/_dgl/_gat.py +++ b/autogl/module/model/encoders/_dgl/_gat.py @@ -52,8 +52,6 @@ class GAT(torch.nn.Module): def forward( self, graph: dgl.DGLGraph, *__args, **__kwargs ) -> _typing.Iterable[torch.Tensor]: - graph = dgl.remove_self_loop(graph) - graph = 
dgl.add_self_loop(graph) num_layers = len(self.__convolutions) x: torch.Tensor = graph.ndata['feat'] results = [x] @@ -75,13 +73,10 @@ class GATMaintainer(base_encoder.AutoHomogeneousEncoderMaintainer): r""" AutoGAT. The model used in this automodel is GAT, i.e., the graph attentional network from the `"Graph Attention Networks" `_ paper. The layer is - .. math:: \mathbf{x}^{\prime}_i = \alpha_{i,i}\mathbf{\Theta}\mathbf{x}_{i} + \sum_{j \in \mathcal{N}(i)} \alpha_{i,j}\mathbf{\Theta}\mathbf{x}_{j} - where the attention coefficients :math:`\alpha_{i,j}` are computed as - .. math:: \alpha_{i,j} = \frac{ @@ -92,18 +87,14 @@ class GATMaintainer(base_encoder.AutoHomogeneousEncoderMaintainer): \exp\left(\mathrm{LeakyReLU}\left(\mathbf{a}^{\top} [\mathbf{\Theta}\mathbf{x}_i \, \Vert \, \mathbf{\Theta}\mathbf{x}_k] \right)\right)}. - Parameters ---------- input_dimension: `Optional[int]` The dimension of input features. - final_dimension: `Optional[int]` The dimension of final features. - device: `torch.device` or `str` or `int` The device where model will be running on. - kwargs: Other parameters. 
""" @@ -194,4 +185,4 @@ class GATMaintainer(base_encoder.AutoHomogeneousEncoderMaintainer): self.hyper_parameters.get("dropout"), concat_last ).to(self.device) - return True + return True \ No newline at end of file diff --git a/autogl/module/model/encoders/_dgl/_gcn.py b/autogl/module/model/encoders/_dgl/_gcn.py index 098723f..940cb5a 100644 --- a/autogl/module/model/encoders/_dgl/_gcn.py +++ b/autogl/module/model/encoders/_dgl/_gcn.py @@ -26,8 +26,6 @@ class _GCN(torch.nn.Module): self._dropout: _typing.Optional[float] = dropout def forward(self, graph: dgl.DGLGraph, *__args, **__kwargs): - graph = dgl.remove_self_loop(graph) - graph = dgl.add_self_loop(graph) x: torch.Tensor = graph.ndata['feat'] results: _typing.MutableSequence[torch.Tensor] = [] for _layer in range(len(self.__convolution_layers)): @@ -112,4 +110,4 @@ class GCNMaintainer(base_encoder.AutoHomogeneousEncoderMaintainer): self.hyper_parameters["act"], self.hyper_parameters["dropout"] ).to(self.device) - return True + return True \ No newline at end of file diff --git a/docs/docfile/tutorial/t_hetero_node_clf.rst b/docs/docfile/tutorial/t_hetero_node_clf.rst new file mode 100644 index 0000000..7905d0f --- /dev/null +++ b/docs/docfile/tutorial/t_hetero_node_clf.rst @@ -0,0 +1,117 @@ +.. _hetero_node_clf: + +Node Classification for Heterogeneous Graph +============== + +This tutorial introduces how to use AutoGL to automate the learning of heterogeneous graphs in Deep Graph Library (DGL). + +Creating a Heterogeneous Graph +------------------- +AutoGL supports datasets created in DGL. We provide two datasets named "hetero-acm-han" and "hetero-acm-hgt" for HAN and HGT models, respectively. +The following code snippet is an example for loading a heterogeneous graph. + +.. code-block:: python + from autogl.datasets import build_dataset_from_name + dataset = build_dataset_from_name("hetero-acm-han") + +You can also access to data stored in the dataset object for more details: + +.. 
code-block:: python + g = dataset[0] + + node_type = dataset.schema["target_node_type"] + labels = g.nodes[node_type].data['label'] + num_classes = labels.max().item() + 1 + num_features=g.nodes[node_type].data['feat'].shape[1] + + train_mask = g.nodes[node_type].data['train_mask'] + val_mask = g.nodes[node_type].data['val_mask'] + test_mask = g.nodes[node_type].data['test_mask'] + +You can also build your own dataset and do feature engineering by adding files in the location AutoGL/autogl/datasets/_heterogeneous_datasets/_dgl_heterogeneous_datasets.py. We suggest users create a heterogeneous graph data object (e.g. with dgl.heterograph), referring to the official documentation of DGL. + +Building Heterogeneous GNN Modules +---------------------------------- +AutoGL integrates commonly used heterogeneous graph neural network models such as HeteroRGCN (Schlichtkrull et al., 2018), HAN (Wang et al., 2019) and HGT (Hu et al., 2020). + +.. code-block:: python + from autogl.module.model.dgl import AutoHAN + model = AutoHAN( + dataset=dataset, + num_features=num_features, + num_classes=num_classes, + device = args['device'], + init=True + ).model + +Then you can train the model for 100 epochs. +.. code-block:: python + # Define the loss function. + loss_fcn = torch.nn.CrossEntropyLoss() + # Define the loss optimizer. + optimizer = torch.optim.Adam(model.parameters(), lr=1e-2, + weight_decay=1e-2) + + # Training. + for epoch in range(100): + model.train() + logits = model(g) + loss = loss_fcn(logits[train_mask], labels[train_mask]) + + optimizer.zero_grad() + loss.backward() + optimizer.step() + + val_loss, val_acc, _, _ = evaluate(model, g, labels, val_mask, loss_fcn) + +Finally, evaluate the model. +.. code-block:: python + _, test_acc, _, _ = evaluate(model, g, labels, test_mask, loss_fcn) + +You can also define your own heterogeneous graph neural network models by adding files in the location AutoGL/autogl/module/model/dgl/hetero. 
+ +Automatic Search for Node Classification Tasks +---------------------------------------------- +On top of the modules mentioned above, we provide a high-level API Solver to control the overall pipeline. We encapsulate the training process in the Building Heterogeneous GNN Modules part in the solver AutoHeteroNodeClassifier that supports automatic hyperparameter optimization as well as feature engineering and ensemble. +In this part, we will show you how to use AutoHeteroNodeClassifier to automatically predict the publishing conference of a paper using the ACM academic graph dataset. + +Firstly, we get the pre-defined model hyperparameters. + +.. code-block:: python +from helper import get_encoder_decoder_hp +model_hp, _ = get_encoder_decoder_hp(args.model) + +You can also define your own model hyperparameters in a dict: + +.. code-block:: python +model_hp = { + "num_layers": 2, + "hidden": [256], + "heads": 4, + "dropout": 0.2, + "act": "leaky_relu", + } + +Secondly, use AutoHeteroNodeClassifier directly to build automatic heterogeneous GNN models in the following example: + +.. code-block:: python + from autogl.solver import AutoHeteroNodeClassifier + solver = AutoHeteroNodeClassifier( + graph_models=["han"], + hpo_module="random", + ensemble_module=None, + max_evals=1, + device=args.device, + trainer_hp_space=fixed( + max_epoch=100, + early_stopping_round=101, + lr=1e-3, + weight_decay=1e-2 + ), + model_hp_spaces=[fixed(**model_hp)] + ) + +Finally, fit and evaluate the model. +.. 
code-block:: python + solver.fit(dataset) + acc = solver.evaluate() \ No newline at end of file diff --git a/test/performance/link_prediction/pyg/helper.py b/test/performance/link_prediction/pyg/helper.py new file mode 100644 index 0000000..2f0acf4 --- /dev/null +++ b/test/performance/link_prediction/pyg/helper.py @@ -0,0 +1,47 @@ +def get_encoder_decoder_hp(model='gin', decoder=None): + if model == 'gin': + model_hp = { + "num_layers": 5, + "hidden": [64], + "act": "relu", + "eps": "False", + "mlp_layers": 2, + "neighbor_pooling_type": "sum" + } + elif model == 'gat': + model_hp = { + # hp from model + "num_layers": 3, + "hidden": [128,64], + "heads": 1, + "dropout": 0.0, + "act": "relu", + 'add_self_loops': 'False', + 'normalize': 'False', + } + elif model == 'gcn': + model_hp = { + "num_layers": 3, + "hidden": [128,64], + "dropout": 0.0, + "act": "relu", + 'add_self_loops': 'False', + 'normalize': 'False', + } + elif model == 'sage': + model_hp = { + "num_layers": 3, + "hidden": [128,64], + "dropout": 0.0, + "act": "relu", + "agg": "mean", + 'add_self_loops': 'False', + 'normalize': 'False', + } + elif model == 'topk': + model_hp = { + "num_layers": 5, + "hidden": [64, 64, 64, 64] + } + + return model_hp, {} diff --git a/test/performance/link_prediction/pyg/link_prediction_base.py b/test/performance/link_prediction/pyg/link_prediction_base.py index 05dd5bd..8435272 100644 --- a/test/performance/link_prediction/pyg/link_prediction_base.py +++ b/test/performance/link_prediction/pyg/link_prediction_base.py @@ -152,9 +152,9 @@ def train(): optimizer.zero_grad() z = model.encode(data) #encode - print(data) - print("trainen_shape",data.x.shape, data.train_pos_edge_index.shape) - print("trainde_shape",z.shape, data.train_pos_edge_index.shape,neg_edge_index.shape) + # print(data) + # print("trainen_shape",data.x.shape, data.train_pos_edge_index.shape) + # print("trainde_shape",z.shape, data.train_pos_edge_index.shape,neg_edge_index.shape) # trainen_shape 
torch.Size([2708, 1433]) torch.Size([2, 8976]) # trainde_shape torch.Size([2708, 64]) torch.Size([2, 8976]) torch.Size([2, 8976]) diff --git a/test/performance/link_prediction/pyg/link_prediction_model.py b/test/performance/link_prediction/pyg/link_prediction_model.py index ee5a6eb..40b7eaa 100644 --- a/test/performance/link_prediction/pyg/link_prediction_model.py +++ b/test/performance/link_prediction/pyg/link_prediction_model.py @@ -49,8 +49,8 @@ args = parser.parse_args() args.device = torch.device('cuda:0') device = torch.device('cuda:0') -# args.dataset = 'Cora' -# args.model = 'gat' +args.dataset = 'Cora' +args.model = 'gcn' print(args.dataset) print(args.model) # load the dataset @@ -66,7 +66,7 @@ elif args.dataset == 'PubMed': else: assert False -def train(): +def train(data): model.train() neg_edge_index = negative_sampling( @@ -120,7 +120,6 @@ def test(train_data): res = [] for seed in tqdm(range(1234, 1234+args.repeat)): setup_seed(seed) - g = dataset[0].to(device) data = dataset[0].to(device) # use train_test_split_edges to create neg and positive edges data.train_mask = data.val_mask = data.test_mask = data.y = None @@ -177,7 +176,7 @@ for seed in tqdm(range(1234, 1234+args.repeat)): best_val_perf = test_perf = 0 for epoch in range(100): - train_loss, train_data = train() + train_loss, train_data = train(data) val_perf, tmp_test_perf = test(train_data) if val_perf > best_val_perf: best_val_perf = val_perf diff --git a/test/performance/link_prediction/pyg/link_prediction_trainer.py b/test/performance/link_prediction/pyg/link_prediction_trainer.py new file mode 100644 index 0000000..f0e745a --- /dev/null +++ b/test/performance/link_prediction/pyg/link_prediction_trainer.py @@ -0,0 +1,153 @@ +import os +os.environ["AUTOGL_BACKEND"] = "pyg" +from tqdm import tqdm +from autogl.module.train.evaluation import Auc +import random +import torch +import numpy as np +import torch +import numpy as np +import scipy.sparse as sp +from helper import 
get_encoder_decoder_hp +import os.path as osp +import torch_geometric.transforms as T +from torch_geometric.datasets import Planetoid +from torch_geometric.data import Data +from torch_geometric.utils import train_test_split_edges +from torch_geometric.utils import negative_sampling + +def construct_negative_graph(graph, k): + src, dst = graph.edges() + + neg_src = src.repeat_interleave(k) + neg_dst = torch.randint(0, graph.num_nodes(), (len(src) * k,)) + # return dgl.graph((neg_src, neg_dst), num_nodes=graph.num_nodes()).edges() + return neg_src, neg_dst + +def negative_sample(data): + return construct_negative_graph(data, 5) + +import autogl.datasets.utils as tmp_utils +tmp_utils.negative_sampling = negative_sample + +from autogl.module.train.link_prediction_full import LinkPredictionTrainer + +def setup_seed(seed): + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + torch.backends.cudnn.deterministic = True + np.random.seed(seed) + random.seed(seed) + +def split_train_test(data): + data.train_mask = data.val_mask = data.test_mask = data.y = None + data = train_test_split_edges(data) + neg_edge_index = negative_sampling( + edge_index=data.train_pos_edge_index, #positive edges + num_nodes=data.num_nodes, # number of nodes + num_neg_samples=data.train_pos_edge_index.size(1)) # number of neg_sample equal to number of pos_edges + dataset_splitted = Data( + x=data.x,train_pos_edge_index=data.train_pos_edge_index,train_neg_edge_index=neg_edge_index, + test_pos_edge_index=data.test_pos_edge_index, + test_neg_edge_index = data.test_neg_edge_index, + val_pos_edge_index = data.val_pos_edge_index, + val_neg_edge_index = data.val_neg_edge_index + ) + return dataset_splitted + + +if __name__ == "__main__": + + + from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter + + parser = ArgumentParser( + "auto link prediction", formatter_class=ArgumentDefaultsHelpFormatter + ) + parser.add_argument( + "--dataset", + default="Cora", + type=str, + help="dataset 
to use", + choices=[ + "Cora", + "CiteSeer", + "PubMed", + ], + ) + parser.add_argument( + "--model", + default="sage", + type=str, + help="model to use", + choices=[ + "gcn", + "gat", + "sage", + "gin", + "topk" + ], + ) + parser.add_argument("--seed", type=int, default=0, help="random seed") + parser.add_argument('--repeat', type=int, default=10) + parser.add_argument("--device", default="cuda", type=str, help="GPU device") + + args = parser.parse_args() + + args.dataset = 'Cora' + args.model = 'gcn' + + path = osp.join('data', args.dataset) + if args.dataset == 'Cora': + dataset = Planetoid(path, name='Cora',transform=T.NormalizeFeatures()) + elif args.dataset == 'CiteSeer': + dataset = Planetoid(path, name='CiteSeer',transform=T.NormalizeFeatures()) + elif args.dataset == 'PubMed': + dataset = Planetoid(path, name='PubMed',transform=T.NormalizeFeatures()) + else: + assert False + + res = [] + for seed in tqdm(range(1234, 1234+args.repeat)): + # set random seed + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed(seed) + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + + data = dataset[0].to(args.device) + num_features = dataset.num_features + + model_hp, decoder_hp = get_encoder_decoder_hp(args.model) + + trainer = LinkPredictionTrainer( + model = args.model, + num_features = num_features, + lr = 1e-2, + max_epoch = 100, + early_stopping_round = 101, + weight_decay = 0.0, + device = args.device, + feval = [Auc], + loss = "binary_cross_entropy_with_logits", + init = False + ).duplicate_from_hyper_parameter( + { + "trainer": {}, + "encoder": model_hp, + "decoder": decoder_hp + }, + restricted=False + ) + + dataset_splitted = split_train_test(data.cpu()) + + trainer.train([dataset_splitted], False) + pre = trainer.evaluate([dataset_splitted], mask="test", feval=Auc) + result = pre.item() + res.append(result) + + print(np.mean(res), np.std(res)) \ No newline 
at end of file