Browse Source

tutorial: heterogeneous node classification

tags/v0.3.1
caij 4 years ago
parent
commit
22c8e9798d
8 changed files with 327 additions and 21 deletions
  1. +1
    -0
      autogl/datasets/_heterogeneous_datasets/_dgl_heterogeneous_datasets.py
  2. +1
    -10
      autogl/module/model/encoders/_dgl/_gat.py
  3. +1
    -3
      autogl/module/model/encoders/_dgl/_gcn.py
  4. +117
    -0
      docs/docfile/tutorial/t_hetero_node_clf.rst
  5. +47
    -0
      test/performance/link_prediction/pyg/helper.py
  6. +3
    -3
      test/performance/link_prediction/pyg/link_prediction_base.py
  7. +4
    -5
      test/performance/link_prediction/pyg/link_prediction_model.py
  8. +153
    -0
      test/performance/link_prediction/pyg/link_prediction_trainer.py

+ 1
- 0
autogl/datasets/_heterogeneous_datasets/_dgl_heterogeneous_datasets.py View File

@@ -17,6 +17,7 @@ def get_binary_mask(total_size, indices):
class ACMHANDataset(InMemoryStaticGraphSet):
def __init__(self, path: str):
data_path: str = os.path.join(path, 'raw', 'ACM.mat')
print(os.path.join(path, 'raw', 'ACM.mat'))
_url: str = "https://data.dgl.ai/dataset/ACM.mat"
if os.path.exists(data_path) and os.path.isfile(data_path):
print(f"Using cached file {data_path}")


+ 1
- 10
autogl/module/model/encoders/_dgl/_gat.py View File

@@ -52,8 +52,6 @@ class GAT(torch.nn.Module):
def forward(
self, graph: dgl.DGLGraph, *__args, **__kwargs
) -> _typing.Iterable[torch.Tensor]:
graph = dgl.remove_self_loop(graph)
graph = dgl.add_self_loop(graph)
num_layers = len(self.__convolutions)
x: torch.Tensor = graph.ndata['feat']
results = [x]
@@ -75,13 +73,10 @@ class GATMaintainer(base_encoder.AutoHomogeneousEncoderMaintainer):
r"""
AutoGAT. The model used in this automodel is GAT, i.e., the graph attentional network from the `"Graph Attention Networks"
<https://arxiv.org/abs/1710.10903>`_ paper. The layer is

.. math::
\mathbf{x}^{\prime}_i = \alpha_{i,i}\mathbf{\Theta}\mathbf{x}_{i} +
\sum_{j \in \mathcal{N}(i)} \alpha_{i,j}\mathbf{\Theta}\mathbf{x}_{j}

where the attention coefficients :math:`\alpha_{i,j}` are computed as

.. math::
\alpha_{i,j} =
\frac{
@@ -92,18 +87,14 @@ class GATMaintainer(base_encoder.AutoHomogeneousEncoderMaintainer):
\exp\left(\mathrm{LeakyReLU}\left(\mathbf{a}^{\top}
[\mathbf{\Theta}\mathbf{x}_i \, \Vert \, \mathbf{\Theta}\mathbf{x}_k]
\right)\right)}.

Parameters
----------
input_dimension: `Optional[int]`
The dimension of input features.

final_dimension: `Optional[int]`
The dimension of final features.

device: `torch.device` or `str` or `int`
The device where model will be running on.

kwargs:
Other parameters.
"""
@@ -194,4 +185,4 @@ class GATMaintainer(base_encoder.AutoHomogeneousEncoderMaintainer):
self.hyper_parameters.get("dropout"),
concat_last
).to(self.device)
return True
return True

+ 1
- 3
autogl/module/model/encoders/_dgl/_gcn.py View File

@@ -26,8 +26,6 @@ class _GCN(torch.nn.Module):
self._dropout: _typing.Optional[float] = dropout

def forward(self, graph: dgl.DGLGraph, *__args, **__kwargs):
graph = dgl.remove_self_loop(graph)
graph = dgl.add_self_loop(graph)
x: torch.Tensor = graph.ndata['feat']
results: _typing.MutableSequence[torch.Tensor] = []
for _layer in range(len(self.__convolution_layers)):
@@ -112,4 +110,4 @@ class GCNMaintainer(base_encoder.AutoHomogeneousEncoderMaintainer):
self.hyper_parameters["act"],
self.hyper_parameters["dropout"]
).to(self.device)
return True
return True

+ 117
- 0
docs/docfile/tutorial/t_hetero_node_clf.rst View File

@@ -0,0 +1,117 @@
.. _hetero_node_clf:

Node Classification for Heterogeneous Graph
===========================================

This tutorial introduces how to use AutoGL to automate the learning of heterogeneous graphs in Deep Graph Library (DGL).

Creating a Heterogeneous Graph
------------------------------
AutoGL supports datasets created in DGL. We provide two datasets named "hetero-acm-han" and "hetero-acm-hgt" for HAN and HGT models, respectively.
The following code snippet is an example for loading a heterogeneous graph.

.. code-block:: python
from autogl.datasets import build_dataset_from_name
dataset = build_dataset_from_name("hetero-acm-han")

You can also access to data stored in the dataset object for more details:

.. code-block:: python
g = dataset[0]

node_type = dataset.schema["target_node_type"]
labels = g.nodes[node_type].data['label']
num_classes = labels.max().item() + 1
num_features=g.nodes[node_type].data['feat'].shape[1]

train_mask = g.nodes[node_type].data['train_mask']
val_mask = g.nodes[node_type].data['val_mask']
test_mask = g.nodes[node_type].data['test_mask']

You can also build your own dataset and do feature engineering by adding files in the location AutoGL/autogl/datasets/_heterogeneous_datasets/_dgl_heterogeneous_datasets.py. We suggest users create a heterogeneous graph object with DGL (e.g. via ``dgl.heterograph``), referring to the official documentation of DGL.

Building Heterogeneous GNN Modules
----------------------------------
AutoGL integrates commonly used heterogeneous graph neural network models such as HeteroRGCN (Schlichtkrull et al., 2018), HAN (Wang et al., 2019) and HGT (Hu et al., 2020).

.. code-block:: python
from autogl.module.model.dgl import AutoHAN
model = AutoHAN(
dataset=dataset,
num_features=num_features,
num_classes=num_classes,
device = args['device'],
init=True
).model

Then you can train the model for 100 epochs.

.. code-block:: python
# Define the loss function.
loss_fcn = torch.nn.CrossEntropyLoss()
# Define the loss optimizer.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-2,
weight_decay=1e-2)
# Training.
for epoch in range(100):
model.train()
logits = model(g)
loss = loss_fcn(logits[train_mask], labels[train_mask])

optimizer.zero_grad()
loss.backward()
optimizer.step()

val_loss, val_acc, _, _ = evaluate(model, g, labels, val_mask, loss_fcn)

Finally, evaluate the model.

.. code-block:: python
_, test_acc, _, _ = evaluate(model, g, labels, test_mask, loss_fcn)

You can also define your own heterogeneous graph neural network models by adding files in the location AutoGL/autogl/module/model/dgl/hetero.

Automatic Search for Node Classification Tasks
----------------------------------------------
On top of the modules mentioned above, we provide a high-level API Solver to control the overall pipeline. We encapsulated the training process in the Building Heterogeneous GNN Modules part in the solver AutoHeteroNodeClassifier that supports automatic hyperparametric optimization as well as feature engineering and ensemble.
In this part, we will show you how to use AutoHeteroNodeClassifier to automatically predict the publishing conference of a paper using the ACM academic graph dataset.

Firstly, we get the pre-defined model hyperparameter.

.. code-block:: python
from helper import get_encoder_decoder_hp
model_hp, _ = get_encoder_decoder_hp(args.model)

You can also define your own model hyperparameters in a dict:

.. code-block:: python
model_hp = {
"num_layers": 2,
"hidden": [256],
"heads": 4,
"dropout": 0.2,
"act": "leaky_relu",
}

Secondly, use AutoHeteroNodeClassifier directly to build automatic heterogeneous GNN models as in the following example:

.. code-block:: python
from autogl.solver import AutoHeteroNodeClassifier
solver = AutoHeteroNodeClassifier(
graph_models=["han"],
hpo_module="random",
ensemble_module=None,
max_evals=1,
device=args.device,
trainer_hp_space=fixed(
max_epoch=100,
early_stopping_round=101,
lr=1e-3,
weight_decay=1e-2
),
model_hp_spaces=[fixed(**model_hp)]
)

Finally, fit and evaluate the model.

.. code-block:: python
solver.fit(dataset)
acc = solver.evaluate()

+ 47
- 0
test/performance/link_prediction/pyg/helper.py View File

@@ -0,0 +1,47 @@
def get_encoder_decoder_hp(model='gin', decoder=None):
    """Return pre-defined hyper-parameters for a link-prediction encoder.

    Parameters
    ----------
    model: str
        Encoder name; one of ``'gin'``, ``'gat'``, ``'gcn'``, ``'sage'``
        or ``'topk'``.
    decoder: optional
        Currently unused; kept so callers that pass a decoder name keep
        working unchanged.

    Returns
    -------
    tuple
        ``(model_hp, decoder_hp)`` where ``model_hp`` is a dict of
        encoder hyper-parameters and ``decoder_hp`` is an empty dict.

    Raises
    ------
    ValueError
        If ``model`` is not a supported encoder name. (The original
        code fell through every branch and crashed with
        ``UnboundLocalError`` at the return statement.)
    """
    if model == 'gin':
        model_hp = {
            "num_layers": 5,
            "hidden": [64],
            "act": "relu",
            # NOTE(review): boolean-like values are intentionally kept as
            # strings here -- presumably the hp-space parser expects them;
            # confirm before changing.
            "eps": "False",
            "mlp_layers": 2,
            "neighbor_pooling_type": "sum"
        }
    elif model == 'gat':
        model_hp = {
            # hp from model
            "num_layers": 3,
            "hidden": [128, 64],
            "heads": 1,
            "dropout": 0.0,
            "act": "relu",
            'add_self_loops': 'False',
            'normalize': 'False',
        }
    elif model == 'gcn':
        model_hp = {
            "num_layers": 3,
            "hidden": [128, 64],
            "dropout": 0.0,
            "act": "relu",
            'add_self_loops': 'False',
            'normalize': 'False',
        }
    elif model == 'sage':
        model_hp = {
            "num_layers": 3,
            "hidden": [128, 64],
            "dropout": 0.0,
            "act": "relu",
            "agg": "mean",
            'add_self_loops': 'False',
            'normalize': 'False',
        }
    elif model == 'topk':
        model_hp = {
            "num_layers": 5,
            "hidden": [64, 64, 64, 64]
        }
    else:
        # Fail loudly with a clear message instead of UnboundLocalError.
        raise ValueError(f"unsupported model: {model!r}")
    return model_hp, {}

+ 3
- 3
test/performance/link_prediction/pyg/link_prediction_base.py View File

@@ -152,9 +152,9 @@ def train():
optimizer.zero_grad()
z = model.encode(data) #encode

print(data)
print("trainen_shape",data.x.shape, data.train_pos_edge_index.shape)
print("trainde_shape",z.shape, data.train_pos_edge_index.shape,neg_edge_index.shape)
# print(data)
# print("trainen_shape",data.x.shape, data.train_pos_edge_index.shape)
# print("trainde_shape",z.shape, data.train_pos_edge_index.shape,neg_edge_index.shape)
# trainen_shape torch.Size([2708, 1433]) torch.Size([2, 8976])
# trainde_shape torch.Size([2708, 64]) torch.Size([2, 8976]) torch.Size([2, 8976])


+ 4
- 5
test/performance/link_prediction/pyg/link_prediction_model.py View File

@@ -49,8 +49,8 @@ args = parser.parse_args()
args.device = torch.device('cuda:0')
device = torch.device('cuda:0')

# args.dataset = 'Cora'
# args.model = 'gat'
args.dataset = 'Cora'
args.model = 'gcn'
print(args.dataset)
print(args.model)
# load the dataset
@@ -66,7 +66,7 @@ elif args.dataset == 'PubMed':
else:
assert False

def train():
def train(data):
model.train()

neg_edge_index = negative_sampling(
@@ -120,7 +120,6 @@ def test(train_data):
res = []
for seed in tqdm(range(1234, 1234+args.repeat)):
setup_seed(seed)
g = dataset[0].to(device)
data = dataset[0].to(device)
# use train_test_split_edges to create neg and positive edges
data.train_mask = data.val_mask = data.test_mask = data.y = None
@@ -177,7 +176,7 @@ for seed in tqdm(range(1234, 1234+args.repeat)):

best_val_perf = test_perf = 0
for epoch in range(100):
train_loss, train_data = train()
train_loss, train_data = train(data)
val_perf, tmp_test_perf = test(train_data)
if val_perf > best_val_perf:
best_val_perf = val_perf


+ 153
- 0
test/performance/link_prediction/pyg/link_prediction_trainer.py View File

@@ -0,0 +1,153 @@
import os
os.environ["AUTOGL_BACKEND"] = "pyg"
from tqdm import tqdm
from autogl.module.train.evaluation import Auc
import random
import torch
import numpy as np
import torch
import numpy as np
import scipy.sparse as sp
from helper import get_encoder_decoder_hp
import os.path as osp
import torch_geometric.transforms as T
from torch_geometric.datasets import Planetoid
from torch_geometric.data import Data
from torch_geometric.utils import train_test_split_edges
from torch_geometric.utils import negative_sampling

def construct_negative_graph(graph, k):
    """Sample ``k`` negative destinations for every edge of ``graph``.

    Each source endpoint is repeated ``k`` times, and the matching
    destinations are drawn uniformly at random over all node ids.
    Returns the ``(neg_src, neg_dst)`` tensor pair.
    """
    sources, _unused_dst = graph.edges()
    n_negatives = len(sources) * k
    repeated_sources = sources.repeat_interleave(k)
    random_destinations = torch.randint(0, graph.num_nodes(), (n_negatives,))
    # return dgl.graph((neg_src, neg_dst), num_nodes=graph.num_nodes()).edges()
    return repeated_sources, random_destinations

def negative_sample(data):
    """Negative-sampling hook: draw 5 negative edges per positive edge."""
    negatives_per_edge = 5
    return construct_negative_graph(data, negatives_per_edge)

# Monkey-patch AutoGL's dataset utilities so our negative sampler above
# replaces the library's default `negative_sampling`.
import autogl.datasets.utils as tmp_utils
tmp_utils.negative_sampling = negative_sample

# NOTE(review): imported *after* the patch -- presumably so the trainer
# module sees the patched function when it imports; confirm that the
# trainer resolves `negative_sampling` via `autogl.datasets.utils`.
from autogl.module.train.link_prediction_full import LinkPredictionTrainer

def setup_seed(seed):
    """Seed every RNG used by this script so runs are reproducible."""
    # Plain-Python and NumPy generators.
    random.seed(seed)
    np.random.seed(seed)
    # PyTorch CPU and all CUDA generators; force deterministic cuDNN kernels.
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True

def split_train_test(data):
    """Split the edges of ``data`` into train/val/test pos/neg sets.

    Clears node-level supervision fields, delegates the positive/negative
    split to PyG's ``train_test_split_edges``, samples one negative
    training edge per positive one, and packs everything into a fresh
    ``Data`` object.
    """
    # Node-level fields are irrelevant for link prediction; drop them so
    # train_test_split_edges works on a clean graph.
    data.train_mask = data.val_mask = data.test_mask = data.y = None
    split = train_test_split_edges(data)
    # Draw exactly as many negative training edges as positive ones.
    train_negatives = negative_sampling(
        edge_index=split.train_pos_edge_index,
        num_nodes=split.num_nodes,
        num_neg_samples=split.train_pos_edge_index.size(1),
    )
    return Data(
        x=split.x,
        train_pos_edge_index=split.train_pos_edge_index,
        train_neg_edge_index=train_negatives,
        val_pos_edge_index=split.val_pos_edge_index,
        val_neg_edge_index=split.val_neg_edge_index,
        test_pos_edge_index=split.test_pos_edge_index,
        test_neg_edge_index=split.test_neg_edge_index,
    )


# Benchmark entry point: repeatedly trains AutoGL's LinkPredictionTrainer
# on a Planetoid dataset and reports mean/std test AUC over the repeats.
if __name__ == "__main__":


    from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter

    parser = ArgumentParser(
        "auto link prediction", formatter_class=ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        "--dataset",
        default="Cora",
        type=str,
        help="dataset to use",
        choices=[
            "Cora",
            "CiteSeer",
            "PubMed",
        ],
    )
    parser.add_argument(
        "--model",
        default="sage",
        type=str,
        help="model to use",
        choices=[
            "gcn",
            "gat",
            "sage",
            "gin",
            "topk"
        ],
    )
    parser.add_argument("--seed", type=int, default=0, help="random seed")
    parser.add_argument('--repeat', type=int, default=10)
    parser.add_argument("--device", default="cuda", type=str, help="GPU device")

    args = parser.parse_args()

    # NOTE(review): these assignments override whatever was passed on the
    # command line -- looks like leftover debugging code; confirm intent.
    args.dataset = 'Cora'
    args.model = 'gcn'

    # Load the chosen Planetoid citation dataset with normalized features.
    path = osp.join('data', args.dataset)
    if args.dataset == 'Cora':
        dataset = Planetoid(path, name='Cora',transform=T.NormalizeFeatures())
    elif args.dataset == 'CiteSeer':
        dataset = Planetoid(path, name='CiteSeer',transform=T.NormalizeFeatures())
    elif args.dataset == 'PubMed':
        dataset = Planetoid(path, name='PubMed',transform=T.NormalizeFeatures())
    else:
        assert False

    res = []  # test metric collected for each repetition
    for seed in tqdm(range(1234, 1234+args.repeat)):
        # set random seed
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

        data = dataset[0].to(args.device)
        num_features = dataset.num_features

        # Pre-defined hyper-parameters for the chosen encoder/decoder pair.
        model_hp, decoder_hp = get_encoder_decoder_hp(args.model)

        # Build the trainer shell (init=False), then materialize it with the
        # concrete hyper-parameters via duplicate_from_hyper_parameter.
        trainer = LinkPredictionTrainer(
            model = args.model,
            num_features = num_features,
            lr = 1e-2,
            max_epoch = 100,
            early_stopping_round = 101,
            weight_decay = 0.0,
            device = args.device,
            feval = [Auc],
            loss = "binary_cross_entropy_with_logits",
            init = False
        ).duplicate_from_hyper_parameter(
            {
                "trainer": {},
                "encoder": model_hp,
                "decoder": decoder_hp
            },
            restricted=False
        )

        # Split edges on CPU, then train and evaluate on the test edges.
        dataset_splitted = split_train_test(data.cpu())
        trainer.train([dataset_splitted], False)
        pre = trainer.evaluate([dataset_splitted], mask="test", feval=Auc)
        result = pre.item()
        res.append(result)

    print(np.mean(res), np.std(res))

Loading…
Cancel
Save