Browse Source

tutorial: heterogeneous node classification

tags/v0.3.1
caij 4 years ago
parent
commit
22c8e9798d
8 changed files with 327 additions and 21 deletions
  1. +1
    -0
      autogl/datasets/_heterogeneous_datasets/_dgl_heterogeneous_datasets.py
  2. +1
    -10
      autogl/module/model/encoders/_dgl/_gat.py
  3. +1
    -3
      autogl/module/model/encoders/_dgl/_gcn.py
  4. +117
    -0
      docs/docfile/tutorial/t_hetero_node_clf.rst
  5. +47
    -0
      test/performance/link_prediction/pyg/helper.py
  6. +3
    -3
      test/performance/link_prediction/pyg/link_prediction_base.py
  7. +4
    -5
      test/performance/link_prediction/pyg/link_prediction_model.py
  8. +153
    -0
      test/performance/link_prediction/pyg/link_prediction_trainer.py

+ 1
- 0
autogl/datasets/_heterogeneous_datasets/_dgl_heterogeneous_datasets.py View File

@@ -17,6 +17,7 @@ def get_binary_mask(total_size, indices):
class ACMHANDataset(InMemoryStaticGraphSet):
def __init__(self, path: str):
data_path: str = os.path.join(path, 'raw', 'ACM.mat')
print(os.path.join(path, 'raw', 'ACM.mat'))
_url: str = "https://data.dgl.ai/dataset/ACM.mat"
if os.path.exists(data_path) and os.path.isfile(data_path):
print(f"Using cached file {data_path}")


+ 1
- 10
autogl/module/model/encoders/_dgl/_gat.py View File

@@ -52,8 +52,6 @@ class GAT(torch.nn.Module):
def forward(
self, graph: dgl.DGLGraph, *__args, **__kwargs
) -> _typing.Iterable[torch.Tensor]:
graph = dgl.remove_self_loop(graph)
graph = dgl.add_self_loop(graph)
num_layers = len(self.__convolutions)
x: torch.Tensor = graph.ndata['feat']
results = [x]
@@ -75,13 +73,10 @@ class GATMaintainer(base_encoder.AutoHomogeneousEncoderMaintainer):
r"""
AutoGAT. The model used in this automodel is GAT, i.e., the graph attentional network from the `"Graph Attention Networks"
<https://arxiv.org/abs/1710.10903>`_ paper. The layer is

.. math::
\mathbf{x}^{\prime}_i = \alpha_{i,i}\mathbf{\Theta}\mathbf{x}_{i} +
\sum_{j \in \mathcal{N}(i)} \alpha_{i,j}\mathbf{\Theta}\mathbf{x}_{j}

where the attention coefficients :math:`\alpha_{i,j}` are computed as

.. math::
\alpha_{i,j} =
\frac{
@@ -92,18 +87,14 @@ class GATMaintainer(base_encoder.AutoHomogeneousEncoderMaintainer):
\exp\left(\mathrm{LeakyReLU}\left(\mathbf{a}^{\top}
[\mathbf{\Theta}\mathbf{x}_i \, \Vert \, \mathbf{\Theta}\mathbf{x}_k]
\right)\right)}.

Parameters
----------
input_dimension: `Optional[int]`
The dimension of input features.

final_dimension: `Optional[int]`
The dimension of final features.

device: `torch.device` or `str` or `int`
The device where model will be running on.

kwargs:
Other parameters.
"""
@@ -194,4 +185,4 @@ class GATMaintainer(base_encoder.AutoHomogeneousEncoderMaintainer):
self.hyper_parameters.get("dropout"),
concat_last
).to(self.device)
return True
return True

+ 1
- 3
autogl/module/model/encoders/_dgl/_gcn.py View File

@@ -26,8 +26,6 @@ class _GCN(torch.nn.Module):
self._dropout: _typing.Optional[float] = dropout

def forward(self, graph: dgl.DGLGraph, *__args, **__kwargs):
graph = dgl.remove_self_loop(graph)
graph = dgl.add_self_loop(graph)
x: torch.Tensor = graph.ndata['feat']
results: _typing.MutableSequence[torch.Tensor] = []
for _layer in range(len(self.__convolution_layers)):
@@ -112,4 +110,4 @@ class GCNMaintainer(base_encoder.AutoHomogeneousEncoderMaintainer):
self.hyper_parameters["act"],
self.hyper_parameters["dropout"]
).to(self.device)
return True
return True

+ 117
- 0
docs/docfile/tutorial/t_hetero_node_clf.rst View File

@@ -0,0 +1,117 @@
.. _hetero_node_clf:

Node Classification for Heterogeneous Graph
===========================================

This tutorial introduces how to use AutoGL to automate the learning of heterogeneous graphs in Deep Graph Library (DGL).

Creating a Heterogeneous Graph
------------------------------
AutoGL supports datasets created in DGL. We provide two datasets named "hetero-acm-han" and "hetero-acm-hgt" for HAN and HGT models, respectively.
The following code snippet is an example for loading a heterogeneous graph.

.. code-block:: python
from autogl.datasets import build_dataset_from_name
dataset = build_dataset_from_name("hetero-acm-han")

You can also access to data stored in the dataset object for more details:

.. code-block:: python
g = dataset[0]

node_type = dataset.schema["target_node_type"]
labels = g.nodes[node_type].data['label']
num_classes = labels.max().item() + 1
num_features=g.nodes[node_type].data['feat'].shape[1]

train_mask = g.nodes[node_type].data['train_mask']
val_mask = g.nodes[node_type].data['val_mask']
test_mask = g.nodes[node_type].data['test_mask']

You can also build your own dataset and do feature engineering by adding files in the location AutoGL/autogl/datasets/_heterogeneous_datasets/_dgl_heterogeneous_datasets.py. We suggest users create a heterogeneous graph object with DGL (e.g. via ``dgl.heterograph``), referring to the official documentation of DGL.

Building Heterogeneous GNN Modules
----------------------------------
AutoGL integrates commonly used heterogeneous graph neural network models such as HeteroRGCN (Schlichtkrull et al., 2018), HAN (Wang et al., 2019) and HGT (Hu et al., 2020).

.. code-block:: python
from autogl.module.model.dgl import AutoHAN
model = AutoHAN(
dataset=dataset,
num_features=num_features,
num_classes=num_classes,
device = args['device'],
init=True
).model

Then you can train the model for 100 epochs.

.. code-block:: python
# Define the loss function.
loss_fcn = torch.nn.CrossEntropyLoss()
# Define the loss optimizer.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-2,
weight_decay=1e-2)
# Training.
for epoch in range(100):
model.train()
logits = model(g)
loss = loss_fcn(logits[train_mask], labels[train_mask])

optimizer.zero_grad()
loss.backward()
optimizer.step()

val_loss, val_acc, _, _ = evaluate(model, g, labels, val_mask, loss_fcn)

Finally, evaluate the model.

.. code-block:: python
_, test_acc, _, _ = evaluate(model, g, labels, test_mask, loss_fcn)

You can also define your own heterogeneous graph neural network models by adding files in the location AutoGL/autogl/module/model/dgl/hetero.

Automatic Search for Node Classification Tasks
----------------------------------------------
On top of the modules mentioned above, we provide a high-level API Solver to control the overall pipeline. We encapsulated the training process in the Building Heterogeneous GNN Modules part in the solver AutoHeteroNodeClassifier that supports automatic hyperparametric optimization as well as feature engineering and ensemble.
In this part, we will show you how to use AutoHeteroNodeClassifier to automatically predict the publishing conference of a paper using the ACM academic graph dataset.

Firstly, we get the pre-defined model hyperparameter.

.. code-block:: python
from helper import get_encoder_decoder_hp
model_hp, _ = get_encoder_decoder_hp(args.model)

You can also define your own model hyperparameters in a dict:

.. code-block:: python
model_hp = {
"num_layers": 2,
"hidden": [256],
"heads": 4,
"dropout": 0.2,
"act": "leaky_relu",
}

Secondly, use AutoHeteroNodeClassifier directly to build automatic heterogeneous GNN models as in the following example:

.. code-block:: python
from autogl.solver import AutoHeteroNodeClassifier
solver = AutoHeteroNodeClassifier(
graph_models=["han"],
hpo_module="random",
ensemble_module=None,
max_evals=1,
device=args.device,
trainer_hp_space=fixed(
max_epoch=100,
early_stopping_round=101,
lr=1e-3,
weight_decay=1e-2
),
model_hp_spaces=[fixed(**model_hp)]
)

Finally, fit and evaluate the model.

.. code-block:: python
solver.fit(dataset)
acc = solver.evaluate()

+ 47
- 0
test/performance/link_prediction/pyg/helper.py View File

@@ -0,0 +1,47 @@
def get_encoder_decoder_hp(model='gin', decoder=None):
    """Return pre-defined hyper-parameters for a link-prediction encoder.

    Parameters
    ----------
    model: str
        Encoder name; one of ``'gin'``, ``'gat'``, ``'gcn'``, ``'sage'``
        or ``'topk'``.
    decoder: optional
        Currently unused; kept so callers that pass a decoder name keep
        working unchanged.

    Returns
    -------
    tuple
        ``(model_hp, decoder_hp)`` where ``model_hp`` is a dict of
        encoder hyper-parameters and ``decoder_hp`` is an empty dict.

    Raises
    ------
    ValueError
        If ``model`` is not a supported encoder name. (The original
        code fell through every branch and crashed with
        ``UnboundLocalError`` at the return statement.)
    """
    if model == 'gin':
        model_hp = {
            "num_layers": 5,
            "hidden": [64],
            "act": "relu",
            # NOTE(review): boolean-like values are intentionally kept as
            # strings here -- presumably the hp-space parser expects them;
            # confirm before changing.
            "eps": "False",
            "mlp_layers": 2,
            "neighbor_pooling_type": "sum"
        }
    elif model == 'gat':
        model_hp = {
            # hp from model
            "num_layers": 3,
            "hidden": [128, 64],
            "heads": 1,
            "dropout": 0.0,
            "act": "relu",
            'add_self_loops': 'False',
            'normalize': 'False',
        }
    elif model == 'gcn':
        model_hp = {
            "num_layers": 3,
            "hidden": [128, 64],
            "dropout": 0.0,
            "act": "relu",
            'add_self_loops': 'False',
            'normalize': 'False',
        }
    elif model == 'sage':
        model_hp = {
            "num_layers": 3,
            "hidden": [128, 64],
            "dropout": 0.0,
            "act": "relu",
            "agg": "mean",
            'add_self_loops': 'False',
            'normalize': 'False',
        }
    elif model == 'topk':
        model_hp = {
            "num_layers": 5,
            "hidden": [64, 64, 64, 64]
        }
    else:
        # Fail loudly with a clear message instead of UnboundLocalError.
        raise ValueError(f"unsupported model: {model!r}")
    return model_hp, {}

+ 3
- 3
test/performance/link_prediction/pyg/link_prediction_base.py View File

@@ -152,9 +152,9 @@ def train():
optimizer.zero_grad()
z = model.encode(data) #encode

print(data)
print("trainen_shape",data.x.shape, data.train_pos_edge_index.shape)
print("trainde_shape",z.shape, data.train_pos_edge_index.shape,neg_edge_index.shape)
# print(data)
# print("trainen_shape",data.x.shape, data.train_pos_edge_index.shape)
# print("trainde_shape",z.shape, data.train_pos_edge_index.shape,neg_edge_index.shape)
# trainen_shape torch.Size([2708, 1433]) torch.Size([2, 8976])
# trainde_shape torch.Size([2708, 64]) torch.Size([2, 8976]) torch.Size([2, 8976])


+ 4
- 5
test/performance/link_prediction/pyg/link_prediction_model.py View File

@@ -49,8 +49,8 @@ args = parser.parse_args()
args.device = torch.device('cuda:0')
device = torch.device('cuda:0')

# args.dataset = 'Cora'
# args.model = 'gat'
args.dataset = 'Cora'
args.model = 'gcn'
print(args.dataset)
print(args.model)
# load the dataset
@@ -66,7 +66,7 @@ elif args.dataset == 'PubMed':
else:
assert False

def train():
def train(data):
model.train()

neg_edge_index = negative_sampling(
@@ -120,7 +120,6 @@ def test(train_data):
res = []
for seed in tqdm(range(1234, 1234+args.repeat)):
setup_seed(seed)
g = dataset[0].to(device)
data = dataset[0].to(device)
# use train_test_split_edges to create neg and positive edges
data.train_mask = data.val_mask = data.test_mask = data.y = None
@@ -177,7 +176,7 @@ for seed in tqdm(range(1234, 1234+args.repeat)):

best_val_perf = test_perf = 0
for epoch in range(100):
train_loss, train_data = train()
train_loss, train_data = train(data)
val_perf, tmp_test_perf = test(train_data)
if val_perf > best_val_perf:
best_val_perf = val_perf


+ 153
- 0
test/performance/link_prediction/pyg/link_prediction_trainer.py View File

@@ -0,0 +1,153 @@
import os
os.environ["AUTOGL_BACKEND"] = "pyg"
from tqdm import tqdm
from autogl.module.train.evaluation import Auc
import random
import torch
import numpy as np
import torch
import numpy as np
import scipy.sparse as sp
from helper import get_encoder_decoder_hp
import os.path as osp
import torch_geometric.transforms as T
from torch_geometric.datasets import Planetoid
from torch_geometric.data import Data
from torch_geometric.utils import train_test_split_edges
from torch_geometric.utils import negative_sampling

def construct_negative_graph(graph, k):
    """Sample ``k`` negative destinations for every edge of ``graph``.

    Each source endpoint is repeated ``k`` times, and the matching
    destinations are drawn uniformly at random over all node ids.
    Returns the ``(neg_src, neg_dst)`` tensor pair.
    """
    sources, _unused_dst = graph.edges()
    n_negatives = len(sources) * k
    repeated_sources = sources.repeat_interleave(k)
    random_destinations = torch.randint(0, graph.num_nodes(), (n_negatives,))
    # return dgl.graph((neg_src, neg_dst), num_nodes=graph.num_nodes()).edges()
    return repeated_sources, random_destinations

def negative_sample(data):
    """Negative-sampling hook: draw 5 negative edges per positive edge."""
    negatives_per_edge = 5
    return construct_negative_graph(data, negatives_per_edge)

# Monkey-patch AutoGL's dataset utilities so our negative sampler above
# replaces the library's default `negative_sampling`.
import autogl.datasets.utils as tmp_utils
tmp_utils.negative_sampling = negative_sample

# NOTE(review): imported *after* the patch -- presumably so the trainer
# module sees the patched function when it imports; confirm that the
# trainer resolves `negative_sampling` via `autogl.datasets.utils`.
from autogl.module.train.link_prediction_full import LinkPredictionTrainer

def setup_seed(seed):
    """Seed every RNG used by this script so runs are reproducible."""
    # Plain-Python and NumPy generators.
    random.seed(seed)
    np.random.seed(seed)
    # PyTorch CPU and all CUDA generators; force deterministic cuDNN kernels.
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True

def split_train_test(data):
    """Split the edges of ``data`` into train/val/test pos/neg sets.

    Clears node-level supervision fields, delegates the positive/negative
    split to PyG's ``train_test_split_edges``, samples one negative
    training edge per positive one, and packs everything into a fresh
    ``Data`` object.
    """
    # Node-level fields are irrelevant for link prediction; drop them so
    # train_test_split_edges works on a clean graph.
    data.train_mask = data.val_mask = data.test_mask = data.y = None
    split = train_test_split_edges(data)
    # Draw exactly as many negative training edges as positive ones.
    train_negatives = negative_sampling(
        edge_index=split.train_pos_edge_index,
        num_nodes=split.num_nodes,
        num_neg_samples=split.train_pos_edge_index.size(1),
    )
    return Data(
        x=split.x,
        train_pos_edge_index=split.train_pos_edge_index,
        train_neg_edge_index=train_negatives,
        val_pos_edge_index=split.val_pos_edge_index,
        val_neg_edge_index=split.val_neg_edge_index,
        test_pos_edge_index=split.test_pos_edge_index,
        test_neg_edge_index=split.test_neg_edge_index,
    )


# Benchmark entry point: repeatedly trains AutoGL's LinkPredictionTrainer
# on a Planetoid dataset and reports mean/std test AUC over the repeats.
if __name__ == "__main__":


    from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter

    parser = ArgumentParser(
        "auto link prediction", formatter_class=ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        "--dataset",
        default="Cora",
        type=str,
        help="dataset to use",
        choices=[
            "Cora",
            "CiteSeer",
            "PubMed",
        ],
    )
    parser.add_argument(
        "--model",
        default="sage",
        type=str,
        help="model to use",
        choices=[
            "gcn",
            "gat",
            "sage",
            "gin",
            "topk"
        ],
    )
    parser.add_argument("--seed", type=int, default=0, help="random seed")
    parser.add_argument('--repeat', type=int, default=10)
    parser.add_argument("--device", default="cuda", type=str, help="GPU device")

    args = parser.parse_args()

    # NOTE(review): these assignments override whatever was passed on the
    # command line -- looks like leftover debugging code; confirm intent.
    args.dataset = 'Cora'
    args.model = 'gcn'

    # Load the chosen Planetoid citation dataset with normalized features.
    path = osp.join('data', args.dataset)
    if args.dataset == 'Cora':
        dataset = Planetoid(path, name='Cora',transform=T.NormalizeFeatures())
    elif args.dataset == 'CiteSeer':
        dataset = Planetoid(path, name='CiteSeer',transform=T.NormalizeFeatures())
    elif args.dataset == 'PubMed':
        dataset = Planetoid(path, name='PubMed',transform=T.NormalizeFeatures())
    else:
        assert False

    res = []  # test metric collected for each repetition
    for seed in tqdm(range(1234, 1234+args.repeat)):
        # set random seed
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

        data = dataset[0].to(args.device)
        num_features = dataset.num_features

        # Pre-defined hyper-parameters for the chosen encoder/decoder pair.
        model_hp, decoder_hp = get_encoder_decoder_hp(args.model)

        # Build the trainer shell (init=False), then materialize it with the
        # concrete hyper-parameters via duplicate_from_hyper_parameter.
        trainer = LinkPredictionTrainer(
            model = args.model,
            num_features = num_features,
            lr = 1e-2,
            max_epoch = 100,
            early_stopping_round = 101,
            weight_decay = 0.0,
            device = args.device,
            feval = [Auc],
            loss = "binary_cross_entropy_with_logits",
            init = False
        ).duplicate_from_hyper_parameter(
            {
                "trainer": {},
                "encoder": model_hp,
                "decoder": decoder_hp
            },
            restricted=False
        )

        # Split edges on CPU, then train and evaluate on the test edges.
        dataset_splitted = split_train_test(data.cpu())
        trainer.train([dataset_splitted], False)
        pre = trainer.evaluate([dataset_splitted], mask="test", feval=Auc)
        result = pre.item()
        res.append(result)

    print(np.mean(res), np.std(res))

Loading…
Cancel
Save