Browse Source

Merge pull request #120 from THUMNLab/graphcl_ogb

ogb dataset examples
develop/0.4/predevelop
Generall GitHub 3 years ago
parent
commit
eb073ecc82
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
11 changed files with 628 additions and 5 deletions
  1. +1
    -1
      .github/ISSUE_TEMPLATE/bug_report.md
  2. +1
    -1
      .github/ISSUE_TEMPLATE/feature_request.md
  3. +1
    -0
      README.md
  4. +1
    -0
      README_cn.md
  5. +1
    -1
      autogl/__init__.py
  6. +15
    -1
      autogl/datasets/_ogb.py
  7. +59
    -0
      configs/nodeclf_gcn_benchmark_ogb.yml
  8. +146
    -0
      examples/glfogb.py
  9. +214
    -0
      examples/nodeclf_ogb.py
  10. +188
    -0
      examples/nodeclf_ogb_proteins.py
  11. +1
    -1
      setup.py

+ 1
- 1
.github/ISSUE_TEMPLATE/bug_report.md View File

@@ -3,7 +3,7 @@ name: Bug report
about: Create a report to help us improve
title: "[BUG]"
labels: bug
assignees: Frozenmad
assignees: general502570

---



+ 1
- 1
.github/ISSUE_TEMPLATE/feature_request.md View File

@@ -3,7 +3,7 @@ name: Feature request
about: Suggest an idea for this project
title: "[FEATURE]"
labels: enhancement
assignees: Frozenmad
assignees: general502570

---



+ 1
- 0
README.md View File

@@ -14,6 +14,7 @@ Feel free to open <a href="https://github.com/THUMNLab/AutoGL/issues">issues</a>

## News!

- 2022.4.19 New version v0.3.1! We have released the Chinese tutorial for the first time!
- 2021.12.31 New Version! v0.3.0-pre is here!
- AutoGL now support [__Deep Graph Library (DGL)__](https://www.dgl.ai/) backend to be interface-friendly for DGL users! All the homogeneous node classification task, link prediction task, and graph classification task are currently supported under DGL backend. AutoGL is also compatible with PyG 2.0 now.
- The __heterogeneous__ node classification tasks are now supported! See [hetero tutorial](http://mn.cs.tsinghua.edu.cn/autogl/documentation/docfile/tutorial/t_hetero_node_clf.html) for more details.


+ 1
- 0
README_cn.md View File

@@ -13,6 +13,7 @@

## 最新消息

- 2022.4.19 v0.3.1版本更新!首次更新中文教程!
- 2021.12.31 v0.3.0-pre版本更新!
- 智图目前支持[__Deep Graph Library (DGL)__](https://www.dgl.ai/)作为后端,以方便DGL的用户使用。目前在DGL后端已经支持同构图的节点分类、链接预测以及图分类等任务。智图现在也可兼容PyG 2.0版本。
- 智图可以支持__异构图__节点分类任务!详情请参考[异构图教程](http://mn.cs.tsinghua.edu.cn/autogl/documentation/docfile/tutorial/t_hetero_node_clf.html)。


+ 1
- 1
autogl/__init__.py View File

@@ -16,4 +16,4 @@ from .module import (
train,
)

__version__ = "0.3.0-pre"
__version__ = "0.3.1"

+ 15
- 1
autogl/datasets/_ogb.py View File

@@ -5,6 +5,8 @@ from ogb.nodeproppred import NodePropPredDataset
from ogb.linkproppred import LinkPropPredDataset
from ogb.graphproppred import GraphPropPredDataset

from torch_sparse import SparseTensor

from autogl import backend as _backend
from autogl.data import InMemoryStaticGraphSet
from autogl.data.graph import (
@@ -30,13 +32,25 @@ class _OGBNDatasetUtil(_OGBDatasetUtil):
edges_data_key_mapping: _typing.Optional[_typing.Mapping[str, str]] = ...,
graph_data_key_mapping: _typing.Optional[_typing.Mapping[str, str]] = ...
) -> GeneralStaticGraph:
# TODO
edge_index = ogbn_data['edge_index']
num_nodes = ogbn_data['num_nodes']
edge_feat = ogbn_data['edge_feat']
if edge_feat is not None:
edge_feat = torch.tensor(edge_feat)
edge_index = SparseTensor(row=torch.tensor(edge_index[0]), col=torch.tensor(edge_index[1]), value=edge_feat, sparse_sizes=(num_nodes, num_nodes))
_, _, value = edge_index.coo()
ogbn_data['edge_feat'] = value.cpu().detach().numpy()
edge_index = edge_index.to_symmetric()
row, col, _ = edge_index.coo()
edge_index = np.array([row.cpu().detach().numpy(), col.cpu().detach().numpy()])
homogeneous_static_graph: GeneralStaticGraph = (
GeneralStaticGraphGenerator.create_homogeneous_static_graph(
dict([
(target_key, torch.from_numpy(ogbn_data[source_key]))
for source_key, target_key in nodes_data_key_mapping.items()
]),
torch.from_numpy(ogbn_data['edge_index']),
torch.tensor(edge_index),
dict([
(target_key, torch.from_numpy(ogbn_data[source_key]))
for source_key, target_key in edges_data_key_mapping.items()


+ 59
- 0
configs/nodeclf_gcn_benchmark_ogb.yml View File

@@ -0,0 +1,59 @@
ensemble:
name: null
feature:
- name: NormalizeFeatures
hpo:
max_evals: 10
name: random
models:
- hp_space:
- feasiblePoints: 2,3
parameterName: num_layers
type: DISCRETE
- cutFunc: lambda x:x[0] - 1
cutPara:
- num_layers
length: 2
maxValue:
- 256
- 256
minValue:
- 256
- 256
numericalType: INTEGER
parameterName: hidden
scalingType: LOG
type: NUMERICAL_LIST
- maxValue: 0.505
minValue: 0.495
parameterName: dropout
scalingType: LINEAR
type: DOUBLE
- feasiblePoints:
- leaky_relu
- relu
parameterName: act
type: CATEGORICAL
name: gcn-model
trainer:
hp_space:
- maxValue: 500
minValue: 500
parameterName: max_epoch
scalingType: LINEAR
type: INTEGER
- maxValue: 500
minValue: 500
parameterName: early_stopping_round
scalingType: LINEAR
type: INTEGER
- maxValue: 0.0105
minValue: 0.0095
parameterName: lr
scalingType: LOG
type: DOUBLE
- maxValue: 0.0000001
minValue: 0.00000001
parameterName: weight_decay
scalingType: LOG
type: DOUBLE

+ 146
- 0
examples/glfogb.py View File

@@ -0,0 +1,146 @@
from torch_geometric.data import DataLoader
import torch.optim as optim
from tqdm import tqdm
from ogb.graphproppred import Evaluator
import random
import torch
import numpy as np
from autogl.datasets import build_dataset_from_name
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
from ogb_gnn import GNN
from autogl.backend import DependentBackend
from torch_geometric.data import Data

# Name of the active AutoGL backend ("pyg" or "dgl"); looked up once at import.
backend = DependentBackend.get_backend_name()

# ogbg-mol* tasks are (multi-)label binary classification, hence BCE-with-logits.
# The regression criterion is defined for parity but is unused in this script.
cls_criterion = torch.nn.BCEWithLogitsLoss()
reg_criterion = torch.nn.MSELoss()

def train(model, device, loader, optimizer, task_type):
    """Run one training epoch of binary-classification loss over `loader`.

    `task_type` is accepted for interface compatibility but not consulted;
    the loss is always BCE-with-logits on the labeled entries.
    """
    model.train()

    for batch in tqdm(loader, desc="Iteration"):
        batch = batch.to(device)

        # Skip degenerate batches: a single node breaks batch norm, and a
        # batch vector ending in 0 means only one graph is present.
        if batch.x.shape[0] == 1 or batch.batch[-1] == 0:
            continue

        pred = model(batch)
        optimizer.zero_grad()
        # NaN labels mark missing targets; mask them out of the loss.
        labeled_mask = batch.y == batch.y
        loss = cls_criterion(
            pred.to(torch.float32)[labeled_mask],
            batch.y.to(torch.float32)[labeled_mask],
        )
        loss.backward()
        optimizer.step()

def eval(model, device, loader, evaluator):
    """Collect predictions over `loader` and score them with the OGB evaluator.

    NOTE: shadows the `eval` builtin; the name is kept so existing callers work.
    """
    model.eval()
    true_chunks = []
    pred_chunks = []

    for batch in tqdm(loader, desc="Iteration"):
        batch = batch.to(device)

        # Single-node graphs are skipped, mirroring the training loop.
        if batch.x.shape[0] == 1:
            continue

        with torch.no_grad():
            pred = model(batch)
        true_chunks.append(batch.y.view(pred.shape).detach().cpu())
        pred_chunks.append(pred.detach().cpu())

    return evaluator.eval({
        "y_true": torch.cat(true_chunks, dim=0).numpy(),
        "y_pred": torch.cat(pred_chunks, dim=0).numpy(),
    })

def trans(dataset):
    """Convert an AutoGL static-graph split into a list of PyG `Data` objects."""
    return [
        Data(
            x=dataset[i].nodes.data['x'],
            # Labels are reshaped to (num_graphs, 1) as OGB evaluators expect.
            y=dataset[i].data['y'].view(-1, 1),
            edge_index=dataset[i].edges.connections,
            edge_attr=dataset[i].edges.data['edge_feat'],
        )
        for i in range(len(dataset))
    ]

if __name__ == "__main__":
    parser = ArgumentParser(
        "auto graph classification", formatter_class=ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        "--dataset",
        default="ogbg-molhiv",
        type=str,
        help="graph classification dataset",
        # FIX: the default value must be a member of `choices`; previously
        # "ogbg-molhiv" was missing, so running without --dataset made
        # argparse abort immediately.
        choices=[
            "mutag", "imdb-b", "imdb-m", "proteins", "collab",
            "ogbg-molbace", "ogbg-molhiv",
        ],
    )
    parser.add_argument(
        "--configs", default="../configs/graphclf_gin_benchmark.yml", help="config files"
    )
    parser.add_argument("--device", type=int, default=0, help="device to run on, -1 means cpu")
    parser.add_argument("--seed", type=int, default=0, help="random seed")

    args = parser.parse_args()

    if args.device == -1:
        args.device = "cpu"

    if torch.cuda.is_available() and args.device != "cpu":
        torch.cuda.set_device(args.device)
    # FIX: derive the device once from the CLI flag. The original placed the
    # model on `args.device` but hard-coded torch.device("cuda:0") for the
    # batches, which crashes on CPU-only machines and on any GPU other than 0.
    device = torch.device("cpu" if args.device == "cpu" else f"cuda:{args.device}")

    # Seed every RNG that affects training for reproducibility.
    seed = args.seed
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

    dataset = build_dataset_from_name(args.dataset)
    model = GNN(num_tasks=1, gnn_type='gcn').to(device)
    # NOTE(review): Evaluator only accepts OGB dataset names; the non-OGB
    # choices (mutag, imdb-*, ...) will fail here — confirm intended scope.
    evaluator = Evaluator(args.dataset)

    train_dataset = trans(dataset.train_split)
    val_dataset = trans(dataset.val_split)
    test_dataset = trans(dataset.test_split)

    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True,
                              num_workers=0)
    valid_loader = DataLoader(val_dataset, batch_size=32, shuffle=False,
                              num_workers=0)
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False,
                             num_workers=0)

    optimizer = optim.Adam(model.parameters(), lr=0.001)

    valid_curve = []
    test_curve = []
    train_curve = []
    for epoch in range(1, 100 + 1):
        print("=====Epoch {}".format(epoch))
        print('Training...')
        train(model, device, train_loader, optimizer, 'binary classification')

        print('Evaluating...')
        train_perf = eval(model, device, train_loader, evaluator)
        valid_perf = eval(model, device, valid_loader, evaluator)
        test_perf = eval(model, device, test_loader, evaluator)

        print({'Train': train_perf, 'Validation': valid_perf, 'Test': test_perf})

        train_curve.append(train_perf['rocauc'])
        valid_curve.append(valid_perf['rocauc'])
        test_curve.append(test_perf['rocauc'])

    # Model selection: report the test score at the best-validation epoch.
    best_val_epoch = np.argmax(np.array(valid_curve))
    best_train = max(train_curve)

    print('Finished training!')
    print('Best validation score: {}'.format(valid_curve[best_val_epoch]))
    print('Test score: {}'.format(test_curve[best_val_epoch]))


+ 214
- 0
examples/nodeclf_ogb.py View File

@@ -0,0 +1,214 @@
import os
import tqdm
import argparse
import numpy as np
import torch
import torch.nn.functional as F

from torch_geometric.nn import GCNConv, SAGEConv

from ogb.nodeproppred import Evaluator
from autogl.datasets import build_dataset_from_name
from autogl import backend

# AutoGL stores node data under backend-specific keys:
# DGL uses 'feat'/'label', PyG uses 'x'/'y'.
if backend.DependentBackend.is_dgl():
    feat = 'feat'
    label = 'label'
else:
    feat = 'x'
    label = 'y'

class GCN(torch.nn.Module):
    """Multi-layer GCN: (conv -> batch norm -> ReLU -> dropout) per hidden
    layer, a final conv to `out_channels`, and log-softmax output."""

    def __init__(self, in_channels, hidden_channels, out_channels, num_layers,
                 dropout):
        super(GCN, self).__init__()

        # num_layers - 1 convolutions into the hidden width (first from the
        # input width), each paired with a BatchNorm1d; one final projection.
        widths = [in_channels] + [hidden_channels] * (num_layers - 1)
        self.convs = torch.nn.ModuleList(
            GCNConv(w_in, w_out, cached=True)
            for w_in, w_out in zip(widths[:-1], widths[1:])
        )
        self.convs.append(GCNConv(hidden_channels, out_channels, cached=True))
        self.bns = torch.nn.ModuleList(
            torch.nn.BatchNorm1d(hidden_channels) for _ in range(num_layers - 1)
        )

        self.dropout = dropout

    def reset_parameters(self):
        for module in [*self.convs, *self.bns]:
            module.reset_parameters()

    def forward(self, x, adj_t):
        for conv, bn in zip(self.convs[:-1], self.bns):
            x = F.dropout(F.relu(bn(conv(x, adj_t))),
                          p=self.dropout, training=self.training)
        return self.convs[-1](x, adj_t).log_softmax(dim=-1)


class SAGE(torch.nn.Module):
    """Multi-layer GraphSAGE: (conv -> batch norm -> ReLU -> dropout) per
    hidden layer, a final conv to `out_channels`, and log-softmax output."""

    def __init__(self, in_channels, hidden_channels, out_channels, num_layers,
                 dropout):
        super(SAGE, self).__init__()

        # num_layers - 1 convolutions into the hidden width (first from the
        # input width), each paired with a BatchNorm1d; one final projection.
        widths = [in_channels] + [hidden_channels] * (num_layers - 1)
        self.convs = torch.nn.ModuleList(
            SAGEConv(w_in, w_out)
            for w_in, w_out in zip(widths[:-1], widths[1:])
        )
        self.convs.append(SAGEConv(hidden_channels, out_channels))
        self.bns = torch.nn.ModuleList(
            torch.nn.BatchNorm1d(hidden_channels) for _ in range(num_layers - 1)
        )

        self.dropout = dropout

    def reset_parameters(self):
        for module in [*self.convs, *self.bns]:
            module.reset_parameters()

    def forward(self, x, adj_t):
        for conv, bn in zip(self.convs[:-1], self.bns):
            x = F.dropout(F.relu(bn(conv(x, adj_t))),
                          p=self.dropout, training=self.training)
        return self.convs[-1](x, adj_t).log_softmax(dim=-1)


def train(model, x, y, edge_index, train_idx, optimizer):
    """One full-batch NLL training step over the training nodes.

    Returns the scalar loss value of this step.
    """
    model.train()
    optimizer.zero_grad()
    log_probs = model(x, edge_index)
    loss = F.nll_loss(log_probs[train_idx], y[train_idx])
    loss.backward()
    optimizer.step()
    return loss.item()


@torch.no_grad()
def test(model, x, y, edge_index, split_idx, evaluator):
    """Compute accuracy on the train/valid/test splits via the OGB evaluator."""
    model.eval()
    # Class prediction per node, kept 2-D as the evaluator expects.
    y_pred = model(x, edge_index).argmax(dim=-1, keepdim=True)

    def _accuracy(split):
        idx = split_idx[split]
        return evaluator.eval({
            'y_true': y[idx].view(-1, 1),
            'y_pred': y_pred[idx],
        })['acc']

    return _accuracy('train'), _accuracy('valid'), _accuracy('test')

class Node:
    """Pair (a, b) ordered lexicographically: compare `a` first, then `b`.

    NOTE(review): this class is not referenced anywhere in this script;
    it appears to be leftover helper code.
    """

    def __init__(self, a, b):
        self.a = a
        self.b = b

    def __le__(self, other):
        # FIX: the original compared only `a`, making __le__ inconsistent
        # with the lexicographic __lt__ (e.g. Node(1, 3) <= Node(1, 2) was
        # True while neither < nor == held).
        if self.a != other.a:
            return self.a < other.a
        return self.b <= other.b

    def __lt__(self, other):
        if self.a < other.a:
            return True
        elif self.a == other.a:
            return self.b < other.b
        else:
            return False

def main():
    """Train GCN/SAGE on ogbn-arxiv (loaded through AutoGL) over several runs
    and report the mean/std of the test accuracy at the best-validation epoch."""
    parser = argparse.ArgumentParser(description='OGBN-Arxiv (GNN)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--use_sage', action='store_true')
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--epochs', type=int, default=500)
    parser.add_argument('--runs', type=int, default=10)
    args = parser.parse_args()
    print(args)

    # NOTE(review): args.device is only used to build the string below when
    # CUDA is available; GPU index selection otherwise falls back to cpu.
    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)
    # print(oedge_index)

    dataset = build_dataset_from_name('ogbn_arxiv', path='./dataset/')

    data = dataset[0]
    # `feat`/`label` are the backend-dependent node-data keys selected at
    # module import time.
    x = data.nodes.data[feat].to(device)
    y = data.nodes.data[label].to(device)
    edge_index = data.edges.connections.to(device)
    # edge_index = data_transfer(edge_index, row, col)
    print(edge_index)
    # print(edge_index.shape)

    # The split masks from the AutoGL dataset are used as index tensors for
    # both training and the OGB evaluator.
    train_mask = data.nodes.data['train_mask']
    val_mask = data.nodes.data['val_mask']
    test_mask = data.nodes.data['test_mask']
    split_idx = {
        'train': train_mask,
        'valid': val_mask,
        'test': test_mask
    }

    # split_idx = dataset.get_idx_split()
    train_idx = split_idx['train'].to(device)
    labels = dataset[0].nodes.data[label]
    # Infer the class count from the labels actually present.
    num_classes = len(np.unique(labels.numpy()))

    if args.use_sage:
        model = SAGE(dataset[0].nodes.data[feat].size(1), args.hidden_channels,
                     num_classes, args.num_layers,
                     args.dropout).to(device)
    else:
        model = GCN(dataset[0].nodes.data[feat].size(1), args.hidden_channels,
                    num_classes, args.num_layers,
                    args.dropout).to(device)

    evaluator = Evaluator(name='ogbn-arxiv')

    best_accs = []
    for run in range(args.runs):
        # Fresh parameters (and optimizer state) for every independent run.
        model.reset_parameters()
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
        best_valid = 0.0
        best_test = 0.0
        for epoch in range(1, 1 + args.epochs):
            loss = train(model, x, y, edge_index, train_idx, optimizer)
            result = test(model, x, y, edge_index, split_idx, evaluator)

            if epoch % args.log_steps == 0:
                train_acc, valid_acc, test_acc = result
                print(f'Run: {run + 1:02d}, '
                      f'Epoch: {epoch:02d}, '
                      f'Loss: {loss:.4f}, '
                      f'Train: {100 * train_acc:.2f}%, '
                      f'Valid: {100 * valid_acc:.2f}% '
                      f'Test: {100 * test_acc:.2f}%')
            # Track the test accuracy at the best validation epoch.
            if valid_acc > best_valid:
                best_valid = valid_acc
                best_test = test_acc
        best_accs.append(best_test)
    print(best_accs)
    print(np.mean(best_accs))
    print(np.std(best_accs))

if __name__ == "__main__":
    main()

+ 188
- 0
examples/nodeclf_ogb_proteins.py View File

@@ -0,0 +1,188 @@
import argparse
import numpy as np
import torch
import torch.nn.functional as F

import torch_geometric.transforms as T
from torch_sparse import SparseTensor
from torch_geometric.nn import GCNConv, SAGEConv

from ogb.nodeproppred import PygNodePropPredDataset, Evaluator
from autogl import backend
from autogl.datasets import build_dataset_from_name

# AutoGL stores node labels under a backend-specific key:
# 'label' for DGL, 'y' for PyG.
if backend.DependentBackend.is_dgl():
    ylabel = 'label'
else:
    ylabel = 'y'

class GCN(torch.nn.Module):
    """Multi-layer GCN with ReLU + dropout between layers and raw logits out.

    `normalize=False` because the caller pre-computes GCN normalization on
    the adjacency.
    """

    def __init__(self, in_channels, hidden_channels, out_channels, num_layers,
                 dropout):
        super(GCN, self).__init__()

        # Layer widths: input -> hidden x (num_layers - 1) -> output.
        widths = [in_channels] + [hidden_channels] * (num_layers - 1) + [out_channels]
        self.convs = torch.nn.ModuleList(
            GCNConv(w_in, w_out, normalize=False)
            for w_in, w_out in zip(widths[:-1], widths[1:])
        )

        self.dropout = dropout

    def reset_parameters(self):
        for conv in self.convs:
            conv.reset_parameters()

    def forward(self, x, adj_t):
        for conv in self.convs[:-1]:
            x = F.dropout(F.relu(conv(x, adj_t)),
                          p=self.dropout, training=self.training)
        return self.convs[-1](x, adj_t)


class SAGE(torch.nn.Module):
    """Multi-layer GraphSAGE with ReLU + dropout between layers; raw logits out."""

    def __init__(self, in_channels, hidden_channels, out_channels, num_layers,
                 dropout):
        super(SAGE, self).__init__()

        # Layer widths: input -> hidden x (num_layers - 1) -> output.
        widths = [in_channels] + [hidden_channels] * (num_layers - 1) + [out_channels]
        self.convs = torch.nn.ModuleList(
            SAGEConv(w_in, w_out)
            for w_in, w_out in zip(widths[:-1], widths[1:])
        )

        self.dropout = dropout

    def reset_parameters(self):
        for conv in self.convs:
            conv.reset_parameters()

    def forward(self, x, adj_t):
        for conv in self.convs[:-1]:
            x = F.dropout(F.relu(conv(x, adj_t)),
                          p=self.dropout, training=self.training)
        return self.convs[-1](x, adj_t)


def train(model, x, y, edge_index, train_idx, optimizer):
    """One full-batch BCE-with-logits step over the training nodes.

    Labels are cast to float because ogbn-proteins targets are multi-label
    0/1 integers. Returns the scalar loss value.
    """
    model.train()
    loss_fn = torch.nn.BCEWithLogitsLoss()

    optimizer.zero_grad()
    logits = model(x, edge_index)
    loss = loss_fn(logits[train_idx], y[train_idx].to(torch.float))
    loss.backward()
    optimizer.step()

    return loss.item()


@torch.no_grad()
def test(model, x, y, edge_index, split_idx, evaluator):
    """Score raw model outputs on the train/valid/test splits (ROC-AUC)."""
    model.eval()

    scores = model(x, edge_index)

    def _rocauc(split):
        idx = split_idx[split]
        return evaluator.eval({
            'y_true': y[idx],
            'y_pred': scores[idx],
        })['rocauc']

    return _rocauc('train'), _rocauc('valid'), _rocauc('test')


def main():
    """Train GCN/SAGE on ogbn-proteins (loaded through AutoGL) and log
    ROC-AUC on train/valid/test every few epochs."""
    parser = argparse.ArgumentParser(description='OGBN-Proteins (GNN)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--use_sage', action='store_true')
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.0)
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--epochs', type=int, default=1000)
    parser.add_argument('--eval_steps', type=int, default=5)
    parser.add_argument('--runs', type=int, default=10)
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    autogl_dataset = build_dataset_from_name('ogbn-proteins')
    data = autogl_dataset[0]
    y = data.nodes.data[ylabel].to(device)
    num_nodes = data.nodes.data['species'].shape[0]
    # Build a sparse adjacency carrying the 8-dim edge features as values.
    edge_index = data.edges.connections
    row = edge_index[0].type(torch.long).to(device)
    col = edge_index[1].type(torch.long).to(device)
    edge_feat = data.edges.data['edge_feat'].to(device)
    edge_index = SparseTensor(row=row, col=col, value=edge_feat, sparse_sizes=(num_nodes, num_nodes))
    # ogbn-proteins has no raw node features: use the mean of each node's
    # incident edge features, then drop the values from the adjacency.
    x = edge_index.mean(dim=1).to(device)
    edge_index.set_value_(None)
    train_mask = data.nodes.data['train_mask']
    val_mask = data.nodes.data['val_mask']
    test_mask = data.nodes.data['test_mask']
    split_idx = {
        'train': train_mask,
        'valid': val_mask,
        'test': test_mask
    }
    labels = data.nodes.data[ylabel]
    # NOTE(review): num_classes is computed but unused — the output width is
    # hard-coded to 112 (the ogbn-proteins task count) below.
    num_classes = len(np.unique(labels.numpy()))
    train_idx = split_idx['train']

    if args.use_sage:
        model = SAGE(x.size(1), args.hidden_channels, 112,
                     args.num_layers, args.dropout).to(device)
    else:
        model = GCN(x.size(1), args.hidden_channels, 112,
                    args.num_layers, args.dropout).to(device)

    # Pre-compute GCN normalization: D^-1/2 (A + I) D^-1/2, since the model's
    # GCNConv layers run with normalize=False.
    adj_t = edge_index.set_diag()
    deg = adj_t.sum(dim=1).to(torch.float)
    deg_inv_sqrt = deg.pow(-0.5)
    deg_inv_sqrt[deg_inv_sqrt == float('inf')] = 0
    adj_t = deg_inv_sqrt.view(-1, 1) * adj_t * deg_inv_sqrt.view(1, -1)
    edge_index = adj_t

    evaluator = Evaluator(name='ogbn-proteins')

    for run in range(args.runs):
        model.reset_parameters()
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
        for epoch in range(1, 1 + args.epochs):
            loss = train(model, x, y, edge_index, train_idx, optimizer)

            # Evaluation is done only every eval_steps epochs (it is costly);
            # logging reuses the most recent `result`.
            if epoch % args.eval_steps == 0:
                result = test(model, x, y, edge_index, split_idx, evaluator)

                if epoch % args.log_steps == 0:
                    train_rocauc, valid_rocauc, test_rocauc = result
                    print(f'Run: {run + 1:02d}, '
                          f'Epoch: {epoch:02d}, '
                          f'Loss: {loss:.4f}, '
                          f'Train: {100 * train_rocauc:.2f}%, '
                          f'Valid: {100 * valid_rocauc:.2f}% '
                          f'Test: {100 * test_rocauc:.2f}%')


if __name__ == "__main__":
    main()

+ 1
- 1
setup.py View File

@@ -16,7 +16,7 @@ with open("README.md", 'r') as fh:
''' https://setuptools.readthedocs.io/en/latest/ '''
setup(
name='autogl',
version='0.3.0-pre',
version='0.3.1',
author='THUMNLab/aglteam',
maintainer='THUMNLab/aglteam',
author_email='autogl@tsinghua.edu.cn',


Loading…
Cancel
Save