Browse Source

node model

tags/v0.3.1
Beini Frozenmad 4 years ago
parent
commit
dd371e587f
5 changed files with 363 additions and 233 deletions
  1. +2
    -11
      autogl/module/model/dgl/__init__.py
  2. +0
    -85
      autogl/module/model/dgl/dataloader_gin.py
  3. +7
    -4
      autogl/module/model/dgl/gcn_dgl.py
  4. +299
    -0
      autogl/module/model/dgl/graph_saint_dgl.py
  5. +55
    -133
      test/model_nlf/nclf_dgl.py

+ 2
- 11
autogl/module/model/dgl/__init__.py View File

@@ -3,13 +3,8 @@ from .base import BaseModel
from .topkpool import AutoTopkpool

# from .graph_sage import AutoSAGE
from .graphsage import AutoSAGE
from .graph_saint import GraphSAINTAggregationModel
from .gcn import AutoGCN
from .gat import AutoGAT
from .gin import AutoGIN
from .gin_dgl import GIN
from .gcn_dgl import GCN
from .gcn_dgl import GCN,AutoGCN
from .graphsage_dgl import GraphSAGE
from .gat_dgl import GAT

@@ -18,13 +13,9 @@ __all__ = [
"register_model",
"BaseModel",
"AutoTopkpool",
"AutoSAGE",
"GraphSAINTAggregationModel",
"AutoGCN",
"AutoGAT",
"AutoGIN",
"GIN",
"GCN",
"AutoGCN",
"GraphSAGE",
"GAT"
]

+ 0
- 85
autogl/module/model/dgl/dataloader_gin.py View File

@@ -1,85 +0,0 @@
"""
PyTorch compatible dataloader
"""


import math
import numpy as np
import torch
from torch.utils.data.sampler import SubsetRandomSampler
from sklearn.model_selection import StratifiedKFold
import dgl
from dgl.dataloading import GraphDataLoader


class GINDataLoader():
def __init__(self,
dataset,
batch_size,
device,
collate_fn=None,
seed=0,
shuffle=True,
split_name='fold10',
fold_idx=0,
split_ratio=0.7):

self.shuffle = shuffle
self.seed = seed
self.kwargs = {'pin_memory': True} if 'cuda' in device.type else {}

labels = [l for _, l in dataset]

if split_name == 'fold10':
train_idx, valid_idx = self._split_fold10(
labels, fold_idx, seed, shuffle)
elif split_name == 'rand':
train_idx, valid_idx = self._split_rand(
labels, split_ratio, seed, shuffle)
else:
raise NotImplementedError()

train_sampler = SubsetRandomSampler(train_idx)
valid_sampler = SubsetRandomSampler(valid_idx)

self.train_loader = GraphDataLoader(
dataset, sampler=train_sampler,
batch_size=batch_size, collate_fn=collate_fn, **self.kwargs)
self.valid_loader = GraphDataLoader(
dataset, sampler=valid_sampler,
batch_size=batch_size, collate_fn=collate_fn, **self.kwargs)

def train_valid_loader(self):
return self.train_loader, self.valid_loader

def _split_fold10(self, labels, fold_idx=0, seed=0, shuffle=True):
''' 10 flod '''
assert 0 <= fold_idx and fold_idx < 10, print(
"fold_idx must be from 0 to 9.")

skf = StratifiedKFold(n_splits=10, shuffle=shuffle, random_state=seed)
idx_list = []
for idx in skf.split(np.zeros(len(labels)), labels): # split(x, y)
idx_list.append(idx)
train_idx, valid_idx = idx_list[fold_idx]

print(
"train_set : test_set = %d : %d",
len(train_idx), len(valid_idx))

return train_idx, valid_idx

def _split_rand(self, labels, split_ratio=0.7, seed=0, shuffle=True):
num_entries = len(labels)
indices = list(range(num_entries))
np.random.seed(seed)
np.random.shuffle(indices)
split = int(math.floor(split_ratio * num_entries))
train_idx, valid_idx = indices[:split], indices[split:]

print(
"train_set : test_set = %d : %d",
len(train_idx), len(valid_idx))

return train_idx, valid_idx


+ 7
- 4
autogl/module/model/dgl/gcn_dgl.py View File

@@ -54,15 +54,12 @@ class GCN(ClassificationSupportedSequentialModel):
else:
self._dropout: _typing.Optional[torch.nn.Dropout] = None

def forward(self, data, enable_activation: bool = True) -> torch.Tensor:
x: torch.Tensor = data.ndata['feat']
def forward(self, data, x, enable_activation: bool = True) -> torch.Tensor:
if self.add_self_loops:
data = remove_self_loop(data)
data = add_self_loop(data)

x: torch.Tensor = self._convolution.forward(data, x)
if self._activation_name is not None and enable_activation:
x: torch.Tensor = activate_func(x, self._activation_name)
@@ -224,6 +221,12 @@ class GCN(ClassificationSupportedSequentialModel):
for __edge_index in getattr(data, "edge_indexes")
]

def forward(self, data):
x = data.ndata['x']
for gcn in self.__sequential_encoding_layers:
x = gcn(data,x)
return x

def cls_encode(self, data) -> torch.Tensor:
edge_indexes_and_weights: _typing.Union[
_typing.Sequence[


+ 299
- 0
autogl/module/model/dgl/graph_saint_dgl.py View File

@@ -0,0 +1,299 @@
import torch.nn as nn
import torch.nn.functional as F
import torch as th
import dgl.function as fn
import math
import os
import time
import torch as th
import random
import numpy as np
import dgl.function as fn
import dgl
from dgl.sampling import random_walk, pack_traces

class GCNLayer(nn.Module):
def __init__(self, in_dim, out_dim, order=1, act=None,
dropout=0, batch_norm=False, aggr="concat"):
super(GCNLayer, self).__init__()
self.lins = nn.ModuleList()
self.bias = nn.ParameterList()
for _ in range(order + 1):
self.lins.append(nn.Linear(in_dim, out_dim, bias=False))
self.bias.append(nn.Parameter(th.zeros(out_dim)))

self.order = order
self.act = act
self.dropout = nn.Dropout(dropout)

self.batch_norm = batch_norm
if batch_norm:
self.offset, self.scale = nn.ParameterList(), nn.ParameterList()
for _ in range(order + 1):
self.offset.append(nn.Parameter(th.zeros(out_dim)))
self.scale.append(nn.Parameter(th.ones(out_dim)))

self.aggr = aggr
self.reset_parameters()

def reset_parameters(self):
for lin in self.lins:
nn.init.xavier_normal_(lin.weight)

def feat_trans(self, features, idx):
h = self.lins[idx](features) + self.bias[idx]

if self.act is not None:
h = self.act(h)

if self.batch_norm:
mean = h.mean(dim=1).view(h.shape[0], 1)
var = h.var(dim=1, unbiased=False).view(h.shape[0], 1) + 1e-9
h = (h - mean) * self.scale[idx] * th.rsqrt(var) + self.offset[idx]

return h

def forward(self, graph, features):
g = graph.local_var()
h_in = self.dropout(features)
h_hop = [h_in]

D_norm = g.ndata['train_D_norm'] if 'train_D_norm' in g.ndata else g.ndata['full_D_norm']
for _ in range(self.order):
g.ndata['h'] = h_hop[-1]
if 'w' not in g.edata:
g.edata['w'] = th.ones((g.num_edges(), )).to(features.device)
g.update_all(fn.u_mul_e('h', 'w', 'm'),
fn.sum('m', 'h'))
h = g.ndata.pop('h')
h = h * D_norm
h_hop.append(h)

h_part = [self.feat_trans(ft, idx) for idx, ft in enumerate(h_hop)]
if self.aggr == "mean":
h_out = h_part[0]
for i in range(len(h_part) - 1):
h_out = h_out + h_part[i + 1]
elif self.aggr == "concat":
h_out = th.cat(h_part, 1)
else:
raise NotImplementedError

return h_out


class GCNNet(nn.Module):
def __init__(self, in_dim, hid_dim, out_dim, arch="1-1-0",
act=F.relu, dropout=0, batch_norm=False, aggr="concat"):
super(GCNNet, self).__init__()
self.gcn = nn.ModuleList()

orders = list(map(int, arch.split('-')))
self.gcn.append(GCNLayer(in_dim=in_dim, out_dim=hid_dim, order=orders[0],
act=act, dropout=dropout, batch_norm=batch_norm, aggr=aggr))
pre_out = ((aggr == "concat") * orders[0] + 1) * hid_dim

for i in range(1, len(orders)-1):
self.gcn.append(GCNLayer(in_dim=pre_out, out_dim=hid_dim, order=orders[i],
act=act, dropout=dropout, batch_norm=batch_norm, aggr=aggr))
pre_out = ((aggr == "concat") * orders[i] + 1) * hid_dim

self.gcn.append(GCNLayer(in_dim=pre_out, out_dim=hid_dim, order=orders[-1],
act=act, dropout=dropout, batch_norm=batch_norm, aggr=aggr))
pre_out = ((aggr == "concat") * orders[-1] + 1) * hid_dim

self.out_layer = GCNLayer(in_dim=pre_out, out_dim=out_dim, order=0,
act=None, dropout=dropout, batch_norm=False, aggr=aggr)

def forward(self, graph):
h = graph.ndata['feat']

for layer in self.gcn:
h = layer(graph, h)

h = F.normalize(h, p=2, dim=1)
h = self.out_layer(graph, h)

return h




# The base class of sampler
# (TODO): online sampling
class SAINTSampler(object):
def __init__(self, dn, g, train_nid, node_budget, num_repeat=50):
"""
:param dn: name of dataset
:param g: full graph
:param train_nid: ids of training nodes
:param node_budget: expected number of sampled nodes
:param num_repeat: number of times of repeating sampling one node
"""
self.g = g
self.train_g: dgl.graph = g.subgraph(train_nid)
self.dn, self.num_repeat = dn, num_repeat
self.node_counter = th.zeros((self.train_g.num_nodes(),))
self.edge_counter = th.zeros((self.train_g.num_edges(),))
self.prob = None

graph_fn, norm_fn = self.__generate_fn__()

if os.path.exists(graph_fn):
self.subgraphs = np.load(graph_fn, allow_pickle=True)
aggr_norm, loss_norm = np.load(norm_fn, allow_pickle=True)
else:
os.makedirs('./subgraphs/', exist_ok=True)

self.subgraphs = []
self.N, sampled_nodes = 0, 0

t = time.perf_counter()
while sampled_nodes <= self.train_g.num_nodes() * num_repeat:
subgraph = self.__sample__()
self.subgraphs.append(subgraph)
sampled_nodes += subgraph.shape[0]
self.N += 1
print(f'Sampling time: [{time.perf_counter() - t:.2f}s]')
np.save(graph_fn, self.subgraphs)

t = time.perf_counter()
self.__counter__()
aggr_norm, loss_norm = self.__compute_norm__()
print(f'Normalization time: [{time.perf_counter() - t:.2f}s]')
np.save(norm_fn, (aggr_norm, loss_norm))

self.train_g.ndata['l_n'] = th.Tensor(loss_norm)
self.train_g.edata['w'] = th.Tensor(aggr_norm)
self.__compute_degree_norm()

self.num_batch = math.ceil(self.train_g.num_nodes() / node_budget)
random.shuffle(self.subgraphs)
self.__clear__()
print("The number of subgraphs is: ", len(self.subgraphs))
print("The size of subgraphs is about: ", len(self.subgraphs[-1]))

def __clear__(self):
self.prob = None
self.node_counter = None
self.edge_counter = None
self.g = None

def __counter__(self):

for sampled_nodes in self.subgraphs:
sampled_nodes = th.from_numpy(sampled_nodes)
self.node_counter[sampled_nodes] += 1

subg = self.train_g.subgraph(sampled_nodes)
sampled_edges = subg.edata[dgl.EID]
self.edge_counter[sampled_edges] += 1

def __generate_fn__(self):
raise NotImplementedError

def __compute_norm__(self):
self.node_counter[self.node_counter == 0] = 1
self.edge_counter[self.edge_counter == 0] = 1

loss_norm = self.N / self.node_counter / self.train_g.num_nodes()

self.train_g.ndata['n_c'] = self.node_counter
self.train_g.edata['e_c'] = self.edge_counter
self.train_g.apply_edges(fn.v_div_e('n_c', 'e_c', 'a_n'))
aggr_norm = self.train_g.edata.pop('a_n')

self.train_g.ndata.pop('n_c')
self.train_g.edata.pop('e_c')

return aggr_norm.numpy(), loss_norm.numpy()

def __compute_degree_norm(self):

self.train_g.ndata['train_D_norm'] = 1. / self.train_g.in_degrees().float().clamp(min=1).unsqueeze(1)
self.g.ndata['full_D_norm'] = 1. / self.g.in_degrees().float().clamp(min=1).unsqueeze(1)

def __sample__(self):
raise NotImplementedError

def __len__(self):
return self.num_batch

def __iter__(self):
self.n = 0
return self

def __next__(self):
if self.n < self.num_batch:
result = self.train_g.subgraph(self.subgraphs[self.n])
self.n += 1
return result
else:
random.shuffle(self.subgraphs)
raise StopIteration()


class SAINTNodeSampler(SAINTSampler):
def __init__(self, node_budget, dn, g, train_nid, num_repeat=50):
self.node_budget = node_budget
super(SAINTNodeSampler, self).__init__(dn, g, train_nid, node_budget, num_repeat)

def __generate_fn__(self):
graph_fn = os.path.join('./subgraphs/{}_Node_{}_{}.npy'.format(self.dn, self.node_budget,
self.num_repeat))
norm_fn = os.path.join('./subgraphs/{}_Node_{}_{}_norm.npy'.format(self.dn, self.node_budget,
self.num_repeat))
return graph_fn, norm_fn

def __sample__(self):
if self.prob is None:
self.prob = self.train_g.in_degrees().float().clamp(min=1)

sampled_nodes = th.multinomial(self.prob, num_samples=self.node_budget, replacement=True).unique()
return sampled_nodes.numpy()


class SAINTEdgeSampler(SAINTSampler):
def __init__(self, edge_budget, dn, g, train_nid, num_repeat=50):
self.edge_budget = edge_budget
super(SAINTEdgeSampler, self).__init__(dn, g, train_nid, edge_budget * 2, num_repeat)

def __generate_fn__(self):
graph_fn = os.path.join('./subgraphs/{}_Edge_{}_{}.npy'.format(self.dn, self.edge_budget,
self.num_repeat))
norm_fn = os.path.join('./subgraphs/{}_Edge_{}_{}_norm.npy'.format(self.dn, self.edge_budget,
self.num_repeat))
return graph_fn, norm_fn

def __sample__(self):
if self.prob is None:
src, dst = self.train_g.edges()
src_degrees, dst_degrees = self.train_g.in_degrees(src).float().clamp(min=1),\
self.train_g.in_degrees(dst).float().clamp(min=1)
self.prob = 1. / src_degrees + 1. / dst_degrees

sampled_edges = th.multinomial(self.prob, num_samples=self.edge_budget, replacement=True).unique()

sampled_src, sampled_dst = self.train_g.find_edges(sampled_edges)
sampled_nodes = th.cat([sampled_src, sampled_dst]).unique()
return sampled_nodes.numpy()


class SAINTRandomWalkSampler(SAINTSampler):
def __init__(self, num_roots, length, dn, g, train_nid, num_repeat=50):
self.num_roots, self.length = num_roots, length
super(SAINTRandomWalkSampler, self).__init__(dn, g, train_nid, num_roots * length, num_repeat)

def __generate_fn__(self):
graph_fn = os.path.join('./subgraphs/{}_RW_{}_{}_{}.npy'.format(self.dn, self.num_roots,
self.length, self.num_repeat))
norm_fn = os.path.join('./subgraphs/{}_RW_{}_{}_{}_norm.npy'.format(self.dn, self.num_roots,
self.length, self.num_repeat))
return graph_fn, norm_fn

def __sample__(self):
sampled_roots = th.randint(0, self.train_g.num_nodes(), (self.num_roots, ))
traces, types = random_walk(self.train_g, nodes=sampled_roots, length=self.length)
sampled_nodes, _, _, _ = pack_traces(traces, types)
sampled_nodes = sampled_nodes.unique()
return sampled_nodes.numpy()

+ 55
- 133
test/model_nlf/nclf_dgl.py View File

@@ -3,6 +3,7 @@ import sys
import logging
logging.basicConfig(level=logging.INFO)
from tqdm import tqdm
import time

sys.path.append("../../")
print(os.getcwd())
@@ -16,9 +17,7 @@ import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from autogl.module.model.ginparser import Parser
from autogl.module.model.dataloader_gin import GINDataLoader
from autogl.module.model import GIN
from autogl.module.model import GCN

from pdb import set_trace
import numpy as np
@@ -26,151 +25,74 @@ from autogl.solver.utils import set_seed
set_seed(202106)


def train(args, net, trainloader, optimizer, criterion, epoch):
net.train()
def evaluate(model, graph, labels, mask):
model.eval()
with torch.no_grad():
logits = model(graph)
logits = logits[mask]
labels = labels[mask]
_, indices = torch.max(logits, dim=1)
correct = torch.sum(indices == labels)
return correct.item() * 1.0 / len(labels)

running_loss = 0
total_iters = len(trainloader)
# setup the offset to avoid the overlap with mouse cursor
bar = tqdm(range(total_iters), unit='batch', position=2, file=sys.stdout)

for pos, (graphs, labels) in zip(bar, trainloader):
# batch graphs will be shipped to device in forward part of model
labels = labels.to(args.device)
graphs = graphs.to(args.device)
feat = graphs.ndata.pop('attr')
outputs = net(graphs, feat)

loss = criterion(outputs, labels)
running_loss += loss.item()

# backprop
optimizer.zero_grad()
loss.backward()
optimizer.step()

# report
bar.set_description('epoch-{}'.format(epoch))
bar.close()
# the final batch will be aligned
running_loss = running_loss / total_iters

return running_loss


def eval_net(args, net, dataloader, criterion):
net.eval()

total = 0
total_loss = 0
total_correct = 0

for data in dataloader:
graphs, labels = data
graphs = graphs.to(args.device)
labels = labels.to(args.device)
feat = graphs.ndata.pop('attr')
total += len(labels)
outputs = net(graphs, feat)
_, predicted = torch.max(outputs.data, 1)

total_correct += (predicted == labels.data).sum().item()
loss = criterion(outputs, labels)
# crossentropy(reduce=True) for default
total_loss += loss.item() * len(labels)

loss, acc = 1.0*total_loss / total, 1.0*total_correct / total

net.train()

return loss, acc


def main(args):
def main():

# set up seeds, args.seed supported
torch.manual_seed(seed=args.seed)
np.random.seed(seed=args.seed)
torch.manual_seed(seed=202106)
np.random.seed(seed=202106)

is_cuda = not args.disable_cuda and torch.cuda.is_available()
is_cuda = torch.cuda.is_available()

if is_cuda:
args.device = torch.device("cuda:" + str(args.device))
torch.cuda.manual_seed_all(seed=args.seed)
device = torch.device("cuda")
torch.cuda.manual_seed_all(seed=202106)
else:
args.device = torch.device("cpu")
device = torch.device("cpu")

dataset = GINDataset(args.dataset, not args.learn_eps)
dataset = CoraGraphDataset()
data = dataset[0]
data.ndata['x'] = data.ndata['feat']
train_mask = data.ndata['train_mask']
val_mask = data.ndata['val_mask']
test_mask = data.ndata['test_mask']
labels = data.ndata['label']
n_edges = data.number_of_edges()

trainloader, validloader = GINDataLoader(
dataset, batch_size=args.batch_size, device=args.device,
seed=args.seed, shuffle=True,
split_name='fold10', fold_idx=args.fold_idx).train_valid_loader()
# or split_name='rand', split_ratio=0.7

model = GIN(
args.num_layers, args.num_mlp_layers,
dataset.dim_nfeats, args.hidden_dim, dataset.gclasses,
args.final_dropout, args.learn_eps,
args.graph_pooling_type, args.neighbor_pooling_type).to(args.device)
model = GCN(data.ndata['x'].size(1), dataset.num_classes, [16], activation_name='relu',
dropout = 0.5).to(device)

criterion = nn.CrossEntropyLoss() # defaul reduce is true
optimizer = optim.Adam(model.parameters(), lr=args.lr)
optimizer = optim.Adam(model.parameters(), lr=0.01)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=50, gamma=0.5)

# it's not cost-effective to hanle the cursor and init 0
# https://stackoverflow.com/a/23121189
tbar = tqdm(range(args.epochs), unit="epoch", position=3, ncols=0, file=sys.stdout)
vbar = tqdm(range(args.epochs), unit="epoch", position=4, ncols=0, file=sys.stdout)
lrbar = tqdm(range(args.epochs), unit="epoch", position=5, ncols=0, file=sys.stdout)

for epoch, _, _ in zip(tbar, vbar, lrbar):

train(args, model, trainloader, optimizer, criterion, epoch)
scheduler.step()

train_loss, train_acc = eval_net(
args, model, trainloader, criterion)
tbar.set_description(
'train set - average loss: {:.4f}, accuracy: {:.0f}%'
.format(train_loss, 100. * train_acc))

valid_loss, valid_acc = eval_net(
args, model, validloader, criterion)
vbar.set_description(
'valid set - average loss: {:.4f}, accuracy: {:.0f}%'
.format(valid_loss, 100. * valid_acc))

if not args.filename == "":
with open(args.filename, 'a') as f:
f.write('%s %s %s %s' % (
args.dataset,
args.learn_eps,
args.neighbor_pooling_type,
args.graph_pooling_type
))
f.write("\n")
f.write("%f %f %f %f" % (
train_loss,
train_acc,
valid_loss,
valid_acc
))
f.write("\n")

lrbar.set_description(
"Learning eps with learn_eps={}: {}".format(
args.learn_eps, [layer.eps.data.item() for layer in model.ginlayers]))

tbar.close()
vbar.close()
lrbar.close()
dur = []
for epoch in range(200):
model.train()
if epoch >= 3:
t0 = time.time()
# forward
logits = model(data)
loss = criterion(logits[train_mask], labels[train_mask])

optimizer.zero_grad()
loss.backward()
optimizer.step()

if __name__ == '__main__':
args = Parser(description='GIN').args
print('show all arguments configuration...')
print(args)
if epoch >= 3:
dur.append(time.time() - t0)

acc = evaluate(model, data, labels, val_mask)
print("Epoch {:05d} | Time(s) {:.4f} | Loss {:.4f} | Accuracy {:.4f} | "
"ETputs(KTEPS) {:.2f}". format(epoch, np.mean(dur), loss.item(),
acc, n_edges / np.mean(dur) / 1000))

main(args)
print()
acc = evaluate(model, data, labels, test_mask)
print("Test accuracy {:.2%}".format(acc))


if __name__ == '__main__':
main()


Loading…
Cancel
Save