diff --git a/autogl/module/model/dgl/__init__.py b/autogl/module/model/dgl/__init__.py index c9f4a8a..6a245eb 100644 --- a/autogl/module/model/dgl/__init__.py +++ b/autogl/module/model/dgl/__init__.py @@ -3,13 +3,8 @@ from .base import BaseModel from .topkpool import AutoTopkpool # from .graph_sage import AutoSAGE -from .graphsage import AutoSAGE from .graph_saint import GraphSAINTAggregationModel -from .gcn import AutoGCN -from .gat import AutoGAT -from .gin import AutoGIN -from .gin_dgl import GIN -from .gcn_dgl import GCN +from .gcn_dgl import GCN,AutoGCN from .graphsage_dgl import GraphSAGE from .gat_dgl import GAT @@ -18,13 +13,9 @@ __all__ = [ "register_model", "BaseModel", "AutoTopkpool", - "AutoSAGE", "GraphSAINTAggregationModel", - "AutoGCN", - "AutoGAT", - "AutoGIN", - "GIN", "GCN", + "AutoGCN", "GraphSAGE", "GAT" ] diff --git a/autogl/module/model/dgl/dataloader_gin.py b/autogl/module/model/dgl/dataloader_gin.py deleted file mode 100644 index 0721b3a..0000000 --- a/autogl/module/model/dgl/dataloader_gin.py +++ /dev/null @@ -1,85 +0,0 @@ -""" -PyTorch compatible dataloader -""" - - -import math -import numpy as np -import torch -from torch.utils.data.sampler import SubsetRandomSampler -from sklearn.model_selection import StratifiedKFold -import dgl -from dgl.dataloading import GraphDataLoader - - -class GINDataLoader(): - def __init__(self, - dataset, - batch_size, - device, - collate_fn=None, - seed=0, - shuffle=True, - split_name='fold10', - fold_idx=0, - split_ratio=0.7): - - self.shuffle = shuffle - self.seed = seed - self.kwargs = {'pin_memory': True} if 'cuda' in device.type else {} - - labels = [l for _, l in dataset] - - if split_name == 'fold10': - train_idx, valid_idx = self._split_fold10( - labels, fold_idx, seed, shuffle) - elif split_name == 'rand': - train_idx, valid_idx = self._split_rand( - labels, split_ratio, seed, shuffle) - else: - raise NotImplementedError() - - train_sampler = SubsetRandomSampler(train_idx) - valid_sampler = SubsetRandomSampler(valid_idx) - - self.train_loader = GraphDataLoader( - dataset, sampler=train_sampler, - batch_size=batch_size, collate_fn=collate_fn, **self.kwargs) - self.valid_loader = GraphDataLoader( - dataset, sampler=valid_sampler, - batch_size=batch_size, collate_fn=collate_fn, **self.kwargs) - - def train_valid_loader(self): - return self.train_loader, self.valid_loader - - def _split_fold10(self, labels, fold_idx=0, seed=0, shuffle=True): - ''' 10 flod ''' - assert 0 <= fold_idx and fold_idx < 10, print( - "fold_idx must be from 0 to 9.") - - skf = StratifiedKFold(n_splits=10, shuffle=shuffle, random_state=seed) - idx_list = [] - for idx in skf.split(np.zeros(len(labels)), labels): # split(x, y) - idx_list.append(idx) - train_idx, valid_idx = idx_list[fold_idx] - - print( - "train_set : test_set = %d : %d", - len(train_idx), len(valid_idx)) - - return train_idx, valid_idx - - def _split_rand(self, labels, split_ratio=0.7, seed=0, shuffle=True): - num_entries = len(labels) - indices = list(range(num_entries)) - np.random.seed(seed) - np.random.shuffle(indices) - split = int(math.floor(split_ratio * num_entries)) - train_idx, valid_idx = indices[:split], indices[split:] - - print( - "train_set : test_set = %d : %d", - len(train_idx), len(valid_idx)) - - return train_idx, valid_idx - diff --git a/autogl/module/model/dgl/gcn_dgl.py b/autogl/module/model/dgl/gcn_dgl.py index 4f24621..2ff57d0 100644 --- a/autogl/module/model/dgl/gcn_dgl.py +++ b/autogl/module/model/dgl/gcn_dgl.py @@ -54,15 +54,12 @@ class GCN(ClassificationSupportedSequentialModel): else: self._dropout: _typing.Optional[torch.nn.Dropout] = None - def forward(self, data, enable_activation: bool = True) -> torch.Tensor: - - x: torch.Tensor = data.ndata['feat'] + def forward(self, data, x, enable_activation: bool = True) -> torch.Tensor: if self.add_self_loops: data = remove_self_loop(data) data = add_self_loop(data) - x: torch.Tensor = self._convolution.forward(data, x) if self._activation_name is not None and enable_activation: x: torch.Tensor = activate_func(x, self._activation_name) @@ -224,6 +221,12 @@ class GCN(ClassificationSupportedSequentialModel): for __edge_index in getattr(data, "edge_indexes") ] + def forward(self, data): + x = data.ndata['x'] + for gcn in self.__sequential_encoding_layers: + x = gcn(data,x) + return x + def cls_encode(self, data) -> torch.Tensor: edge_indexes_and_weights: _typing.Union[ _typing.Sequence[ diff --git a/autogl/module/model/dgl/graph_saint_dgl.py b/autogl/module/model/dgl/graph_saint_dgl.py new file mode 100644 index 0000000..eef4205 --- /dev/null +++ b/autogl/module/model/dgl/graph_saint_dgl.py @@ -0,0 +1,299 @@ +import torch.nn as nn +import torch.nn.functional as F +import torch as th +import dgl.function as fn +import math +import os +import time +import torch as th +import random +import numpy as np +import dgl.function as fn +import dgl +from dgl.sampling import random_walk, pack_traces + +class GCNLayer(nn.Module): + def __init__(self, in_dim, out_dim, order=1, act=None, + dropout=0, batch_norm=False, aggr="concat"): + super(GCNLayer, self).__init__() + self.lins = nn.ModuleList() + self.bias = nn.ParameterList() + for _ in range(order + 1): + self.lins.append(nn.Linear(in_dim, out_dim, bias=False)) + self.bias.append(nn.Parameter(th.zeros(out_dim))) + + self.order = order + self.act = act + self.dropout = nn.Dropout(dropout) + + self.batch_norm = batch_norm + if batch_norm: + self.offset, self.scale = nn.ParameterList(), nn.ParameterList() + for _ in range(order + 1): + self.offset.append(nn.Parameter(th.zeros(out_dim))) + self.scale.append(nn.Parameter(th.ones(out_dim))) + + self.aggr = aggr + self.reset_parameters() + + def reset_parameters(self): + for lin in self.lins: + nn.init.xavier_normal_(lin.weight) + + def feat_trans(self, features, idx): + h = self.lins[idx](features) + self.bias[idx] + + if self.act is not None: + h = self.act(h) + + if self.batch_norm: + mean = h.mean(dim=1).view(h.shape[0], 1) + var = h.var(dim=1, unbiased=False).view(h.shape[0], 1) + 1e-9 + h = (h - mean) * self.scale[idx] * th.rsqrt(var) + self.offset[idx] + + return h + + def forward(self, graph, features): + g = graph.local_var() + h_in = self.dropout(features) + h_hop = [h_in] + + D_norm = g.ndata['train_D_norm'] if 'train_D_norm' in g.ndata else g.ndata['full_D_norm'] + for _ in range(self.order): + g.ndata['h'] = h_hop[-1] + if 'w' not in g.edata: + g.edata['w'] = th.ones((g.num_edges(), )).to(features.device) + g.update_all(fn.u_mul_e('h', 'w', 'm'), + fn.sum('m', 'h')) + h = g.ndata.pop('h') + h = h * D_norm + h_hop.append(h) + + h_part = [self.feat_trans(ft, idx) for idx, ft in enumerate(h_hop)] + if self.aggr == "mean": + h_out = h_part[0] + for i in range(len(h_part) - 1): + h_out = h_out + h_part[i + 1] + elif self.aggr == "concat": + h_out = th.cat(h_part, 1) + else: + raise NotImplementedError + + return h_out + + +class GCNNet(nn.Module): + def __init__(self, in_dim, hid_dim, out_dim, arch="1-1-0", + act=F.relu, dropout=0, batch_norm=False, aggr="concat"): + super(GCNNet, self).__init__() + self.gcn = nn.ModuleList() + + orders = list(map(int, arch.split('-'))) + self.gcn.append(GCNLayer(in_dim=in_dim, out_dim=hid_dim, order=orders[0], + act=act, dropout=dropout, batch_norm=batch_norm, aggr=aggr)) + pre_out = ((aggr == "concat") * orders[0] + 1) * hid_dim + + for i in range(1, len(orders)-1): + self.gcn.append(GCNLayer(in_dim=pre_out, out_dim=hid_dim, order=orders[i], + act=act, dropout=dropout, batch_norm=batch_norm, aggr=aggr)) + pre_out = ((aggr == "concat") * orders[i] + 1) * hid_dim + + self.gcn.append(GCNLayer(in_dim=pre_out, out_dim=hid_dim, order=orders[-1], + act=act, dropout=dropout, batch_norm=batch_norm, aggr=aggr)) + pre_out = ((aggr == "concat") * orders[-1] + 1) * hid_dim + + self.out_layer = GCNLayer(in_dim=pre_out, out_dim=out_dim, order=0, + act=None, dropout=dropout, batch_norm=False, aggr=aggr) + + def forward(self, graph): + h = graph.ndata['feat'] + + for layer in self.gcn: + h = layer(graph, h) + + h = F.normalize(h, p=2, dim=1) + h = self.out_layer(graph, h) + + return h + + + + +# The base class of sampler +# (TODO): online sampling +class SAINTSampler(object): + def __init__(self, dn, g, train_nid, node_budget, num_repeat=50): + """ + :param dn: name of dataset + :param g: full graph + :param train_nid: ids of training nodes + :param node_budget: expected number of sampled nodes + :param num_repeat: number of times of repeating sampling one node + """ + self.g = g + self.train_g: dgl.graph = g.subgraph(train_nid) + self.dn, self.num_repeat = dn, num_repeat + self.node_counter = th.zeros((self.train_g.num_nodes(),)) + self.edge_counter = th.zeros((self.train_g.num_edges(),)) + self.prob = None + + graph_fn, norm_fn = self.__generate_fn__() + + if os.path.exists(graph_fn): + self.subgraphs = np.load(graph_fn, allow_pickle=True) + aggr_norm, loss_norm = np.load(norm_fn, allow_pickle=True) + else: + os.makedirs('./subgraphs/', exist_ok=True) + + self.subgraphs = [] + self.N, sampled_nodes = 0, 0 + + t = time.perf_counter() + while sampled_nodes <= self.train_g.num_nodes() * num_repeat: + subgraph = self.__sample__() + self.subgraphs.append(subgraph) + sampled_nodes += subgraph.shape[0] + self.N += 1 + print(f'Sampling time: [{time.perf_counter() - t:.2f}s]') + np.save(graph_fn, self.subgraphs) + + t = time.perf_counter() + self.__counter__() + aggr_norm, loss_norm = self.__compute_norm__() + print(f'Normalization time: [{time.perf_counter() - t:.2f}s]') + np.save(norm_fn, (aggr_norm, loss_norm)) + + self.train_g.ndata['l_n'] = th.Tensor(loss_norm) + self.train_g.edata['w'] = th.Tensor(aggr_norm) + self.__compute_degree_norm() + + self.num_batch = math.ceil(self.train_g.num_nodes() / node_budget) + random.shuffle(self.subgraphs) + self.__clear__() + print("The number of subgraphs is: ", len(self.subgraphs)) + print("The size of subgraphs is about: ", len(self.subgraphs[-1])) + + def __clear__(self): + self.prob = None + self.node_counter = None + self.edge_counter = None + self.g = None + + def __counter__(self): + + for sampled_nodes in self.subgraphs: + sampled_nodes = th.from_numpy(sampled_nodes) + self.node_counter[sampled_nodes] += 1 + + subg = self.train_g.subgraph(sampled_nodes) + sampled_edges = subg.edata[dgl.EID] + self.edge_counter[sampled_edges] += 1 + + def __generate_fn__(self): + raise NotImplementedError + + def __compute_norm__(self): + self.node_counter[self.node_counter == 0] = 1 + self.edge_counter[self.edge_counter == 0] = 1 + + loss_norm = self.N / self.node_counter / self.train_g.num_nodes() + + self.train_g.ndata['n_c'] = self.node_counter + self.train_g.edata['e_c'] = self.edge_counter + self.train_g.apply_edges(fn.v_div_e('n_c', 'e_c', 'a_n')) + aggr_norm = self.train_g.edata.pop('a_n') + + self.train_g.ndata.pop('n_c') + self.train_g.edata.pop('e_c') + + return aggr_norm.numpy(), loss_norm.numpy() + + def __compute_degree_norm(self): + + self.train_g.ndata['train_D_norm'] = 1. / self.train_g.in_degrees().float().clamp(min=1).unsqueeze(1) + self.g.ndata['full_D_norm'] = 1. / self.g.in_degrees().float().clamp(min=1).unsqueeze(1) + + def __sample__(self): + raise NotImplementedError + + def __len__(self): + return self.num_batch + + def __iter__(self): + self.n = 0 + return self + + def __next__(self): + if self.n < self.num_batch: + result = self.train_g.subgraph(self.subgraphs[self.n]) + self.n += 1 + return result + else: + random.shuffle(self.subgraphs) + raise StopIteration() + + +class SAINTNodeSampler(SAINTSampler): + def __init__(self, node_budget, dn, g, train_nid, num_repeat=50): + self.node_budget = node_budget + super(SAINTNodeSampler, self).__init__(dn, g, train_nid, node_budget, num_repeat) + + def __generate_fn__(self): + graph_fn = os.path.join('./subgraphs/{}_Node_{}_{}.npy'.format(self.dn, self.node_budget, + self.num_repeat)) + norm_fn = os.path.join('./subgraphs/{}_Node_{}_{}_norm.npy'.format(self.dn, self.node_budget, + self.num_repeat)) + return graph_fn, norm_fn + + def __sample__(self): + if self.prob is None: + self.prob = self.train_g.in_degrees().float().clamp(min=1) + + sampled_nodes = th.multinomial(self.prob, num_samples=self.node_budget, replacement=True).unique() + return sampled_nodes.numpy() + + +class SAINTEdgeSampler(SAINTSampler): + def __init__(self, edge_budget, dn, g, train_nid, num_repeat=50): + self.edge_budget = edge_budget + super(SAINTEdgeSampler, self).__init__(dn, g, train_nid, edge_budget * 2, num_repeat) + + def __generate_fn__(self): + graph_fn = os.path.join('./subgraphs/{}_Edge_{}_{}.npy'.format(self.dn, self.edge_budget, + self.num_repeat)) + norm_fn = os.path.join('./subgraphs/{}_Edge_{}_{}_norm.npy'.format(self.dn, self.edge_budget, + self.num_repeat)) + return graph_fn, norm_fn + + def __sample__(self): + if self.prob is None: + src, dst = self.train_g.edges() + src_degrees, dst_degrees = self.train_g.in_degrees(src).float().clamp(min=1),\ + self.train_g.in_degrees(dst).float().clamp(min=1) + self.prob = 1. / src_degrees + 1. / dst_degrees + + sampled_edges = th.multinomial(self.prob, num_samples=self.edge_budget, replacement=True).unique() + + sampled_src, sampled_dst = self.train_g.find_edges(sampled_edges) + sampled_nodes = th.cat([sampled_src, sampled_dst]).unique() + return sampled_nodes.numpy() + + +class SAINTRandomWalkSampler(SAINTSampler): + def __init__(self, num_roots, length, dn, g, train_nid, num_repeat=50): + self.num_roots, self.length = num_roots, length + super(SAINTRandomWalkSampler, self).__init__(dn, g, train_nid, num_roots * length, num_repeat) + + def __generate_fn__(self): + graph_fn = os.path.join('./subgraphs/{}_RW_{}_{}_{}.npy'.format(self.dn, self.num_roots, + self.length, self.num_repeat)) + norm_fn = os.path.join('./subgraphs/{}_RW_{}_{}_{}_norm.npy'.format(self.dn, self.num_roots, + self.length, self.num_repeat)) + return graph_fn, norm_fn + + def __sample__(self): + sampled_roots = th.randint(0, self.train_g.num_nodes(), (self.num_roots, )) + traces, types = random_walk(self.train_g, nodes=sampled_roots, length=self.length) + sampled_nodes, _, _, _ = pack_traces(traces, types) + sampled_nodes = sampled_nodes.unique() + return sampled_nodes.numpy() diff --git a/test/model_nlf/nclf_dgl.py b/test/model_nlf/nclf_dgl.py index d9fab66..d1bdcb6 100644 --- a/test/model_nlf/nclf_dgl.py +++ b/test/model_nlf/nclf_dgl.py @@ -3,6 +3,7 @@ import sys import logging logging.basicConfig(level=logging.INFO) from tqdm import tqdm +import time sys.path.append("../../") print(os.getcwd()) @@ -16,9 +17,7 @@ import torch.nn as nn import torch.nn.functional as F import torch.optim as optim -from autogl.module.model.ginparser import Parser -from autogl.module.model.dataloader_gin import GINDataLoader -from autogl.module.model import GIN +from autogl.module.model import GCN from pdb import set_trace import numpy as np @@ -26,151 +25,74 @@ from autogl.solver.utils import set_seed set_seed(202106) -def train(args, net, trainloader, optimizer, criterion, epoch): - net.train() +def evaluate(model, graph, labels, mask): + model.eval() + with torch.no_grad(): + logits = model(graph) + logits = logits[mask] + labels = labels[mask] + _, indices = torch.max(logits, dim=1) + correct = torch.sum(indices == labels) + return correct.item() * 1.0 / len(labels) - running_loss = 0 - total_iters = len(trainloader) - # setup the offset to avoid the overlap with mouse cursor - bar = tqdm(range(total_iters), unit='batch', position=2, file=sys.stdout) - for pos, (graphs, labels) in zip(bar, trainloader): - # batch graphs will be shipped to device in forward part of model - labels = labels.to(args.device) - graphs = graphs.to(args.device) - feat = graphs.ndata.pop('attr') - outputs = net(graphs, feat) - - loss = criterion(outputs, labels) - running_loss += loss.item() - - # backprop - optimizer.zero_grad() - loss.backward() - optimizer.step() - - # report - bar.set_description('epoch-{}'.format(epoch)) - bar.close() - # the final batch will be aligned - running_loss = running_loss / total_iters - - return running_loss - - -def eval_net(args, net, dataloader, criterion): - net.eval() - - total = 0 - total_loss = 0 - total_correct = 0 - - for data in dataloader: - graphs, labels = data - graphs = graphs.to(args.device) - labels = labels.to(args.device) - feat = graphs.ndata.pop('attr') - total += len(labels) - outputs = net(graphs, feat) - _, predicted = torch.max(outputs.data, 1) - - total_correct += (predicted == labels.data).sum().item() - loss = criterion(outputs, labels) - # crossentropy(reduce=True) for default - total_loss += loss.item() * len(labels) - - loss, acc = 1.0*total_loss / total, 1.0*total_correct / total - - net.train() - - return loss, acc - - -def main(args): +def main(): # set up seeds, args.seed supported - torch.manual_seed(seed=args.seed) - np.random.seed(seed=args.seed) + torch.manual_seed(seed=202106) + np.random.seed(seed=202106) - is_cuda = not args.disable_cuda and torch.cuda.is_available() + is_cuda = torch.cuda.is_available() if is_cuda: - args.device = torch.device("cuda:" + str(args.device)) - torch.cuda.manual_seed_all(seed=args.seed) + device = torch.device("cuda") + torch.cuda.manual_seed_all(seed=202106) else: - args.device = torch.device("cpu") + device = torch.device("cpu") - dataset = GINDataset(args.dataset, not args.learn_eps) + dataset = CoraGraphDataset() + data = dataset[0] + data.ndata['x'] = data.ndata['feat'] + train_mask = data.ndata['train_mask'] + val_mask = data.ndata['val_mask'] + test_mask = data.ndata['test_mask'] + labels = data.ndata['label'] + n_edges = data.number_of_edges() - trainloader, validloader = GINDataLoader( - dataset, batch_size=args.batch_size, device=args.device, - seed=args.seed, shuffle=True, - split_name='fold10', fold_idx=args.fold_idx).train_valid_loader() - # or split_name='rand', split_ratio=0.7 - - model = GIN( - args.num_layers, args.num_mlp_layers, - dataset.dim_nfeats, args.hidden_dim, dataset.gclasses, - args.final_dropout, args.learn_eps, - args.graph_pooling_type, args.neighbor_pooling_type).to(args.device) + model = GCN(data.ndata['x'].size(1), dataset.num_classes, [16], activation_name='relu', + dropout = 0.5).to(device) criterion = nn.CrossEntropyLoss() # defaul reduce is true - optimizer = optim.Adam(model.parameters(), lr=args.lr) + optimizer = optim.Adam(model.parameters(), lr=0.01) scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=50, gamma=0.5) - # it's not cost-effective to hanle the cursor and init 0 - # https://stackoverflow.com/a/23121189 - tbar = tqdm(range(args.epochs), unit="epoch", position=3, ncols=0, file=sys.stdout) - vbar = tqdm(range(args.epochs), unit="epoch", position=4, ncols=0, file=sys.stdout) - lrbar = tqdm(range(args.epochs), unit="epoch", position=5, ncols=0, file=sys.stdout) - - for epoch, _, _ in zip(tbar, vbar, lrbar): - - train(args, model, trainloader, optimizer, criterion, epoch) - scheduler.step() - - train_loss, train_acc = eval_net( - args, model, trainloader, criterion) - tbar.set_description( - 'train set - average loss: {:.4f}, accuracy: {:.0f}%' - .format(train_loss, 100. * train_acc)) - - valid_loss, valid_acc = eval_net( - args, model, validloader, criterion) - vbar.set_description( - 'valid set - average loss: {:.4f}, accuracy: {:.0f}%' - .format(valid_loss, 100. * valid_acc)) - - if not args.filename == "": - with open(args.filename, 'a') as f: - f.write('%s %s %s %s' % ( - args.dataset, - args.learn_eps, - args.neighbor_pooling_type, - args.graph_pooling_type - )) - f.write("\n") - f.write("%f %f %f %f" % ( - train_loss, - train_acc, - valid_loss, - valid_acc - )) - f.write("\n") - - lrbar.set_description( - "Learning eps with learn_eps={}: {}".format( - args.learn_eps, [layer.eps.data.item() for layer in model.ginlayers])) - - tbar.close() - vbar.close() - lrbar.close() + dur = [] + for epoch in range(200): + model.train() + if epoch >= 3: + t0 = time.time() + # forward + logits = model(data) + loss = criterion(logits[train_mask], labels[train_mask]) + optimizer.zero_grad() + loss.backward() + optimizer.step() -if __name__ == '__main__': - args = Parser(description='GIN').args - print('show all arguments configuration...') - print(args) + if epoch >= 3: + dur.append(time.time() - t0) + + acc = evaluate(model, data, labels, val_mask) + print("Epoch {:05d} | Time(s) {:.4f} | Loss {:.4f} | Accuracy {:.4f} | " + "ETputs(KTEPS) {:.2f}". format(epoch, np.mean(dur), loss.item(), + acc, n_edges / np.mean(dur) / 1000)) - main(args) + print() + acc = evaluate(model, data, labels, test_mask) + print("Test accuracy {:.2%}".format(acc)) + + +if __name__ == '__main__': + + main()