| @@ -51,7 +51,7 @@ class GCN(torch.nn.Module): | |||
| for i in range(self.num_layer - 2): | |||
| self.convs.append( | |||
| GraphConv( | |||
| self.args["hidden"][0], | |||
| self.args["hidden"][i], | |||
| self.args["hidden"][i + 1] | |||
| ) | |||
| ) | |||
| @@ -81,25 +81,16 @@ class GCN(torch.nn.Module): | |||
| def cls_decode(self, x: torch.Tensor) -> torch.Tensor: | |||
| return torch.nn.functional.log_softmax(x, dim=1) | |||
| # def lp_encode(self, data): | |||
| # x: torch.Tensor = data.ndata['feat'] | |||
| # for i in range(len(self.convs) - 2): | |||
| # x = self.convs[i]( | |||
| # autogl.data.Data(x, data.edges()) | |||
| # ) | |||
| # x = self.__sequential_encoding_layers[-2]( | |||
| # autogl.data.Data(x, data.edges()), enable_activation=False | |||
| # ) | |||
| # return x | |||
| def lp_encode(self, data): | |||
| # discard the last layer, only use the layer before | |||
| x = data.ndata['feat'] | |||
| for i in range(len(self.convs)): | |||
| if i!=0: | |||
| for i in range(len(self.convs) - 1): | |||
| if i != 0: | |||
| x = F.dropout(x, p=self.args["dropout"], training=self.training) | |||
| x = self.convs[i](data, x) | |||
| if i != self.num_layer - 1: | |||
| if i != len(self.convs) - 2: | |||
| x = activate_func(x, self.args["act"]) | |||
| return x | |||
| @@ -260,7 +260,7 @@ class LinkPredictionTrainer(BaseLinkPredictionTrainer): | |||
| else: | |||
| scheduler = None | |||
| for epoch in range(1, self.max_epoch): | |||
| for epoch in range(1, self.max_epoch + 1): | |||
| model.train() | |||
| optimizer.zero_grad() | |||
| @@ -1,17 +1,17 @@ | |||
| """ | |||
| Baseline that uses early stopping | |||
| """ | |||
| import pickle | |||
| import dgl | |||
| import torch | |||
| import torch.nn as nn | |||
| import torch.nn.functional as F | |||
| import itertools | |||
| import numpy as np | |||
| import scipy.sparse as sp | |||
| import dgl.function as fn | |||
| import random | |||
| from dgl.data import CoraGraphDataset, PubmedGraphDataset, CiteseerGraphDataset | |||
| # from autogl.module.train.link_prediction_full import LinkPredictionTrainer | |||
| import sys | |||
| sys.path.insert(0, "../") | |||
| from autogl.module.model.dgl.graphsage import GraphSAGE | |||
| import dgl.data | |||
| from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter | |||
| @@ -31,10 +31,13 @@ parser.add_argument("--model", default="sage", type=str,help="model to use", cho | |||
| parser.add_argument("--seed", type=int, default=0, help="random seed") | |||
| parser.add_argument('--repeat', type=int, default=10) | |||
| parser.add_argument("--device", default=0, type=int, help="GPU device") | |||
| args = parser.parse_args() | |||
| args.device = torch.device('cuda:0') | |||
| device = torch.device('cuda:0') | |||
| if args.device >= 0: | |||
| device = torch.device(f"cuda:{args.device}") | |||
| else: | |||
| device = torch.device("cpu") | |||
| if args.dataset == 'Cora': | |||
| dataset = CoraGraphDataset() | |||
| @@ -95,15 +98,19 @@ class GAT(nn.Module): | |||
| return h | |||
| def split_train_test(g): | |||
| def split_train_valid_test(g): | |||
| u, v = g.edges() | |||
| eids = np.arange(g.number_of_edges()) | |||
| eids = np.random.permutation(eids) | |||
| valid_size = int(len(eids) * 0.1) | |||
| test_size = int(len(eids) * 0.1) | |||
| train_size = g.number_of_edges() - test_size | |||
| train_size = g.number_of_edges() - test_size - valid_size | |||
| test_pos_u, test_pos_v = u[eids[:test_size]], v[eids[:test_size]] | |||
| train_pos_u, train_pos_v = u[eids[test_size:]], v[eids[test_size:]] | |||
| valid_pos_u, valid_pos_v = u[eids[test_size:test_size+valid_size]], v[eids[test_size:test_size+valid_size]] | |||
| train_pos_u, train_pos_v = u[eids[test_size+valid_size:]], v[eids[test_size+valid_size:]] | |||
| # Find all negative edges and split them for training and testing | |||
| adj = sp.coo_matrix((np.ones(len(u)), (u.numpy(), v.numpy()))) | |||
| @@ -112,17 +119,22 @@ def split_train_test(g): | |||
| neg_eids = np.random.choice(len(neg_u), g.number_of_edges()) | |||
| test_neg_u, test_neg_v = neg_u[neg_eids[:test_size]], neg_v[neg_eids[:test_size]] | |||
| train_neg_u, train_neg_v = neg_u[neg_eids[test_size:]], neg_v[neg_eids[test_size:]] | |||
| valid_neg_u, valid_neg_v = neg_u[neg_eids[test_size:test_size+valid_size]], neg_v[neg_eids[test_size:test_size+valid_size]] | |||
| train_neg_u, train_neg_v = neg_u[neg_eids[test_size+valid_size:]], neg_v[neg_eids[test_size+valid_size:]] | |||
| train_g = dgl.remove_edges(g, eids[:test_size]) | |||
| train_g = dgl.remove_edges(g, eids[:test_size+valid_size]) | |||
| train_pos_g = dgl.graph((train_pos_u, train_pos_v), num_nodes=g.number_of_nodes()) | |||
| train_neg_g = dgl.graph((train_neg_u, train_neg_v), num_nodes=g.number_of_nodes()) | |||
| valid_pos_g = dgl.graph((valid_pos_u, valid_pos_v), num_nodes=g.number_of_nodes()) | |||
| valid_neg_g = dgl.graph((valid_neg_u, valid_neg_v), num_nodes=g.number_of_nodes()) | |||
| test_pos_g = dgl.graph((test_pos_u, test_pos_v), num_nodes=g.number_of_nodes()) | |||
| test_neg_g = dgl.graph((test_neg_u, test_neg_v), num_nodes=g.number_of_nodes()) | |||
| return train_g, train_pos_g, train_neg_g, test_pos_g, test_neg_g | |||
| return train_g, train_pos_g, train_neg_g, valid_pos_g, valid_neg_g, test_pos_g, test_neg_g | |||
| def get_link_labels(pos_edge_index, neg_edge_index): | |||
| E = pos_edge_index.size(1) + neg_edge_index.size(1) | |||
| @@ -135,59 +147,71 @@ def lp_decode(z, pos_edge_index, neg_edge_index): | |||
| logits = (z[edge_index[0]] * z[edge_index[1]]).sum(dim=-1) | |||
| return logits | |||
| @torch.no_grad() | |||
| def evaluate(model, data, mask): | |||
| model.eval() | |||
| if mask == "val": offset = 3 | |||
| else: offset = 5 | |||
| z = model(data[0]) | |||
| link_logits = lp_decode( | |||
| z, torch.stack(data[offset].edges()), torch.stack(data[offset + 1].edges()) | |||
| ) | |||
| link_probs = link_logits.sigmoid() | |||
| link_labels = get_link_labels( | |||
| torch.stack(data[offset].edges()), torch.stack(data[offset + 1].edges()) | |||
| ) | |||
| result = roc_auc_score(link_labels.cpu().numpy(), link_probs.cpu().numpy()) | |||
| return result | |||
| res = [] | |||
| for seed in tqdm(range(1234, 1234+args.repeat)): | |||
| setup_seed(seed) | |||
| g = dataset[0].to(device) | |||
| train_g, train_pos_g, train_neg_g, test_pos_g, test_neg_g = split_train_test(g.cpu()) | |||
| train_g, train_pos_g, train_neg_g, test_pos_g, test_neg_g = train_g.to(device), train_pos_g.to(device), train_neg_g.to(device), test_pos_g.to(device), test_neg_g.to(device) | |||
| g = dataset[0] | |||
| splitted = list(split_train_valid_test(g)) | |||
| if args.model == 'gcn' or args.model == 'gat': | |||
| train_g = dgl.add_self_loop(train_g) | |||
| splitted[0] = dgl.add_self_loop(splitted[0]) | |||
| splitted = [g.to(device) for g in splitted] | |||
| if args.model == 'gcn': | |||
| model = GCN(train_g.ndata['feat'].shape[1], 16).to(device) | |||
| model = GCN(splitted[0].ndata['feat'].shape[1], 16).to(device) | |||
| elif args.model == 'gat': | |||
| model = GAT(train_g.ndata['feat'].shape[1], 64).to(device) | |||
| model = GAT(splitted[0].ndata['feat'].shape[1], 64).to(device) | |||
| elif args.model == 'sage': | |||
| model = GraphSAGE(train_g.ndata['feat'].shape[1], 16).to(device) | |||
| model = GraphSAGE(splitted[0].ndata['feat'].shape[1], 16).to(device) | |||
| else: | |||
| assert False | |||
| optimizer = torch.optim.Adam(model.parameters(), lr=0.01) | |||
| all_logits = [] | |||
| best_auc = 0. | |||
| for epoch in range(100): | |||
| model.train() | |||
| optimizer.zero_grad() | |||
| z = model(train_g) | |||
| z = model(splitted[0]) | |||
| link_logits = lp_decode( | |||
| z, torch.stack(train_pos_g.edges()), torch.stack(train_neg_g.edges()) | |||
| z, torch.stack(splitted[1].edges()), torch.stack(splitted[2].edges()) | |||
| ) | |||
| link_labels = get_link_labels( | |||
| torch.stack(train_pos_g.edges()), torch.stack(train_neg_g.edges()) | |||
| torch.stack(splitted[1].edges()), torch.stack(splitted[2].edges()) | |||
| ) | |||
| loss = F.binary_cross_entropy_with_logits(link_logits, link_labels) | |||
| loss.backward() | |||
| optimizer.step() | |||
| model.eval() | |||
| with torch.no_grad(): | |||
| z = model(train_g) | |||
| link_logits = lp_decode( | |||
| z, torch.stack(test_pos_g.edges()), torch.stack(test_neg_g.edges()) | |||
| ) | |||
| link_probs = link_logits.sigmoid() | |||
| link_labels = get_link_labels( | |||
| torch.stack(test_pos_g.edges()), torch.stack(test_neg_g.edges()) | |||
| ) | |||
| auc_val = evaluate(model, splitted, "val") | |||
| result = roc_auc_score(link_labels.cpu().numpy(), link_probs.cpu().numpy()) | |||
| res.append(result) | |||
| if auc_val > best_auc: | |||
| best_auc = auc_val | |||
| best_parameters = pickle.dumps(model.state_dict()) | |||
| model.load_state_dict(pickle.loads(best_parameters)) | |||
| res.append(evaluate(model, splitted, "test")) | |||
| print(np.mean(res), np.std(res)) | |||
| print("{:.2f} ~ {:.2f}".format(np.mean(res) * 100, np.std(res) * 100)) | |||
| """ | |||
| @@ -19,18 +19,18 @@ def get_encoder_decoder_hp(model='gin', decoder=None): | |||
| } | |||
| elif model == 'gcn': | |||
| model_hp = { | |||
| "num_layers": 2, | |||
| "hidden": [16], | |||
| "dropout": 0.0, | |||
| "act": "relu" | |||
| "num_layers": 3, | |||
| "hidden": [16, 16], | |||
| "dropout": 0., | |||
| "act": "relu", | |||
| } | |||
| elif model == 'sage': | |||
| model_hp = { | |||
| "num_layers": 2, | |||
| "hidden": [64], | |||
| "dropout": 0.0, | |||
| "act": "relu", | |||
| "agg": "mean", | |||
| 'num_layers': 3, | |||
| 'hidden': [16, 16], | |||
| 'dropout': 0.0, | |||
| 'act': 'relu', | |||
| 'agg': 'mean' | |||
| } | |||
| elif model == 'topk': | |||
| model_hp = { | |||
| @@ -1,174 +0,0 @@ | |||
| """ | |||
| Link Prediction using Graph Neural Networks | |||
| =========================================== | |||
| In the :doc:`introduction <1_introduction>`, you have already learned | |||
| the basic workflow of using GNNs for node classification, | |||
| i.e. predicting the category of a node in a graph. This tutorial will | |||
| teach you how to train a GNN for link prediction, i.e. predicting the | |||
| existence of an edge between two arbitrary nodes in a graph. | |||
| By the end of this tutorial you will be able to | |||
| - Build a GNN-based link prediction model. | |||
| - Train and evaluate the model on a small DGL-provided dataset. | |||
| (Time estimate: 28 minutes) | |||
| """ | |||
| import dgl | |||
| import torch | |||
| import torch.nn as nn | |||
| import torch.nn.functional as F | |||
| import itertools | |||
| import numpy as np | |||
| import scipy.sparse as sp | |||
| import dgl.function as fn | |||
| import random | |||
| import sys | |||
| sys.path.insert(0, "../") | |||
| from autogl.module.model.dgl.graphsage import GraphSAGE | |||
def setup_seed(seed):
    """Seed every random number generator used by this script.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU and all CUDA devices), and configures cuDNN for repeatable runs.

    Args:
        seed: Integer seed shared by all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Autotuning may select a different kernel on each run, so it is turned
    # off together with requesting deterministic algorithms.
    torch.backends.cudnn.benchmark = False
| setup_seed(1234) | |||
| import dgl.data | |||
| dataset = dgl.data.CoraGraphDataset() | |||
| g = dataset[0] | |||
def split_train_test(g):
    """Split the edges of ``g`` into training and test sets for link prediction.

    A random 10% of the edges become test positives; the remainder are the
    training positives. Negative pairs are sampled with replacement from the
    entries where ``1 - adjacency - identity`` is non-zero (for a simple graph
    without self-loops: unconnected pairs of distinct nodes) and are divided
    with the same sizes.

    Args:
        g: DGL graph whose edge endpoints can be converted with ``.numpy()``
            (i.e. a CPU graph).

    Returns:
        Tuple ``(train_g, train_pos_g, train_neg_g, test_pos_g, test_neg_g)``:
        ``g`` with the test edges removed, then graphs holding the training
        positive/negative pairs and the test positive/negative pairs. Every
        graph has as many nodes as ``g``.
    """
    u, v = g.edges()
    num_nodes = g.number_of_nodes()
    num_edges = g.number_of_edges()

    eids = np.random.permutation(np.arange(num_edges))
    test_size = int(len(eids) * 0.1)
    test_pos_u, test_pos_v = u[eids[:test_size]], v[eids[:test_size]]
    train_pos_u, train_pos_v = u[eids[test_size:]], v[eids[test_size:]]

    # Find all negative edges and split them for training and testing.
    # The shape is given explicitly: otherwise scipy infers it from the
    # largest endpoint id, which is too small when the highest-numbered nodes
    # have no edges, and the subtraction with the identity fails.
    adj = sp.coo_matrix(
        (np.ones(len(u)), (u.numpy(), v.numpy())),
        shape=(num_nodes, num_nodes),
    )
    adj_neg = 1 - adj.todense() - np.eye(num_nodes)
    neg_u, neg_v = np.where(adj_neg != 0)

    neg_eids = np.random.choice(len(neg_u), num_edges)
    test_neg_u, test_neg_v = neg_u[neg_eids[:test_size]], neg_v[neg_eids[:test_size]]
    train_neg_u, train_neg_v = neg_u[neg_eids[test_size:]], neg_v[neg_eids[test_size:]]

    train_g = dgl.remove_edges(g, eids[:test_size])

    train_pos_g = dgl.graph((train_pos_u, train_pos_v), num_nodes=num_nodes)
    train_neg_g = dgl.graph((train_neg_u, train_neg_v), num_nodes=num_nodes)

    test_pos_g = dgl.graph((test_pos_u, test_pos_v), num_nodes=num_nodes)
    test_neg_g = dgl.graph((test_neg_u, test_neg_v), num_nodes=num_nodes)

    return train_g, train_pos_g, train_neg_g, test_pos_g, test_neg_g
| train_g, train_pos_g, train_neg_g, test_pos_g, test_neg_g = split_train_test(g) | |||
| from dgl.nn import SAGEConv | |||
class Net(nn.Module):
    """Two-layer encoder built from DGL ``SAGEConv`` layers with 'mean' aggregation."""

    def __init__(self, in_feats, h_feats):
        super().__init__()
        self.conv1 = SAGEConv(in_feats, h_feats, 'mean')
        self.conv2 = SAGEConv(h_feats, h_feats, 'mean')

    def forward(self, data):
        """Encode the nodes of ``data`` from its ``'feat'`` node features."""
        hidden = F.relu(self.conv1(data, data.ndata['feat']))
        # No activation after the second layer.
        return self.conv2(data, hidden)
| # AUC on Cora: 0.78 | |||
class GraphSAGE_ours(GraphSAGE):
    """``GraphSAGE`` variant whose forward pass is its link-prediction encoder."""

    def __init__(self, args):
        super().__init__(args)

    def forward(self, data):
        """Return ``self.lp_encode(data)``."""
        return self.lp_encode(data)
| # AUC on Cora: 0.78 | |||
class DotPredictor(nn.Module):
    """Score each edge of a graph by the dot product of its endpoint embeddings."""

    def forward(self, g, h):
        """Return one score per edge of ``g`` given node embeddings ``h``."""
        # local_scope keeps the temporary 'h' / 'score' fields off the caller's graph.
        with g.local_scope():
            g.ndata['h'] = h
            # Store the source/destination dot product as edge feature 'score'.
            g.apply_edges(fn.u_dot_v('h', 'h', 'score'))
            edge_scores = g.edata['score']
            # u_dot_v yields a 1-element vector per edge; take that element.
            return edge_scores[:, 0]
# Encoder hyper-parameters handed to GraphSAGE_ours.
in_feats = train_g.ndata['feat'].shape[1]
args = {
    'features_num': in_feats,
    'num_class': 2,
    'num_layers': 3,
    'hidden': [16, 16],
    'dropout': 0.0,
    'act': 'relu',
    'agg': 'mean',
}

model = GraphSAGE_ours(args)
# Alternative plain DGL encoder: Net(train_g.ndata['feat'].shape[1], 16)
pred = DotPredictor()
def compute_loss(pos_score, neg_score):
    """Binary cross-entropy (with logits) over positive and negative edge scores.

    Positive scores are labelled 1 and negative scores 0. The labels are
    created with the dtype and device of the scores, so the loss is computed
    wherever the scores live instead of forcing a copy to the CPU each step.

    Args:
        pos_score: 1-D tensor of logits for positive pairs.
        neg_score: 1-D tensor of logits for negative pairs.

    Returns:
        Scalar loss tensor on the same device as the scores.
    """
    scores = torch.cat([pos_score, neg_score])
    labels = torch.cat([
        torch.ones(pos_score.shape[0], dtype=scores.dtype, device=scores.device),
        torch.zeros(neg_score.shape[0], dtype=scores.dtype, device=scores.device),
    ])
    return F.binary_cross_entropy_with_logits(scores, labels)
def compute_auc(pos_score, neg_score):
    """ROC AUC of edge scores, with positives labelled 1 and negatives 0.

    Args:
        pos_score: 1-D tensor of scores for positive pairs.
        neg_score: 1-D tensor of scores for negative pairs.

    Returns:
        The value returned by ``roc_auc_score`` for these labels and scores.
    """
    # detach + cpu: Tensor.numpy() raises for tensors that track gradients
    # or live on a CUDA device.
    scores = torch.cat([pos_score, neg_score]).detach().cpu().numpy()
    labels = torch.cat(
        [torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])]).numpy()
    return roc_auc_score(labels, scores)
# Optimise the encoder and the edge predictor jointly.
optimizer = torch.optim.Adam(itertools.chain(model.parameters(), pred.parameters()), lr=0.01)

all_logits = []
for e in range(100):
    model.train()
    # forward
    h = model(train_g)
    pos_score = pred(train_pos_g, h)
    neg_score = pred(train_neg_g, h)
    loss = compute_loss(pos_score, neg_score)

    # backward
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if e % 5 == 0:
        print('In epoch {}, loss: {}'.format(e, loss))

from sklearn.metrics import roc_auc_score

# Re-encode with the final weights in eval mode: the `h` left over from the
# loop was computed before the last optimizer step and in training mode.
model.eval()
with torch.no_grad():
    h = model(train_g)
    pos_score = pred(test_pos_g, h)
    neg_score = pred(test_neg_g, h)
    print('AUC', compute_auc(pos_score, neg_score))
| @@ -1,188 +0,0 @@ | |||
| from tqdm import tqdm | |||
| from autogl.datasets import build_dataset_from_name | |||
| from autogl.solver.classifier.link_predictor import AutoLinkPredictor | |||
| from autogl.module.train.evaluation import Auc | |||
| import random | |||
| import torch | |||
| import numpy as np | |||
| import dgl | |||
| import torch | |||
| import numpy as np | |||
| import scipy.sparse as sp | |||
| from autogl.datasets.utils.conversion import to_dgl_dataset | |||
| from helper import get_encoder_decoder_hp | |||
def construct_negative_graph(graph, k):
    """Sample ``k`` random destination nodes for every edge source of ``graph``.

    Destinations are drawn uniformly from all node ids, so a sampled pair is
    not guaranteed to be a true non-edge.

    Args:
        graph: Object exposing ``edges()`` (returning source and destination
            tensors) and ``num_nodes()``.
        k: Number of negative destinations per edge.

    Returns:
        Tuple ``(neg_src, neg_dst)`` of 1-D tensors with ``k`` entries per edge.
    """
    src, _ = graph.edges()
    neg_src = src.repeat_interleave(k)
    # Sample on the device of the sources so both endpoint tensors can be
    # used together when the graph lives on a GPU.
    neg_dst = torch.randint(
        0, graph.num_nodes(), (len(src) * k,), device=src.device
    )
    return neg_src, neg_dst
def negative_sample(data):
    """Sample five random negative destinations per edge of ``data``."""
    negatives_per_edge = 5
    return construct_negative_graph(data, negatives_per_edge)
| import autogl.datasets.utils as tmp_utils | |||
| tmp_utils.negative_sampling = negative_sample | |||
def setup_seed(seed):
    """Seed every random number generator used by this script.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU and all CUDA devices), and configures cuDNN for repeatable runs.

    Args:
        seed: Integer seed shared by all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Autotuning may select a different kernel on each run, so it is turned
    # off together with requesting deterministic algorithms.
    torch.backends.cudnn.benchmark = False
def fixed(**kwargs):
    """Turn keyword arguments into a list of ``"FIXED"`` hyper-parameter entries.

    Each ``name=value`` pair becomes a dict with the keys ``parameterName``,
    ``type`` (always ``"FIXED"``) and ``value``, in keyword order.
    """
    space = []
    for name, value in kwargs.items():
        space.append({
            'parameterName': name,
            "type": "FIXED",
            "value": value,
        })
    return space
def split_train_test(g):
    """Split the edges of ``g`` into training and test sets for link prediction.

    A random 10% of the edges become test positives; the remainder are the
    training positives. Negative pairs are sampled with replacement from the
    entries where ``1 - adjacency - identity`` is non-zero (for a simple graph
    without self-loops: unconnected pairs of distinct nodes) and are divided
    with the same sizes as the positives.

    Args:
        g: DGL graph whose edge endpoints can be converted with ``.numpy()``
            (i.e. a CPU graph).

    Returns:
        Tuple ``(train_g, train_pos_g, train_neg_g, test_pos_g, test_neg_g)``:
        ``g`` with the test edges removed, then graphs holding the training
        positive/negative pairs and the test positive/negative pairs. Every
        graph has as many nodes as ``g``.
    """
    u, v = g.edges()
    num_nodes = g.number_of_nodes()
    num_edges = g.number_of_edges()

    eids = np.random.permutation(np.arange(num_edges))
    test_size = int(len(eids) * 0.1)
    test_pos_u, test_pos_v = u[eids[:test_size]], v[eids[:test_size]]
    train_pos_u, train_pos_v = u[eids[test_size:]], v[eids[test_size:]]

    # Find all negative edges and split them for training and testing.
    # The shape is given explicitly: otherwise scipy infers it from the
    # largest endpoint id, which is too small when the highest-numbered nodes
    # have no edges, and the subtraction with the identity fails.
    adj = sp.coo_matrix(
        (np.ones(len(u)), (u.numpy(), v.numpy())),
        shape=(num_nodes, num_nodes),
    )
    adj_neg = 1 - adj.todense() - np.eye(num_nodes)
    neg_u, neg_v = np.where(adj_neg != 0)

    neg_eids = np.random.choice(len(neg_u), num_edges)
    test_neg_u, test_neg_v = neg_u[neg_eids[:test_size]], neg_v[neg_eids[:test_size]]
    # Everything after the test slice is training data. Slicing from
    # `train_size` here kept only the last ~10% of the samples, leaving far
    # fewer training negatives than training positives.
    train_neg_u, train_neg_v = neg_u[neg_eids[test_size:]], neg_v[neg_eids[test_size:]]

    train_g = dgl.remove_edges(g, eids[:test_size])

    train_pos_g = dgl.graph((train_pos_u, train_pos_v), num_nodes=num_nodes)
    train_neg_g = dgl.graph((train_neg_u, train_neg_v), num_nodes=num_nodes)

    test_pos_g = dgl.graph((test_pos_u, test_pos_v), num_nodes=num_nodes)
    test_neg_g = dgl.graph((test_neg_u, test_neg_v), num_nodes=num_nodes)

    return train_g, train_pos_g, train_neg_g, test_pos_g, test_neg_g
def split_train_valid_test(g):
    """Split the edges of ``g`` into train/validation/test sets for link prediction.

    After a random permutation, the first 10% of the edges are test
    positives, the next 10% validation positives and the rest training
    positives. Negative pairs are sampled with replacement from the entries
    where ``1 - adjacency - identity`` is non-zero (for a simple graph without
    self-loops: unconnected pairs of distinct nodes) and are divided with the
    same sizes as the positives.

    Args:
        g: DGL graph whose edge endpoints can be converted with ``.numpy()``
            (i.e. a CPU graph).

    Returns:
        Tuple ``(train_g, train_pos_g, train_neg_g, valid_pos_g, valid_neg_g,
        test_pos_g, test_neg_g)``: ``g`` with the validation and test edges
        removed, followed by graphs holding the positive/negative pairs of
        each split. Every graph has as many nodes as ``g``.
    """
    u, v = g.edges()
    num_nodes = g.number_of_nodes()
    num_edges = g.number_of_edges()

    eids = np.random.permutation(np.arange(num_edges))
    valid_size = int(len(eids) * 0.1)
    test_size = int(len(eids) * 0.1)
    held_out = test_size + valid_size

    test_pos_u, test_pos_v = u[eids[:test_size]], v[eids[:test_size]]
    valid_pos_u, valid_pos_v = u[eids[test_size:held_out]], v[eids[test_size:held_out]]
    train_pos_u, train_pos_v = u[eids[held_out:]], v[eids[held_out:]]

    # Find all negative edges and split them for training and testing.
    # The shape is given explicitly: otherwise scipy infers it from the
    # largest endpoint id, which is too small when the highest-numbered nodes
    # have no edges, and the subtraction with the identity fails.
    adj = sp.coo_matrix(
        (np.ones(len(u)), (u.numpy(), v.numpy())),
        shape=(num_nodes, num_nodes),
    )
    adj_neg = 1 - adj.todense() - np.eye(num_nodes)
    neg_u, neg_v = np.where(adj_neg != 0)

    neg_eids = np.random.choice(len(neg_u), num_edges)
    test_neg_u, test_neg_v = neg_u[neg_eids[:test_size]], neg_v[neg_eids[:test_size]]
    valid_neg_u, valid_neg_v = neg_u[neg_eids[test_size:held_out]], neg_v[neg_eids[test_size:held_out]]
    # Everything after the held-out slices is training data. Slicing from
    # `train_size` here kept only the last ~20% of the samples, leaving far
    # fewer training negatives than training positives.
    train_neg_u, train_neg_v = neg_u[neg_eids[held_out:]], neg_v[neg_eids[held_out:]]

    train_g = dgl.remove_edges(g, eids[:held_out])

    train_pos_g = dgl.graph((train_pos_u, train_pos_v), num_nodes=num_nodes)
    train_neg_g = dgl.graph((train_neg_u, train_neg_v), num_nodes=num_nodes)

    valid_pos_g = dgl.graph((valid_pos_u, valid_pos_v), num_nodes=num_nodes)
    valid_neg_g = dgl.graph((valid_neg_u, valid_neg_v), num_nodes=num_nodes)

    test_pos_g = dgl.graph((test_pos_u, test_pos_v), num_nodes=num_nodes)
    test_neg_g = dgl.graph((test_neg_u, test_neg_v), num_nodes=num_nodes)

    return train_g, train_pos_g, train_neg_g, valid_pos_g, valid_neg_g, test_pos_g, test_neg_g
if __name__ == "__main__":
    # Repeatedly fit an AutoLinkPredictor with fixed hyper-parameters on one
    # train/valid/test edge split and report the test AUC of every repeat.
    from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter

    parser = ArgumentParser(
        "auto link prediction", formatter_class=ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        "--dataset",
        default="Cora",
        type=str,
        help="dataset to use",
        choices=[
            "Cora",
            "CiteSeer",
            "PubMed",
        ],
    )
    parser.add_argument(
        "--model",
        default="sage",
        type=str,
        help="model to use",
        choices=[
            "gcn",
            "gat",
            "sage",
            "gin",
            "topk"
        ],
    )
    parser.add_argument("--seed", type=int, default=0, help="random seed")
    parser.add_argument('--repeat', type=int, default=10)
    parser.add_argument("--device", default="cuda", type=str, help="GPU device")

    args = parser.parse_args()

    dataset = build_dataset_from_name(args.dataset.lower())
    dataset = to_dgl_dataset(dataset)

    # The split below draws from NumPy's global generator; seed it from
    # --seed (previously parsed but unused) so the split is reproducible.
    setup_seed(args.seed)
    train_g, train_pos_g, train_neg_g, val_pos_g, val_neg_g, test_pos_g, test_neg_g = split_train_valid_test(dataset[0].cpu())

    dataset = [[train_g, train_pos_g, train_neg_g, val_pos_g, val_neg_g, test_pos_g, test_neg_g]]

    res = []
    for seed in tqdm(range(1234, 1234+args.repeat)):
        # set random seed
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed(seed)
            torch.backends.cudnn.deterministic = True
            torch.backends.cudnn.benchmark = False

        model_hp, decoder_hp = get_encoder_decoder_hp(args.model)

        autoClassifier = AutoLinkPredictor(
            feature_module=None,
            graph_models=(args.model,),
            ensemble_module=None,
            max_evals=1,
            hpo_module='random',
            trainer_hp_space=fixed(**{
                "max_epoch": 100,
                "early_stopping_round": 100 + 1,
                "lr":0.01,
                "weight_decay": 0.0,
            }),
            model_hp_spaces=[{"encoder": fixed(**model_hp), "decoder": fixed(**decoder_hp)}]
        )

        autoClassifier.fit(
            dataset,
            time_limit=3600,
            evaluation_method=[Auc],
            seed=seed,
        )
        auc = autoClassifier.evaluate(metric='auc')
        print("test auc: {:.4f}".format(auc))
        # Keep every repeat's score; `res` was created but never filled.
        res.append(auc)

    if res:
        print(np.mean(res), np.std(res))
| @@ -1,202 +0,0 @@ | |||
| from tqdm import tqdm | |||
| from autogl.datasets import build_dataset_from_name | |||
| from autogl.module.train.evaluation import Auc | |||
| import random | |||
| import torch | |||
| import numpy as np | |||
| import dgl | |||
| import torch | |||
| import numpy as np | |||
| import scipy.sparse as sp | |||
| from autogl.datasets.utils.conversion import to_dgl_dataset | |||
| from helper import get_encoder_decoder_hp | |||
def construct_negative_graph(graph, k):
    """Sample ``k`` random destination nodes for every edge source of ``graph``.

    Destinations are drawn uniformly from all node ids, so a sampled pair is
    not guaranteed to be a true non-edge.

    Args:
        graph: Object exposing ``edges()`` (returning source and destination
            tensors) and ``num_nodes()``.
        k: Number of negative destinations per edge.

    Returns:
        Tuple ``(neg_src, neg_dst)`` of 1-D tensors with ``k`` entries per edge.
    """
    src, _ = graph.edges()
    neg_src = src.repeat_interleave(k)
    # Sample on the device of the sources so both endpoint tensors can be
    # used together when the graph lives on a GPU.
    neg_dst = torch.randint(
        0, graph.num_nodes(), (len(src) * k,), device=src.device
    )
    return neg_src, neg_dst
def negative_sample(data):
    """Sample five random negative destinations per edge of ``data``."""
    negatives_per_edge = 5
    return construct_negative_graph(data, negatives_per_edge)
| import autogl.datasets.utils as tmp_utils | |||
| tmp_utils.negative_sampling = negative_sample | |||
| from autogl.module.train.link_prediction_full import LinkPredictionTrainer | |||
def setup_seed(seed):
    """Seed every random number generator used by this script.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU and all CUDA devices), and configures cuDNN for repeatable runs.

    Args:
        seed: Integer seed shared by all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Autotuning may select a different kernel on each run, so it is turned
    # off together with requesting deterministic algorithms.
    torch.backends.cudnn.benchmark = False
def split_train_test(g):
    """Split the edges of ``g`` into training and test sets for link prediction.

    A random 10% of the edges become test positives; the remainder are the
    training positives. Negative pairs are sampled with replacement from the
    entries where ``1 - adjacency - identity`` is non-zero (for a simple graph
    without self-loops: unconnected pairs of distinct nodes) and are divided
    with the same sizes.

    Args:
        g: DGL graph whose edge endpoints can be converted with ``.numpy()``
            (i.e. a CPU graph).

    Returns:
        Tuple ``(train_g, train_pos_g, train_neg_g, test_pos_g, test_neg_g)``:
        ``g`` with the test edges removed, then graphs holding the training
        positive/negative pairs and the test positive/negative pairs. Every
        graph has as many nodes as ``g``.
    """
    u, v = g.edges()
    num_nodes = g.number_of_nodes()
    num_edges = g.number_of_edges()

    eids = np.random.permutation(np.arange(num_edges))
    test_size = int(len(eids) * 0.1)
    test_pos_u, test_pos_v = u[eids[:test_size]], v[eids[:test_size]]
    train_pos_u, train_pos_v = u[eids[test_size:]], v[eids[test_size:]]

    # Find all negative edges and split them for training and testing.
    # The shape is given explicitly: otherwise scipy infers it from the
    # largest endpoint id, which is too small when the highest-numbered nodes
    # have no edges, and the subtraction with the identity fails.
    adj = sp.coo_matrix(
        (np.ones(len(u)), (u.numpy(), v.numpy())),
        shape=(num_nodes, num_nodes),
    )
    adj_neg = 1 - adj.todense() - np.eye(num_nodes)
    neg_u, neg_v = np.where(adj_neg != 0)

    neg_eids = np.random.choice(len(neg_u), num_edges)
    test_neg_u, test_neg_v = neg_u[neg_eids[:test_size]], neg_v[neg_eids[:test_size]]
    train_neg_u, train_neg_v = neg_u[neg_eids[test_size:]], neg_v[neg_eids[test_size:]]

    train_g = dgl.remove_edges(g, eids[:test_size])

    train_pos_g = dgl.graph((train_pos_u, train_pos_v), num_nodes=num_nodes)
    train_neg_g = dgl.graph((train_neg_u, train_neg_v), num_nodes=num_nodes)

    test_pos_g = dgl.graph((test_pos_u, test_pos_v), num_nodes=num_nodes)
    test_neg_g = dgl.graph((test_neg_u, test_neg_v), num_nodes=num_nodes)

    return train_g, train_pos_g, train_neg_g, test_pos_g, test_neg_g
def split_train_valid_test(g):
    """Split the edges of ``g`` into train/validation/test sets for link prediction.

    After a random permutation, the first 10% of the edges are test
    positives, the next 10% validation positives and the rest training
    positives. Negative pairs are sampled with replacement from the entries
    where ``1 - adjacency - identity`` is non-zero (for a simple graph without
    self-loops: unconnected pairs of distinct nodes) and are divided with the
    same sizes as the positives.

    Args:
        g: DGL graph whose edge endpoints can be converted with ``.numpy()``
            (i.e. a CPU graph).

    Returns:
        Tuple ``(train_g, train_pos_g, train_neg_g, valid_pos_g, valid_neg_g,
        test_pos_g, test_neg_g)``: ``g`` with the validation and test edges
        removed, followed by graphs holding the positive/negative pairs of
        each split. Every graph has as many nodes as ``g``.
    """
    u, v = g.edges()
    num_nodes = g.number_of_nodes()
    num_edges = g.number_of_edges()

    eids = np.random.permutation(np.arange(num_edges))
    valid_size = int(len(eids) * 0.1)
    test_size = int(len(eids) * 0.1)
    held_out = test_size + valid_size

    test_pos_u, test_pos_v = u[eids[:test_size]], v[eids[:test_size]]
    valid_pos_u, valid_pos_v = u[eids[test_size:held_out]], v[eids[test_size:held_out]]
    train_pos_u, train_pos_v = u[eids[held_out:]], v[eids[held_out:]]

    # Find all negative edges and split them for training and testing.
    # The shape is given explicitly: otherwise scipy infers it from the
    # largest endpoint id, which is too small when the highest-numbered nodes
    # have no edges, and the subtraction with the identity fails.
    adj = sp.coo_matrix(
        (np.ones(len(u)), (u.numpy(), v.numpy())),
        shape=(num_nodes, num_nodes),
    )
    adj_neg = 1 - adj.todense() - np.eye(num_nodes)
    neg_u, neg_v = np.where(adj_neg != 0)

    neg_eids = np.random.choice(len(neg_u), num_edges)
    test_neg_u, test_neg_v = neg_u[neg_eids[:test_size]], neg_v[neg_eids[:test_size]]
    valid_neg_u, valid_neg_v = neg_u[neg_eids[test_size:held_out]], neg_v[neg_eids[test_size:held_out]]
    train_neg_u, train_neg_v = neg_u[neg_eids[held_out:]], neg_v[neg_eids[held_out:]]

    train_g = dgl.remove_edges(g, eids[:held_out])

    train_pos_g = dgl.graph((train_pos_u, train_pos_v), num_nodes=num_nodes)
    train_neg_g = dgl.graph((train_neg_u, train_neg_v), num_nodes=num_nodes)

    valid_pos_g = dgl.graph((valid_pos_u, valid_pos_v), num_nodes=num_nodes)
    valid_neg_g = dgl.graph((valid_neg_u, valid_neg_v), num_nodes=num_nodes)

    test_pos_g = dgl.graph((test_pos_u, test_pos_v), num_nodes=num_nodes)
    test_neg_g = dgl.graph((test_neg_u, test_neg_v), num_nodes=num_nodes)

    return train_g, train_pos_g, train_neg_g, valid_pos_g, valid_neg_g, test_pos_g, test_neg_g
if __name__ == "__main__":
    # Train a LinkPredictionTrainer with fixed hyper-parameters on a fresh
    # train/test edge split for each seed and report mean/std of the test AUC.
    from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter

    parser = ArgumentParser(
        "auto link prediction", formatter_class=ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        "--dataset",
        default="Cora",
        type=str,
        help="dataset to use",
        choices=[
            "Cora",
            "CiteSeer",
            "PubMed",
        ],
    )
    parser.add_argument(
        "--model",
        default="sage",
        type=str,
        help="model to use",
        choices=[
            "gcn",
            "gat",
            "sage",
            "gin",
            "topk"
        ],
    )
    parser.add_argument("--seed", type=int, default=0, help="random seed")
    parser.add_argument('--repeat', type=int, default=10)
    parser.add_argument("--device", default="cuda", type=str, help="GPU device")

    args = parser.parse_args()

    dataset = build_dataset_from_name(args.dataset.lower())
    dataset = to_dgl_dataset(dataset)

    res = []
    for seed in tqdm(range(1234, 1234+args.repeat)):
        # set random seed
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed(seed)
            torch.backends.cudnn.deterministic = True
            torch.backends.cudnn.benchmark = False

        # The full graph is only read for its feature width and then split
        # on the CPU, so it is not copied to the device first; only the
        # split graphs below are moved.
        graph = dataset[0]
        num_features = graph.ndata['feat'].size(1)

        model_hp, decoder_hp = get_encoder_decoder_hp(args.model)

        trainer = LinkPredictionTrainer(
            model = args.model,
            num_features = num_features,
            lr = 1e-2,
            max_epoch = 100,
            early_stopping_round = 101,
            weight_decay = 0.0,
            device = "auto",
            init = True,
            feval = [Auc],
            loss = "binary_cross_entropy_with_logits",
        ).duplicate_from_hyper_parameter(
            {
                "trainer": {},
                "encoder": model_hp,
                "decoder": decoder_hp
            },
            restricted=False
        )

        train_g, train_pos_g, train_neg_g, test_pos_g, test_neg_g = split_train_test(graph.cpu())

        dataset_splitted = {
            'train': train_g.to(args.device),
            'train_pos': train_pos_g.to(args.device),
            'train_neg': train_neg_g.to(args.device),
            'test_pos': test_pos_g.to(args.device),
            'test_neg': test_neg_g.to(args.device),
        }

        trainer.train([dataset_splitted], False)
        pre = trainer.evaluate([dataset_splitted], mask="test", feval=Auc)
        result = pre.item()
        res.append(result)

    print(np.mean(res), np.std(res))
| """ | |||
| AUC 0.8151564430268863 | |||
| """ | |||
| @@ -1,22 +1,17 @@ | |||
| import dgl | |||
| import torch | |||
| import pickle | |||
| import torch.nn as nn | |||
| import torch.nn.functional as F | |||
| import itertools | |||
| import numpy as np | |||
| import scipy.sparse as sp | |||
| import dgl.function as fn | |||
| import random | |||
| from dgl.data import CoraGraphDataset, PubmedGraphDataset, CiteseerGraphDataset | |||
| # from autogl.module.train.link_prediction_full import LinkPredictionTrainer | |||
| import sys | |||
| sys.path.insert(0, "../") | |||
| from autogl.module.model.dgl.graphsage import GraphSAGE | |||
| from autogl.module.model.dgl import AutoSAGE, AutoGCN, AutoGAT | |||
| import dgl.data | |||
| from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter | |||
| from tqdm import tqdm | |||
| from dgl.nn import SAGEConv | |||
| from sklearn.metrics import roc_auc_score | |||
| @@ -50,15 +45,19 @@ def setup_seed(seed): | |||
| random.seed(seed) | |||
| def split_train_test(g): | |||
| def split_train_valid_test(g): | |||
| u, v = g.edges() | |||
| eids = np.arange(g.number_of_edges()) | |||
| eids = np.random.permutation(eids) | |||
| valid_size = int(len(eids) * 0.1) | |||
| test_size = int(len(eids) * 0.1) | |||
| train_size = g.number_of_edges() - test_size | |||
| train_size = g.number_of_edges() - test_size - valid_size | |||
| test_pos_u, test_pos_v = u[eids[:test_size]], v[eids[:test_size]] | |||
| train_pos_u, train_pos_v = u[eids[test_size:]], v[eids[test_size:]] | |||
| valid_pos_u, valid_pos_v = u[eids[test_size:test_size+valid_size]], v[eids[test_size:test_size+valid_size]] | |||
| train_pos_u, train_pos_v = u[eids[test_size+valid_size:]], v[eids[test_size+valid_size:]] | |||
| # Find all negative edges and split them for training and testing | |||
| adj = sp.coo_matrix((np.ones(len(u)), (u.numpy(), v.numpy()))) | |||
| @@ -67,17 +66,22 @@ def split_train_test(g): | |||
| neg_eids = np.random.choice(len(neg_u), g.number_of_edges()) | |||
| test_neg_u, test_neg_v = neg_u[neg_eids[:test_size]], neg_v[neg_eids[:test_size]] | |||
| train_neg_u, train_neg_v = neg_u[neg_eids[train_size:]], neg_v[neg_eids[train_size:]] | |||
| valid_neg_u, valid_neg_v = neg_u[neg_eids[test_size:test_size+valid_size]], neg_v[neg_eids[test_size:test_size+valid_size]] | |||
| train_neg_u, train_neg_v = neg_u[neg_eids[test_size+valid_size:]], neg_v[neg_eids[test_size+valid_size:]] | |||
| train_g = dgl.remove_edges(g, eids[:test_size]) | |||
| train_g = dgl.remove_edges(g, eids[:test_size+valid_size]) | |||
| train_pos_g = dgl.graph((train_pos_u, train_pos_v), num_nodes=g.number_of_nodes()) | |||
| train_neg_g = dgl.graph((train_neg_u, train_neg_v), num_nodes=g.number_of_nodes()) | |||
| valid_pos_g = dgl.graph((valid_pos_u, valid_pos_v), num_nodes=g.number_of_nodes()) | |||
| valid_neg_g = dgl.graph((valid_neg_u, valid_neg_v), num_nodes=g.number_of_nodes()) | |||
| test_pos_g = dgl.graph((test_pos_u, test_pos_v), num_nodes=g.number_of_nodes()) | |||
| test_neg_g = dgl.graph((test_neg_u, test_neg_v), num_nodes=g.number_of_nodes()) | |||
| return train_g, train_pos_g, train_neg_g, test_pos_g, test_neg_g | |||
| return train_g, train_pos_g, train_neg_g, valid_pos_g, valid_neg_g, test_pos_g, test_neg_g | |||
| class DotPredictor(nn.Module): | |||
| def forward(self, g, h): | |||
| @@ -108,74 +112,95 @@ def get_link_labels(pos_edge_index, neg_edge_index): | |||
| link_labels[: pos_edge_index.size(1)] = 1.0 | |||
| return link_labels | |||
@torch.no_grad()
def evaluate(model, data, mask):
    """Score ``model`` on one held-out edge split and return its ROC-AUC.

    ``data`` is indexed positionally: ``data[0]`` is handed to
    ``model.lp_encode``; the positive / negative edge graphs are read from
    ``data[3]`` / ``data[4]`` when ``mask == "val"`` and from ``data[5]`` /
    ``data[6]`` for every other ``mask`` value.
    """
    model.eval()
    first = 3 if mask == "val" else 5

    embeddings = model.lp_encode(data[0])

    # Build the (2, num_edges) index tensors once; both the decoder and the
    # label builder receive the same pair.
    pos_edges = torch.stack(data[first].edges())
    neg_edges = torch.stack(data[first + 1].edges())

    probabilities = model.lp_decode(embeddings, pos_edges, neg_edges).sigmoid()
    targets = get_link_labels(pos_edges, neg_edges)
    return roc_auc_score(targets.cpu().numpy(), probabilities.cpu().numpy())
| res = [] | |||
| for seed in tqdm(range(1234, 1234+args.repeat)): | |||
| setup_seed(seed) | |||
| g = dataset[0].to(device) | |||
| train_g, train_pos_g, train_neg_g, test_pos_g, test_neg_g = split_train_test(g.cpu()) | |||
| train_g, train_pos_g, train_neg_g, test_pos_g, test_neg_g = train_g.to(device), train_pos_g.to(device), train_neg_g.to(device), test_pos_g.to(device), test_neg_g.to(device) | |||
| g = dataset[0] | |||
| splitted = list(split_train_valid_test(g)) | |||
| if args.model == 'gcn' or args.model == 'gat': | |||
| splitted[0] = dgl.add_self_loop(splitted[0]) | |||
| splitted = [g.to(device) for g in splitted] | |||
| if args.model == 'gcn': | |||
| pass | |||
| model = AutoGCN( | |||
| input_dimension=splitted[0].ndata['feat'].shape[1], | |||
| output_dimension=2, | |||
| device=args.device, | |||
| ).from_hyper_parameter({ | |||
| "num_layers": 3, | |||
| "hidden": [16, 16], | |||
| "dropout": 0., | |||
| "act": "relu", | |||
| }).model | |||
| elif args.model == 'gat': | |||
| pass | |||
| model = AutoGAT( | |||
| input_dimension=splitted[0].ndata['feat'].shape[1], | |||
| output_dimension=2, | |||
| device=args.device, | |||
| ).from_hyper_parameter({ | |||
| "num_layers": 3, | |||
| "hidden": [8], | |||
| "heads": 8, | |||
| "dropout": 0.0, | |||
| "act": "relu" | |||
| }) | |||
| elif args.model == 'sage': | |||
| para = { | |||
| 'features_num': train_g.ndata['feat'].shape[1], | |||
| 'num_class': 2, | |||
| model = AutoSAGE( | |||
| num_features=splitted[0].ndata['feat'].shape[1], | |||
| num_classes=2, | |||
| device=args.device | |||
| ).from_hyper_parameter({ | |||
| 'num_layers': 3, | |||
| 'hidden': [16, 16], | |||
| 'dropout': 0.0, | |||
| 'act': 'relu', | |||
| 'agg': 'mean', | |||
| } | |||
| model = GraphSAGE(para).to(device) | |||
| else: | |||
| assert False | |||
| 'agg': 'mean' | |||
| }).model | |||
| optimizer = torch.optim.Adam(model.parameters(), lr=0.01) | |||
| all_logits = [] | |||
| best_auc = 0. | |||
| for epoch in range(100): | |||
| model.train() | |||
| optimizer.zero_grad() | |||
| z = model.lp_encode(train_g) | |||
| z = model.lp_encode(splitted[0]) | |||
| link_logits = model.lp_decode( | |||
| z, torch.stack(train_pos_g.edges()), torch.stack(train_neg_g.edges()) | |||
| z, torch.stack(splitted[1].edges()), torch.stack(splitted[2].edges()) | |||
| ) | |||
| link_labels = get_link_labels( | |||
| torch.stack(train_pos_g.edges()), torch.stack(train_neg_g.edges()) | |||
| torch.stack(splitted[1].edges()), torch.stack(splitted[2].edges()) | |||
| ) | |||
| loss = F.binary_cross_entropy_with_logits(link_logits, link_labels) | |||
| loss.backward() | |||
| optimizer.step() | |||
| model.eval() | |||
| with torch.no_grad(): | |||
| z = model.lp_encode(train_g) | |||
| link_logits = model.lp_decode( | |||
| z, torch.stack(test_pos_g.edges()), torch.stack(test_neg_g.edges()) | |||
| ) | |||
| link_probs = link_logits.sigmoid() | |||
| link_labels = get_link_labels( | |||
| torch.stack(test_pos_g.edges()), torch.stack(test_neg_g.edges()) | |||
| ) | |||
| result = roc_auc_score(link_labels.cpu().numpy(), link_probs.cpu().numpy()) | |||
| res.append(result) | |||
| print(np.mean(res), np.std(res)) | |||
| auc_val = evaluate(model, splitted, "val") | |||
| if auc_val > best_auc: | |||
| best_auc = auc_val | |||
| best_parameters = pickle.dumps(model.state_dict()) | |||
| model.load_state_dict(pickle.loads(best_parameters)) | |||
| res.append(evaluate(model, splitted, "test")) | |||
| print("{:.2f} ~ {:.2f}".format(np.mean(res) * 100, np.std(res) * 100)) | |||
| @@ -0,0 +1,103 @@ | |||
| import torch | |||
| import random | |||
| import numpy as np | |||
| import dgl | |||
| from tqdm import tqdm | |||
| from autogl.datasets import build_dataset_from_name | |||
| from autogl.solver.classifier.link_predictor import AutoLinkPredictor | |||
| from autogl.datasets.utils.conversion import to_dgl_dataset | |||
| from autogl.datasets.utils import split_edges | |||
| from helper import get_encoder_decoder_hp | |||
def fixed(**kwargs):
    """Wrap every keyword argument as a ``FIXED`` hyper-parameter entry.

    Returns a list with one dict per keyword, in keyword order, each holding
    the keys ``parameterName``, ``type`` (always ``"FIXED"``) and ``value``.
    """
    return [
        dict(parameterName=name, type="FIXED", value=setting)
        for name, setting in kwargs.items()
    ]
# Script entry point: for each seed in range(1234, 1234 + args.repeat) it
# builds an AutoLinkPredictor with fixed hyper-parameters, fits it on one
# edge split, collects the reported AUC in `res`, and prints mean ~ std
# (both multiplied by 100).
# NOTE(review): leading indentation is missing in this copy of the file, so
# the exact nesting of the loop / `if` bodies below cannot be read off the
# text — confirm against the real source before restructuring.
| if __name__ == "__main__": | |||
| from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter | |||
| parser = ArgumentParser( | |||
| "auto link prediction", formatter_class=ArgumentDefaultsHelpFormatter | |||
| ) | |||
| parser.add_argument( | |||
| "--dataset", | |||
| default="Cora", | |||
| type=str, | |||
| help="dataset to use", | |||
| choices=[ | |||
| "Cora", | |||
| "CiteSeer", | |||
| "PubMed", | |||
| ], | |||
| ) | |||
| parser.add_argument( | |||
| "--model", | |||
| default="sage", | |||
| type=str, | |||
| help="model to use", | |||
| choices=[ | |||
| "gcn", | |||
| "gat", | |||
| "sage", | |||
| "gin", | |||
| "topk" | |||
| ], | |||
| ) | |||
# NOTE(review): args.seed is parsed but never read in this block; the loop
# below takes its seeds from range(1234, 1234 + args.repeat) instead.
| parser.add_argument("--seed", type=int, default=0, help="random seed") | |||
| parser.add_argument('--repeat', type=int, default=10) | |||
# NOTE(review): args.device is parsed but never read in this block; the
# AutoLinkPredictor below is constructed with device="cpu".
| parser.add_argument("--device", default="cuda", type=str, help="GPU device") | |||
| args = parser.parse_args() | |||
# The dataset name is lower-cased before lookup, then converted with
# to_dgl_dataset.
| dataset = build_dataset_from_name(args.dataset.lower()) | |||
| dataset = to_dgl_dataset(dataset) | |||
# One AUC value is appended to `res` per seed.
| res = [] | |||
| for seed in tqdm(range(1234, 1234+args.repeat)): | |||
| # set random seed | |||
| random.seed(seed) | |||
| np.random.seed(seed) | |||
| torch.manual_seed(seed) | |||
| if torch.cuda.is_available(): | |||
| torch.cuda.manual_seed(seed) | |||
| torch.backends.cudnn.deterministic = True | |||
| torch.backends.cudnn.benchmark = False | |||
# Only element [0] of split_edges' result is used, materialised as a list so
# that entry 0 can be replaced below.
# presumably 0.8 / 0.1 are train / validation ratios — verify against split_edges.
| gs = list(split_edges(dataset, 0.8, 0.1)[0]) | |||
# Self-loops are added to gs[0] for the gcn and gat models only.
| if args.model == 'gcn' or args.model == 'gat': | |||
| gs[0] = dgl.add_self_loop(gs[0]) | |||
| model_hp, decoder_hp = get_encoder_decoder_hp(args.model) | |||
# Every hyper-parameter is wrapped with fixed(...), and max_evals is 1.
| autoClassifier = AutoLinkPredictor( | |||
| feature_module=None, | |||
| graph_models=(args.model,), | |||
| ensemble_module=None, | |||
| max_evals=1, | |||
| hpo_module='random', | |||
| trainer_hp_space=fixed(**{ | |||
| "max_epoch": 100, | |||
# early_stopping_round is one larger than max_epoch — presumably so early
# stopping never triggers; TODO confirm against the trainer.
| "early_stopping_round": 100 + 1, | |||
| "lr":0.01, | |||
| "weight_decay": 0.0, | |||
| }), | |||
| model_hp_spaces=[{"encoder": fixed(**model_hp), "decoder": fixed(**decoder_hp)}], | |||
| device="cpu" | |||
| ) | |||
# The split list is passed wrapped in a one-element list.
| autoClassifier.fit( | |||
| [gs], | |||
| time_limit=3600, | |||
| evaluation_method=["auc"], | |||
| seed=seed, | |||
| ) | |||
| auc = autoClassifier.evaluate(metric='auc') | |||
| res.append(auc) | |||
# Prints mean and standard deviation of the collected AUCs, scaled by 100.
| print("{:.2f} ~ {:.2f}".format(np.mean(res) * 100, np.std(res) * 100)) | |||
| @@ -9,21 +9,6 @@ import numpy as np | |||
| import scipy.sparse as sp | |||
| from helper import get_encoder_decoder_hp | |||
def construct_negative_graph(graph, k):
    """Pair every edge source of ``graph`` with ``k`` random destinations.

    Returns ``(neg_src, neg_dst)``: each source node repeated ``k`` times in
    edge order, and destinations drawn by ``torch.randint`` from
    ``[0, graph.num_nodes())``. Sampled pairs are not checked against the
    existing edges of ``graph``.
    """
    sources, _ = graph.edges()
    repeated_sources = sources.repeat_interleave(k)
    sample_count = len(sources) * k
    random_targets = torch.randint(0, graph.num_nodes(), (sample_count,))
    return repeated_sources, random_targets
def negative_sample(data):
    """Return ``construct_negative_graph(data, 5)``: five samples per edge."""
    samples_per_edge = 5
    return construct_negative_graph(data, samples_per_edge)
# Monkey-patch: rebind the `negative_sampling` attribute of the
# autogl.datasets.utils module to the local `negative_sample` function.
# NOTE(review): this only affects code that looks the name up on the module
# after this line runs — confirm the intended consumers do so.
| import autogl.datasets.utils as tmp_utils | |||
| tmp_utils.negative_sampling = negative_sample | |||
| from dgl.data import CoraGraphDataset, PubmedGraphDataset, CiteseerGraphDataset | |||
| from autogl.module.train.link_prediction_full import LinkPredictionTrainer | |||
| @@ -34,37 +19,6 @@ def setup_seed(seed): | |||
| np.random.seed(seed) | |||
| random.seed(seed) | |||
def split_train_test(g):
    """Randomly split the edges of ``g`` into train / test link-prediction sets.

    A random 10% of the edges (rounded down) become test positives; the
    remaining edges are training positives. As many negative node pairs as
    ``g`` has edges are drawn (with replacement, ``np.random.choice``) from
    the pairs that are neither an edge nor on the diagonal, and are split at
    the same index. NumPy's global RNG is used, so seed it beforehand for
    reproducible splits.

    Args:
        g: graph exposing ``edges()``, ``number_of_edges()`` and
            ``number_of_nodes()``; the edge endpoint tensors must support
            ``.numpy()`` (assumes a CPU DGL graph — TODO confirm at callers).

    Returns:
        ``(train_g, train_pos_g, train_neg_g, test_pos_g, test_neg_g)`` where
        ``train_g`` is ``g`` with the test edges removed and the other four
        are graphs built from the respective edge lists over
        ``g.number_of_nodes()`` nodes.
    """
    u, v = g.edges()
    num_nodes = g.number_of_nodes()
    num_edges = g.number_of_edges()

    eids = np.random.permutation(np.arange(num_edges))
    test_size = int(len(eids) * 0.1)
    test_pos_u, test_pos_v = u[eids[:test_size]], v[eids[:test_size]]
    train_pos_u, train_pos_v = u[eids[test_size:]], v[eids[test_size:]]

    # Find all negative edges and split them for training and testing.
    # The shape is passed explicitly: left to itself, coo_matrix sizes the
    # matrix from the largest node id that appears in an edge, which makes
    # the subtraction of the identity fail when the highest-numbered nodes
    # are isolated.
    # NOTE: this materialises a dense num_nodes x num_nodes matrix.
    adj = sp.coo_matrix(
        (np.ones(len(u)), (u.numpy(), v.numpy())),
        shape=(num_nodes, num_nodes),
    )
    adj_neg = 1 - adj.todense() - np.eye(num_nodes)
    # Only strictly positive entries are true non-edges: self-loops and
    # duplicated edges yield negative values here and must not be sampled
    # as negatives.
    neg_u, neg_v = np.where(adj_neg > 0)

    neg_eids = np.random.choice(len(neg_u), num_edges)
    test_neg_u, test_neg_v = neg_u[neg_eids[:test_size]], neg_v[neg_eids[:test_size]]
    train_neg_u, train_neg_v = neg_u[neg_eids[test_size:]], neg_v[neg_eids[test_size:]]

    # The message-passing graph must not contain the held-out test edges.
    train_g = dgl.remove_edges(g, eids[:test_size])

    train_pos_g = dgl.graph((train_pos_u, train_pos_v), num_nodes=num_nodes)
    train_neg_g = dgl.graph((train_neg_u, train_neg_v), num_nodes=num_nodes)

    test_pos_g = dgl.graph((test_pos_u, test_pos_v), num_nodes=num_nodes)
    test_neg_g = dgl.graph((test_neg_u, test_neg_v), num_nodes=num_nodes)

    return train_g, train_pos_g, train_neg_g, test_pos_g, test_neg_g
| def split_train_valid_test(g): | |||
| u, v = g.edges() | |||
| @@ -160,7 +114,7 @@ if __name__ == "__main__": | |||
| torch.backends.cudnn.deterministic = True | |||
| torch.backends.cudnn.benchmark = False | |||
| graph = dataset[0].to(args.device) | |||
| graph = dataset[0] | |||
| num_features = graph.ndata['feat'].size(1) | |||
| model_hp, decoder_hp = get_encoder_decoder_hp(args.model) | |||
| @@ -185,22 +139,27 @@ if __name__ == "__main__": | |||
| restricted=False | |||
| ) | |||
| train_g, train_pos_g, train_neg_g, test_pos_g, test_neg_g = split_train_test(graph.cpu()) | |||
| gs = list(split_train_valid_test(graph)) | |||
| if args.model == 'gcn' or args.model == 'gat': | |||
| gs[0] = dgl.add_self_loop(gs[0]) | |||
| dataset_splitted = { | |||
| 'train': train_g.to(args.device), | |||
| 'train_pos': train_pos_g.to(args.device), | |||
| 'train_neg': train_neg_g.to(args.device), | |||
| 'test_pos': test_pos_g.to(args.device), | |||
| 'test_neg': test_neg_g.to(args.device), | |||
| 'train': gs[0].to(args.device), | |||
| 'train_pos': gs[1].to(args.device), | |||
| 'train_neg': gs[2].to(args.device), | |||
| 'val_pos': gs[3].to(args.device), | |||
| 'val_neg': gs[4].to(args.device), | |||
| 'test_pos': gs[5].to(args.device), | |||
| 'test_neg': gs[6].to(args.device), | |||
| } | |||
| trainer.train([dataset_splitted], False) | |||
| trainer.train([dataset_splitted], True) | |||
| pre = trainer.evaluate([dataset_splitted], mask="test", feval=Auc) | |||
| result = pre.item() | |||
| result = pre | |||
| res.append(result) | |||
| print(np.mean(res), np.std(res)) | |||
| print("{:.2f} ~ {:.2f}".format(np.mean(res) * 100, np.std(res) * 100)) | |||
| """ | |||
| @@ -0,0 +1,124 @@ | |||
| from tqdm import tqdm | |||
| from autogl.datasets import build_dataset_from_name | |||
| from autogl.module.train.evaluation import Auc | |||
| import random | |||
| import torch | |||
| import numpy as np | |||
| import dgl | |||
| import torch | |||
| import numpy as np | |||
| from autogl.datasets.utils.conversion import to_dgl_dataset | |||
| from helper import get_encoder_decoder_hp | |||
| from autogl.datasets.utils import split_edges | |||
| from autogl.module.train.link_prediction_full import LinkPredictionTrainer | |||
def setup_seed(seed):
    """Seed the Python, NumPy and PyTorch random generators with ``seed``.

    PyTorch is seeded through both ``torch.manual_seed`` and
    ``torch.cuda.manual_seed_all``; cuDNN is additionally switched to
    deterministic mode.
    """
    random.seed(seed)
    np.random.seed(seed)
    for seed_fn in (torch.manual_seed, torch.cuda.manual_seed_all):
        seed_fn(seed)
    torch.backends.cudnn.deterministic = True
# Script entry point: for each seed in range(1234, 1234 + args.repeat) it
# builds a LinkPredictionTrainer with fixed settings, trains it on one edge
# split, evaluates AUC on the "test" mask, and finally prints mean ~ std of
# the collected values (both multiplied by 100).
# NOTE(review): leading indentation is missing in this copy of the file, so
# the exact nesting of the loop / `if` bodies below cannot be read off the
# text — confirm against the real source before restructuring.
| if __name__ == "__main__": | |||
| from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter | |||
| parser = ArgumentParser( | |||
| "auto link prediction", formatter_class=ArgumentDefaultsHelpFormatter | |||
| ) | |||
| parser.add_argument( | |||
| "--dataset", | |||
| default="Cora", | |||
| type=str, | |||
| help="dataset to use", | |||
| choices=[ | |||
| "Cora", | |||
| "CiteSeer", | |||
| "PubMed", | |||
| ], | |||
| ) | |||
| parser.add_argument( | |||
| "--model", | |||
| default="sage", | |||
| type=str, | |||
| help="model to use", | |||
| choices=[ | |||
| "gcn", | |||
| "gat", | |||
| "sage", | |||
| "gin", | |||
| "topk" | |||
| ], | |||
| ) | |||
# NOTE(review): args.seed is parsed but never read in this block; the loop
# below takes its seeds from range(1234, 1234 + args.repeat) instead.
| parser.add_argument("--seed", type=int, default=0, help="random seed") | |||
| parser.add_argument('--repeat', type=int, default=10) | |||
| parser.add_argument("--device", default="cuda", type=str, help="GPU device") | |||
| args = parser.parse_args() | |||
# The dataset name is lower-cased before lookup, then converted with
# to_dgl_dataset.
| dataset = build_dataset_from_name(args.dataset.lower()) | |||
| dataset = to_dgl_dataset(dataset) | |||
# One AUC value is appended to `res` per seed.
| res = [] | |||
| for seed in tqdm(range(1234, 1234+args.repeat)): | |||
# Seeding is done inline here; the module-level setup_seed helper is not
# called in this block.
| # set random seed | |||
| random.seed(seed) | |||
| np.random.seed(seed) | |||
| torch.manual_seed(seed) | |||
| if torch.cuda.is_available(): | |||
| torch.cuda.manual_seed(seed) | |||
| torch.backends.cudnn.deterministic = True | |||
| torch.backends.cudnn.benchmark = False | |||
# `graph` is only used to read the node-feature width; the split further
# down is computed from `dataset`, not from `graph`.
| graph = dataset[0].to(args.device) | |||
| num_features = graph.ndata['feat'].size(1) | |||
| model_hp, decoder_hp = get_encoder_decoder_hp(args.model) | |||
# NOTE(review): the trainer is created with device="auto" while the split
# graphs below are moved to args.device — confirm both resolve to the same
# device.
| trainer = LinkPredictionTrainer( | |||
| model = args.model, | |||
| num_features = num_features, | |||
| lr = 1e-2, | |||
| max_epoch = 100, | |||
# early_stopping_round is one larger than max_epoch — presumably so early
# stopping never triggers; TODO confirm against the trainer.
| early_stopping_round = 101, | |||
| weight_decay = 0.0, | |||
| device = "auto", | |||
| init = False, | |||
| feval = [Auc], | |||
| loss = "binary_cross_entropy_with_logits", | |||
| ).duplicate_from_hyper_parameter( | |||
| { | |||
| "trainer": {}, | |||
| "encoder": model_hp, | |||
| "decoder": decoder_hp | |||
| }, | |||
| restricted=False | |||
| ) | |||
# Only element [0] of split_edges' result is used, materialised as a list so
# that entry 0 can be replaced below.
# presumably 0.8 / 0.1 are train / validation ratios — verify against split_edges.
| gs = list(split_edges(dataset, 0.8, 0.1)[0]) | |||
# Self-loops are added to gs[0] for the gcn and gat models only.
| if args.model == 'gcn' or args.model == 'gat': | |||
| gs[0] = dgl.add_self_loop(gs[0]) | |||
# The seven split entries are mapped by position to named keys and moved to
# args.device.
| dataset_splitted = { | |||
| 'train': gs[0].to(args.device), | |||
| 'train_pos': gs[1].to(args.device), | |||
| 'train_neg': gs[2].to(args.device), | |||
| 'val_pos': gs[3].to(args.device), | |||
| 'val_neg': gs[4].to(args.device), | |||
| 'test_pos': gs[5].to(args.device), | |||
| 'test_neg': gs[6].to(args.device), | |||
| } | |||
# NOTE(review): the meaning of the second positional argument (False) is not
# visible in this file — confirm against LinkPredictionTrainer.train.
| trainer.train([dataset_splitted], False) | |||
| pre = trainer.evaluate([dataset_splitted], mask="test", feval=Auc) | |||
# NOTE(review): .item() assumes evaluate() returns an object that provides
# it (e.g. a tensor or NumPy scalar) — verify the actual return type.
| result = pre.item() | |||
| res.append(result) | |||
# Prints mean and standard deviation of the collected AUCs, scaled by 100.
| print("{:.2f} ~ {:.2f}".format(np.mean(res) * 100, np.std(res) * 100)) | |||
| """ | |||
| AUC 0.8151564430268863 | |||
| """ | |||