From b3c8a440b07815f4e2cce5ab47b5d1efa990b850 Mon Sep 17 00:00:00 2001
From: lihy96
Date: Tue, 14 Dec 2021 17:39:04 +0800
Subject: [PATCH] Fix pyg/dgl dispatch in the link prediction trainer; add DGL
 link prediction test scripts

---
 autogl/module/train/link_prediction_full.py | 126 +++++----
 test/link_prediction_base.py                | 202 +++++++++++++
 test/link_prediction_model.py               | 181 +++++++++++++
 test/link_prediction_solver.py              | 261 +++++++++++++++++++
 test/link_prediction_trainer.py             | 153 ++++++-----
 test/link_prediction_trainer_dataset.py     | 271 ++++++++++++++++++++
 6 files changed, 1086 insertions(+), 108 deletions(-)
 create mode 100644 test/link_prediction_base.py
 create mode 100644 test/link_prediction_model.py
 create mode 100644 test/link_prediction_solver.py
 create mode 100644 test/link_prediction_trainer_dataset.py

diff --git a/autogl/module/train/link_prediction_full.py b/autogl/module/train/link_prediction_full.py
index 655d8ff..671b865 100644
--- a/autogl/module/train/link_prediction_full.py
+++ b/autogl/module/train/link_prediction_full.py
@@ -169,7 +169,7 @@ class LinkPredictionTrainer(BaseLinkPredictionTrainer):
         # Get task name, i.e., `LinkPrediction`.
         return "LinkPrediction"
 
-    def train_only(self, data, train_mask=None):
+    def train_only_pyg(self, data, train_mask=None):
         """
         The function of training on the given dataset and mask.
 
         Parameters
         ----------
@@ -241,8 +241,7 @@ class LinkPredictionTrainer(BaseLinkPredictionTrainer):
         The function of training on the given dataset and mask.
 
         Parameters
         ----------
-        pos_data: positive links
-        neg_data: negative links
+        dataset: a dict of DGL graphs holding the ``train``, ``train_pos``, and ``train_neg`` graphs
 
         Returns
         -------
         self: ``autogl.train.LinkPredictionTrainer``
@@ -308,7 +307,7 @@ class LinkPredictionTrainer(BaseLinkPredictionTrainer):
 
         self.early_stopping.load_checkpoint(self.model.model)
 
-    def predict_only(self, data, test_mask=None):
+    def predict_only_pyg(self, data, test_mask=None):
         """
         The function of predicting on the given dataset and mask.
 
@@ -342,9 +341,7 @@ class LinkPredictionTrainer(BaseLinkPredictionTrainer):
 
         Parameters
         ----------
-        data: The link prediction dataset used to be predicted.
-        train_mask: The mask used in training stage.
-
+        dataset: The link prediction dataset to predict on.
 
         Returns
         -------
         res: The result of predicting on the given dataset.
@@ -377,11 +374,11 @@ class LinkPredictionTrainer(BaseLinkPredictionTrainer):
         if self.pyg_dgl == 'pyg':
             data = dataset[0]
             data.edge_index = data.train_pos_edge_index
-            self.train_only(data)
+            self.train_only_pyg(data)
             if keep_valid_result:
-                self.valid_result = self.predict_only(data)
-                self.valid_result_prob = self.predict_proba(dataset, "val")
-                self.valid_score = self.evaluate(dataset, mask="val", feval=self.feval)
+                self.valid_result = self.predict_only_pyg(data)
+                self.valid_result_prob = self.predict_proba_pyg(dataset, "val")
+                self.valid_score = self.evaluate_pyg(dataset, mask="val", feval=self.feval)
         elif self.pyg_dgl == 'dgl':
             self.train_only_dgl(dataset)
             if keep_valid_result:
@@ -405,11 +402,17 @@ class LinkPredictionTrainer(BaseLinkPredictionTrainer):
         The prediction result of ``predict_proba``.
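+
+        Examples
+        --------
+        A minimal sketch (assuming a PyG-style dataset whose first graph
+        carries ``test_pos_edge_index`` / ``test_neg_edge_index``):
+
+        >>> probs = trainer.predict(dataset, mask="test")  # edge probabilities in [0, 1]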
""" if self.pyg_dgl == 'pyg': - return self.predict_proba(dataset, mask=mask, in_log_format=False) + return self.predict_proba_pyg(dataset, mask=mask, in_log_format=False) elif self.pyg_dgl == 'dgl': return self.predict_proba_dgl(dataset, mask=mask, in_log_format=False) def predict_proba(self, dataset, mask=None, in_log_format=False): + if self.pyg_dgl == 'pyg': + return self.predict_proba_pyg(dataset, mask, in_log_format) + elif self.pyg_dgl == 'dgl': + return self.predict_proba_dgl(dataset, mask, in_log_format) + + def predict_proba_pyg(self, dataset, mask=None, in_log_format=False): """ The function of predicting the probability on the given dataset. @@ -443,14 +446,30 @@ class LinkPredictionTrainer(BaseLinkPredictionTrainer): self.model.model.eval() with torch.no_grad(): - z = self.predict_only(data) + z = self.predict_only_pyg(data) link_logits = self.model.model.lp_decode(z, pos_edge_index, neg_edge_index) link_probs = link_logits.sigmoid() return link_probs def predict_proba_dgl(self, dataset, mask=None, in_log_format=False): + """ + The function of predicting the probability on the given dataset. + + Parameters + ---------- + dataset: The link prediction dataset used to be predicted. + + mask: ``train``, ``val``, or ``test``. + The dataset mask. + in_log_format: ``bool``. + If True(False), the probability will (not) be log format. + + Returns + ------- + The prediction result. + """ train_graph = dataset['train'] try: try: @@ -547,43 +566,64 @@ class LinkPredictionTrainer(BaseLinkPredictionTrainer): """ if self.pyg_dgl == 'pyg': - data = dataset[0] - data = data.to(self.device) - test_mask = mask - if feval is None: - feval = self.feval - else: - feval = get_feval(feval) + return self.evaluate_pyg(self, dataset, mask, feval) + elif self.pyg_dgl == 'dgl': + return self.evaluate_dgl(dataset,mask,feval) - if mask in ["train", "val", "test"]: - pos_edge_index = data[f"{mask}_pos_edge_index"] - neg_edge_index = data[f"{mask}_neg_edge_index"] - else: - pos_edge_index = data[f"test_pos_edge_index"] - neg_edge_index = data[f"test_neg_edge_index"] + def evaluate_pyg(self, dataset, mask=None, feval=None): + data = dataset[0] + data = data.to(self.device) + test_mask = mask + if feval is None: + feval = self.feval + else: + feval = get_feval(feval) - self.model.model.eval() - with torch.no_grad(): - link_probs = self.predict_proba(dataset, mask) - link_labels = self.get_link_labels(pos_edge_index, neg_edge_index) + if mask in ["train", "val", "test"]: + pos_edge_index = data[f"{mask}_pos_edge_index"] + neg_edge_index = data[f"{mask}_neg_edge_index"] + else: + pos_edge_index = data[f"test_pos_edge_index"] + neg_edge_index = data[f"test_neg_edge_index"] - if not isinstance(feval, list): - feval = [feval] - return_signle = True - else: - return_signle = False + self.model.model.eval() + with torch.no_grad(): + link_probs = self.predict_proba_pyg(dataset, mask) + link_labels = self.get_link_labels(pos_edge_index, neg_edge_index) + + if not isinstance(feval, list): + feval = [feval] + return_signle = True + else: + return_signle = False + + res = [] + for f in feval: + res.append(f.evaluate(link_probs.cpu().numpy(), link_labels.cpu().numpy())) + if return_signle: + return res[0] + return res - res = [] - for f in feval: - res.append(f.evaluate(link_probs.cpu().numpy(), link_labels.cpu().numpy())) - if return_signle: - return res[0] - return res - elif self.pyg_dgl == 'dgl': - return self.evaluate_dgl(dataset,mask,feval) def evaluate_dgl(self, dataset, mask=None, feval=None): + """ + The function 
 
     def evaluate_dgl(self, dataset, mask=None, feval=None):
+        """
+        The function of evaluating on the given dataset.
+
+        Parameters
+        ----------
+        dataset: The link prediction dataset to evaluate on.
+        mask: ``train``, ``val``, or ``test``.
+            The dataset mask.
+        feval: ``str``.
+            The evaluation method used in this function.
+
+        Returns
+        -------
+        res: The evaluation result on the given dataset.
+        """
         if feval is None:
             feval = self.feval
         else:
diff --git a/test/link_prediction_base.py b/test/link_prediction_base.py
new file mode 100644
index 0000000..944947b
--- /dev/null
+++ b/test/link_prediction_base.py
@@ -0,0 +1,202 @@
+import dgl
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import itertools
+import numpy as np
+import scipy.sparse as sp
+import dgl.function as fn
+import random
+from dgl.data import CoraGraphDataset, PubmedGraphDataset, CiteseerGraphDataset
+# from autogl.module.train.link_prediction_full import LinkPredictionTrainer
+
+import dgl.data
+from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
+from tqdm import tqdm
+
+from dgl.nn import SAGEConv
+from dgl.nn.pytorch.conv import GraphConv
+from dgl.nn import GATConv
+
+from sklearn.metrics import roc_auc_score
+
+parser = ArgumentParser(
+    "auto link prediction", formatter_class=ArgumentDefaultsHelpFormatter
+)
+parser.add_argument("--dataset", default="Cora", type=str, help="dataset to use", choices=["Cora", "CiteSeer", "PubMed"],)
+parser.add_argument("--model", default="sage", type=str, help="model to use", choices=["gcn", "gat", "sage"],)
+parser.add_argument("--seed", type=int, default=0, help="random seed")
+parser.add_argument('--repeat', type=int, default=10)
+parser.add_argument("--device", default=0, type=int, help="GPU device")
+args = parser.parse_args()
+
+args.device = torch.device('cuda:0')
+device = torch.device('cuda:0')
+
+if args.dataset == 'Cora':
+    dataset = CoraGraphDataset()
+elif args.dataset == 'CiteSeer':
+    dataset = CiteseerGraphDataset()
+elif args.dataset == 'PubMed':
+    dataset = PubmedGraphDataset()
+else:
+    assert False
+
+def setup_seed(seed):
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
+    torch.backends.cudnn.deterministic = True
+    np.random.seed(seed)
+    random.seed(seed)
+
+class GraphSAGE(nn.Module):
+    def __init__(self, in_feats, h_feats):
+        super(GraphSAGE, self).__init__()
+        self.conv1 = SAGEConv(in_feats, h_feats, 'mean')
+        self.conv2 = SAGEConv(h_feats, h_feats, 'mean')
+
+    def forward(self, data):
+        g = data
+        in_feat = data.ndata['feat']
+        h = self.conv1(g, in_feat)
+        h = F.relu(h)
+        h = self.conv2(g, h)
+        return h
+
+class GCN(nn.Module):
+    def __init__(self, in_feats, h_feats):
+        super(GCN, self).__init__()
+        self.conv1 = GraphConv(in_feats, h_feats)
+        self.conv2 = GraphConv(h_feats, h_feats)
+
+    def forward(self, data):
+        g = data
+        in_feat = data.ndata['feat']
+        h = self.conv1(g, in_feat)
+        h = F.relu(h)
+        h = self.conv2(g, h)
+        return h
+
+class GAT(nn.Module):
+    def __init__(self, in_feats, h_feats):
+        super(GAT, self).__init__()
+        self.conv1 = GATConv(in_feats, h_feats // 4, 4)
+        self.conv2 = GATConv(h_feats, h_feats // 4, 4)
+
+    def forward(self, data):
+        g = data
+        in_feat = data.ndata['feat']
+        h = self.conv1(g, in_feat).flatten(1)
+        h = F.relu(h)
+        h = self.conv2(g, h).mean(1)
+        return h
+
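+# The split below follows the standard DGL link-prediction recipe: hold out
+# 10% of edges as test positives and sample an equal number of non-edges as
+# negatives. A usage sketch (shapes hold for any of the datasets above):
+#
+#     train_g, train_pos_g, train_neg_g, test_pos_g, test_neg_g = split_train_test(g)
+#     assert test_pos_g.number_of_edges() == int(g.number_of_edges() * 0.1)
+#     assert train_pos_g.number_of_edges() == g.number_of_edges() - test_pos_g.number_of_edges()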
+def split_train_test(g):
+    u, v = g.edges()
+
+    eids = np.arange(g.number_of_edges())
+    eids = np.random.permutation(eids)
+    test_size = int(len(eids) * 0.1)
+    train_size = g.number_of_edges() - test_size
+    test_pos_u, test_pos_v = u[eids[:test_size]], v[eids[:test_size]]
+    train_pos_u, train_pos_v = u[eids[test_size:]], v[eids[test_size:]]
+
+    # Find all negative edges and split them for training and testing
+    adj = sp.coo_matrix((np.ones(len(u)), (u.numpy(), v.numpy())))
+    adj_neg = 1 - adj.todense() - np.eye(g.number_of_nodes())
+    neg_u, neg_v = np.where(adj_neg != 0)
+
+    neg_eids = np.random.choice(len(neg_u), g.number_of_edges())
+    test_neg_u, test_neg_v = neg_u[neg_eids[:test_size]], neg_v[neg_eids[:test_size]]
+    # Slice from test_size so training gets all remaining negatives
+    # (slicing from train_size would yield only test_size of them).
+    train_neg_u, train_neg_v = neg_u[neg_eids[test_size:]], neg_v[neg_eids[test_size:]]
+
+    train_g = dgl.remove_edges(g, eids[:test_size])
+
+    train_pos_g = dgl.graph((train_pos_u, train_pos_v), num_nodes=g.number_of_nodes())
+    train_neg_g = dgl.graph((train_neg_u, train_neg_v), num_nodes=g.number_of_nodes())
+
+    test_pos_g = dgl.graph((test_pos_u, test_pos_v), num_nodes=g.number_of_nodes())
+    test_neg_g = dgl.graph((test_neg_u, test_neg_v), num_nodes=g.number_of_nodes())
+
+    return train_g, train_pos_g, train_neg_g, test_pos_g, test_neg_g
+
+def get_link_labels(pos_edge_index, neg_edge_index):
+    E = pos_edge_index.size(1) + neg_edge_index.size(1)
+    link_labels = torch.zeros(E, dtype=torch.float, device=device)
+    link_labels[: pos_edge_index.size(1)] = 1.0
+    return link_labels
+
+def lp_decode(z, pos_edge_index, neg_edge_index):
+    edge_index = torch.cat([pos_edge_index, neg_edge_index], dim=-1)
+    logits = (z[edge_index[0]] * z[edge_index[1]]).sum(dim=-1)
+    return logits
+
+
+res = []
+for seed in tqdm(range(1234, 1234 + args.repeat)):
+    setup_seed(seed)
+    g = dataset[0].to(device)
+    train_g, train_pos_g, train_neg_g, test_pos_g, test_neg_g = split_train_test(g.cpu())
+    train_g, train_pos_g, train_neg_g, test_pos_g, test_neg_g = train_g.to(device), train_pos_g.to(device), train_neg_g.to(device), test_pos_g.to(device), test_neg_g.to(device)
+
+    if args.model == 'gcn' or args.model == 'gat':
+        train_g = dgl.add_self_loop(train_g)
+
+    if args.model == 'gcn':
+        model = GCN(train_g.ndata['feat'].shape[1], 16).to(device)
+    elif args.model == 'gat':
+        model = GAT(train_g.ndata['feat'].shape[1], 16).to(device)
+    elif args.model == 'sage':
+        model = GraphSAGE(train_g.ndata['feat'].shape[1], 16).to(device)
+    else:
+        assert False
+
+    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
+
+    for epoch in range(100):
+        model.train()
+        optimizer.zero_grad()
+
+        z = model(train_g)
+        link_logits = lp_decode(
+            z, torch.stack(train_pos_g.edges()), torch.stack(train_neg_g.edges())
+        )
+        link_labels = get_link_labels(
+            torch.stack(train_pos_g.edges()), torch.stack(train_neg_g.edges())
+        )
+        loss = F.binary_cross_entropy_with_logits(link_logits, link_labels)
+        loss.backward()
+        optimizer.step()
+
+    model.eval()
+    with torch.no_grad():
+        z = model(train_g)
+        link_logits = lp_decode(
+            z, torch.stack(test_pos_g.edges()), torch.stack(test_neg_g.edges())
+        )
+        link_probs = link_logits.sigmoid()
+        link_labels = get_link_labels(
+            torch.stack(test_pos_g.edges()), torch.stack(test_neg_g.edges())
+        )
+
+    result = roc_auc_score(link_labels.cpu().numpy(), link_probs.cpu().numpy())
+    res.append(result)
+
+print(np.mean(res), np.std(res))
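+
+# Quick sanity check for lp_decode (a sketch): with 2-d embeddings
+# z = [[1., 0.], [1., 1.]] and a single positive edge (0, 1), the logit is
+# z[0] . z[1] = 1.0, so the predicted probability is sigmoid(1.0) ~= 0.731.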

diff --git a/test/link_prediction_model.py b/test/link_prediction_model.py
new file mode 100644
index 0000000..3372181
--- /dev/null
+++ b/test/link_prediction_model.py
@@ -0,0 +1,181 @@
+import dgl
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import itertools
+import numpy as np
+import scipy.sparse as sp
+import dgl.function as fn
+import random
+from dgl.data import CoraGraphDataset, PubmedGraphDataset, CiteseerGraphDataset
+# from autogl.module.train.link_prediction_full import LinkPredictionTrainer
+
+import sys
+sys.path.insert(0, "../")
+from autogl.module.model.dgl.graphsage import GraphSAGE
+import dgl.data
+from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
+from tqdm import tqdm
+from dgl.nn import SAGEConv
+
+from sklearn.metrics import roc_auc_score
+
+parser = ArgumentParser(
+    "auto link prediction", formatter_class=ArgumentDefaultsHelpFormatter
+)
+parser.add_argument("--dataset", default="Cora", type=str, help="dataset to use", choices=["Cora", "CiteSeer", "PubMed"],)
+parser.add_argument("--model", default="sage", type=str, help="model to use", choices=["gcn", "gat", "sage"],)
+parser.add_argument("--seed", type=int, default=0, help="random seed")
+parser.add_argument('--repeat', type=int, default=10)
+parser.add_argument("--device", default=0, type=int, help="GPU device")
+args = parser.parse_args()
+
+args.device = torch.device('cuda:0')
+device = torch.device('cuda:0')
+
+if args.dataset == 'Cora':
+    dataset = CoraGraphDataset()
+elif args.dataset == 'CiteSeer':
+    dataset = CiteseerGraphDataset()
+elif args.dataset == 'PubMed':
+    dataset = PubmedGraphDataset()
+else:
+    assert False
+
+def setup_seed(seed):
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
+    torch.backends.cudnn.deterministic = True
+    np.random.seed(seed)
+    random.seed(seed)
+
+
+def split_train_test(g):
+    u, v = g.edges()
+
+    eids = np.arange(g.number_of_edges())
+    eids = np.random.permutation(eids)
+    test_size = int(len(eids) * 0.1)
+    train_size = g.number_of_edges() - test_size
+    test_pos_u, test_pos_v = u[eids[:test_size]], v[eids[:test_size]]
+    train_pos_u, train_pos_v = u[eids[test_size:]], v[eids[test_size:]]
+
+    # Find all negative edges and split them for training and testing
+    adj = sp.coo_matrix((np.ones(len(u)), (u.numpy(), v.numpy())))
+    adj_neg = 1 - adj.todense() - np.eye(g.number_of_nodes())
+    neg_u, neg_v = np.where(adj_neg != 0)
+
+    neg_eids = np.random.choice(len(neg_u), g.number_of_edges())
+    test_neg_u, test_neg_v = neg_u[neg_eids[:test_size]], neg_v[neg_eids[:test_size]]
+    # Slice from test_size (not train_size) so training gets all remaining negatives.
+    train_neg_u, train_neg_v = neg_u[neg_eids[test_size:]], neg_v[neg_eids[test_size:]]
+
+    train_g = dgl.remove_edges(g, eids[:test_size])
+
+    train_pos_g = dgl.graph((train_pos_u, train_pos_v), num_nodes=g.number_of_nodes())
+    train_neg_g = dgl.graph((train_neg_u, train_neg_v), num_nodes=g.number_of_nodes())
+
+    test_pos_g = dgl.graph((test_pos_u, test_pos_v), num_nodes=g.number_of_nodes())
+    test_neg_g = dgl.graph((test_neg_u, test_neg_v), num_nodes=g.number_of_nodes())
+
+    return train_g, train_pos_g, train_neg_g, test_pos_g, test_neg_g
+
+class DotPredictor(nn.Module):
+    def forward(self, g, h):
+        with g.local_scope():
+            g.ndata['h'] = h
+            # Compute a new edge feature named 'score' by a dot-product between the
+            # source node feature 'h' and destination node feature 'h'.
+            g.apply_edges(fn.u_dot_v('h', 'h', 'score'))
+            # u_dot_v returns a 1-element vector for each edge, so squeeze it.
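+            # (This scores an edge (u, v) by the inner product z_u . z_v,
+            # i.e. the same decoder that model.lp_decode applies in the loop below.)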
+ return g.edata['score'][:, 0] + + +def compute_loss(pos_score, neg_score): + scores = torch.cat([pos_score, neg_score]) + labels = torch.cat([torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])]) + return F.binary_cross_entropy_with_logits(scores.cpu(), labels) + + +def compute_auc(pos_score, neg_score): + scores = torch.cat([pos_score, neg_score]).numpy() + labels = torch.cat( + [torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])]).numpy() + return roc_auc_score(labels, scores) + +def get_link_labels(pos_edge_index, neg_edge_index): + E = pos_edge_index.size(1) + neg_edge_index.size(1) + link_labels = torch.zeros(E, dtype=torch.float, device=device) + link_labels[: pos_edge_index.size(1)] = 1.0 + return link_labels + + +res = [] +for seed in tqdm(range(1234, 1234+args.repeat)): + setup_seed(seed) + g = dataset[0].to(device) + train_g, train_pos_g, train_neg_g, test_pos_g, test_neg_g = split_train_test(g.cpu()) + train_g, train_pos_g, train_neg_g, test_pos_g, test_neg_g = train_g.to(device), train_pos_g.to(device), train_neg_g.to(device), test_pos_g.to(device), test_neg_g.to(device) + + if args.model == 'gcn': + pass + elif args.model == 'gat': + pass + elif args.model == 'sage': + para = { + 'features_num': train_g.ndata['feat'].shape[1], + 'num_class': 2, + 'num_layers': 3, + 'hidden': [16, 16], + 'dropout': 0.0, + 'act': 'relu', + 'agg': 'mean', + } + model = GraphSAGE(para).to(device) + else: + assert False + + optimizer = torch.optim.Adam(model.parameters(), lr=0.01) + + all_logits = [] + for epoch in range(100): + model.train() + optimizer.zero_grad() + + z = model.lp_encode(train_g) + link_logits = model.lp_decode( + z, torch.stack(train_pos_g.edges()), torch.stack(train_neg_g.edges()) + ) + link_labels = get_link_labels( + torch.stack(train_pos_g.edges()), torch.stack(train_neg_g.edges()) + ) + loss = F.binary_cross_entropy_with_logits(link_logits, link_labels) + loss.backward() + optimizer.step() + + model.eval() + with torch.no_grad(): + z = model.lp_encode(train_g) + link_logits = model.lp_decode( + z, torch.stack(test_pos_g.edges()), torch.stack(test_neg_g.edges()) + ) + link_probs = link_logits.sigmoid() + link_labels = get_link_labels( + torch.stack(test_pos_g.edges()), torch.stack(test_neg_g.edges()) + ) + + result = roc_auc_score(link_labels.cpu().numpy(), link_probs.cpu().numpy()) + res.append(result) + +print(np.mean(res), np.std(res)) + + + + + + + + + + + + diff --git a/test/link_prediction_solver.py b/test/link_prediction_solver.py new file mode 100644 index 0000000..2af8c4a --- /dev/null +++ b/test/link_prediction_solver.py @@ -0,0 +1,261 @@ +import sys + +sys.path.insert(0, "../") +from tqdm import tqdm + +# import autogl.module.train +# import torch_geometric +# exit(0) +# +from autogl.datasets import build_dataset_from_name +from autogl.solver.classifier.link_predictor import AutoLinkPredictor +from autogl.module.train.evaluation import Auc +import yaml +import random +import torch +import numpy as np +import dgl +import torch +import torch.nn as nn +import torch.nn.functional as F +import itertools +import numpy as np +import scipy.sparse as sp +from autogl.module.model.dgl import AutoSAGE, AutoGAT, AutoGCN + + +def construct_negative_graph(graph, k): + src, dst = graph.edges() + + neg_src = src.repeat_interleave(k) + neg_dst = torch.randint(0, graph.num_nodes(), (len(src) * k,)) + # return dgl.graph((neg_src, neg_dst), num_nodes=graph.num_nodes()).edges() + return neg_src, neg_dst + +def negative_sample(data): + return 
construct_negative_graph(data, 5)
+
+import autogl.datasets.utils as tmp_utils
+tmp_utils.negative_sampling = negative_sample
+
+from dgl.data import CoraGraphDataset, PubmedGraphDataset, CiteseerGraphDataset
+from autogl.module.train.link_prediction_full import LinkPredictionTrainer
+
+def setup_seed(seed):
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
+    torch.backends.cudnn.deterministic = True
+    np.random.seed(seed)
+    random.seed(seed)
+
+def fixed(**kwargs):
+    return [{
+        'parameterName': k,
+        "type": "FIXED",
+        "value": v
+    } for k, v in kwargs.items()]
+
+def split_train_test(g):
+    u, v = g.edges()
+
+    eids = np.arange(g.number_of_edges())
+    eids = np.random.permutation(eids)
+    test_size = int(len(eids) * 0.1)
+    train_size = g.number_of_edges() - test_size
+    test_pos_u, test_pos_v = u[eids[:test_size]], v[eids[:test_size]]
+    train_pos_u, train_pos_v = u[eids[test_size:]], v[eids[test_size:]]
+
+    # Find all negative edges and split them for training and testing
+    adj = sp.coo_matrix((np.ones(len(u)), (u.numpy(), v.numpy())))
+    adj_neg = 1 - adj.todense() - np.eye(g.number_of_nodes())
+    neg_u, neg_v = np.where(adj_neg != 0)
+
+    neg_eids = np.random.choice(len(neg_u), g.number_of_edges())
+    test_neg_u, test_neg_v = neg_u[neg_eids[:test_size]], neg_v[neg_eids[:test_size]]
+    # Slice from test_size so training gets all remaining negatives.
+    train_neg_u, train_neg_v = neg_u[neg_eids[test_size:]], neg_v[neg_eids[test_size:]]
+
+    train_g = dgl.remove_edges(g, eids[:test_size])
+
+    train_pos_g = dgl.graph((train_pos_u, train_pos_v), num_nodes=g.number_of_nodes())
+    train_neg_g = dgl.graph((train_neg_u, train_neg_v), num_nodes=g.number_of_nodes())
+
+    test_pos_g = dgl.graph((test_pos_u, test_pos_v), num_nodes=g.number_of_nodes())
+    test_neg_g = dgl.graph((test_neg_u, test_neg_v), num_nodes=g.number_of_nodes())
+
+    return train_g, train_pos_g, train_neg_g, test_pos_g, test_neg_g
+
+def split_train_valid_test(g):
+    u, v = g.edges()
+
+    eids = np.arange(g.number_of_edges())
+    eids = np.random.permutation(eids)
+
+    valid_size = int(len(eids) * 0.1)
+    test_size = int(len(eids) * 0.1)
+    train_size = g.number_of_edges() - test_size - valid_size
+
+    test_pos_u, test_pos_v = u[eids[:test_size]], v[eids[:test_size]]
+    valid_pos_u, valid_pos_v = u[eids[test_size:test_size+valid_size]], v[eids[test_size:test_size+valid_size]]
+    train_pos_u, train_pos_v = u[eids[test_size+valid_size:]], v[eids[test_size+valid_size:]]
+
+    # Find all negative edges and split them for training, validation and testing
+    adj = sp.coo_matrix((np.ones(len(u)), (u.numpy(), v.numpy())))
+    adj_neg = 1 - adj.todense() - np.eye(g.number_of_nodes())
+    neg_u, neg_v = np.where(adj_neg != 0)
+
+    neg_eids = np.random.choice(len(neg_u), g.number_of_edges())
+    test_neg_u, test_neg_v = neg_u[neg_eids[:test_size]], neg_v[neg_eids[:test_size]]
+    valid_neg_u, valid_neg_v = neg_u[neg_eids[test_size:test_size+valid_size]], neg_v[neg_eids[test_size:test_size+valid_size]]
+    # Slice from test_size+valid_size so training gets all remaining negatives.
+    train_neg_u, train_neg_v = neg_u[neg_eids[test_size+valid_size:]], neg_v[neg_eids[test_size+valid_size:]]
+
+    train_g = dgl.remove_edges(g, eids[:test_size+valid_size])
+
+    train_pos_g = dgl.graph((train_pos_u, train_pos_v), num_nodes=g.number_of_nodes())
+    train_neg_g = dgl.graph((train_neg_u, train_neg_v), num_nodes=g.number_of_nodes())
+
+    valid_pos_g = dgl.graph((valid_pos_u, valid_pos_v), num_nodes=g.number_of_nodes())
+    valid_neg_g = dgl.graph((valid_neg_u, valid_neg_v), num_nodes=g.number_of_nodes())
+
+    test_pos_g = dgl.graph((test_pos_u, test_pos_v), num_nodes=g.number_of_nodes())
+    test_neg_g = dgl.graph((test_neg_u, test_neg_v),
num_nodes=g.number_of_nodes()) + + return train_g, train_pos_g, train_neg_g, valid_pos_g, valid_neg_g, test_pos_g, test_neg_g + +if __name__ == "__main__": + + + from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter + + parser = ArgumentParser( + "auto link prediction", formatter_class=ArgumentDefaultsHelpFormatter + ) + parser.add_argument( + "--dataset", + default="Cora", + type=str, + help="dataset to use", + choices=[ + "Cora", + "CiteSeer", + "PubMed", + ], + ) + parser.add_argument( + "--model", + default="sage", + type=str, + help="model to use", + choices=[ + "gcn", + "gat", + "sage", + ], + ) + parser.add_argument("--seed", type=int, default=0, help="random seed") + parser.add_argument('--repeat', type=int, default=10) + parser.add_argument("--device", default=0, type=int, help="GPU device") + + args = parser.parse_args() + + args.device = torch.device('cuda:0') + device = torch.device('cuda:0') + + if torch.cuda.is_available(): + torch.cuda.set_device(args.device) + + if args.dataset == 'Cora': + dataset = CoraGraphDataset() + elif args.dataset == 'CiteSeer': + dataset = CiteseerGraphDataset() + elif args.dataset == 'PubMed': + dataset = PubmedGraphDataset() + else: + assert False + + res = [] + for seed in tqdm(range(1234, 1234+args.repeat)): + # set random seed + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed(seed) + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + + graph = dataset[0].to(args.device) + num_features = graph.ndata['feat'].size(1) + + if args.model == 'gcn': + model = AutoGCN + elif args.model == 'gat': + model = AutoGAT + elif args.model == 'sage': + automodel = AutoSAGE( + num_features=num_features, + num_classes=2, + device=args.device + ) + automodel.hyperparams = { + "num_layers": 3, + "hidden": [16, 16], + "dropout": 0.0, + "act": "relu", + "agg": "mean", + } + model_hp = { + "num_layers": 3, + "hidden": [16, 16], + "dropout": 0.0, + "act": "relu", + "agg": "mean", + } + else: + assert False + + automodel.initialize() + + + autoClassifier = AutoLinkPredictor( + feature_module=None, + graph_models='sage', + ensemble_module=None, + max_evals=1, + hpo_module='random', + trainer_hp_space=fixed(**{ + "max_epoch": 100, + "early_stopping_round": 100 + 1, + "lr":0.01, + "weight_decay": None, + }), + model_hp_spaces=[fixed(**model_hp)] + ) + autoClassifier.fit( + dataset, + time_limit=3600, + evaluation_method=[Auc], + seed=seed, + train_split=0.85, + val_split=0.05, + ) + autoClassifier.get_leaderboard().show() + + # test + predict_result = autoClassifier.predict_proba() + + pos_edge_index, neg_edge_index = ( + dataset[0].test_pos_edge_index, + dataset[0].test_neg_edge_index, + ) + E = pos_edge_index.size(1) + neg_edge_index.size(1) + link_labels = torch.zeros(E) + link_labels[: pos_edge_index.size(1)] = 1.0 + + print( + "test auc: %.4f" + % (Auc.evaluate(predict_result, link_labels.detach().cpu().numpy())) + ) + +""" +AUC 0.8151564430268863 +""" diff --git a/test/link_prediction_trainer.py b/test/link_prediction_trainer.py index 79ded00..e2175bb 100644 --- a/test/link_prediction_trainer.py +++ b/test/link_prediction_trainer.py @@ -1,6 +1,7 @@ import sys sys.path.insert(0, "../") +from tqdm import tqdm # import autogl.module.train # import torch_geometric @@ -20,7 +21,7 @@ import torch.nn.functional as F import itertools import numpy as np import scipy.sparse as sp -from autogl.module.model.dgl import AutoSAGE +from autogl.module.model.dgl 
import AutoSAGE, AutoGAT, AutoGCN def construct_negative_graph(graph, k): @@ -117,7 +118,6 @@ def split_train_valid_test(g): if __name__ == "__main__": - setup_seed(1234) from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter @@ -135,21 +135,28 @@ if __name__ == "__main__": "PubMed", ], ) + parser.add_argument( + "--model", + default="sage", + type=str, + help="model to use", + choices=[ + "gcn", + "gat", + "sage", + ], + ) parser.add_argument("--seed", type=int, default=0, help="random seed") + parser.add_argument('--repeat', type=int, default=10) parser.add_argument("--device", default=0, type=int, help="GPU device") args = parser.parse_args() + + args.device = torch.device('cuda:0') + device = torch.device('cuda:0') + if torch.cuda.is_available(): torch.cuda.set_device(args.device) - seed = args.seed - # set random seed - random.seed(seed) - np.random.seed(seed) - torch.manual_seed(seed) - if torch.cuda.is_available(): - torch.cuda.manual_seed(seed) - torch.backends.cudnn.deterministic = True - torch.backends.cudnn.benchmark = False if args.dataset == 'Cora': dataset = CoraGraphDataset() @@ -157,60 +164,76 @@ if __name__ == "__main__": dataset = CiteseerGraphDataset() elif args.dataset == 'PubMed': dataset = PubmedGraphDataset() - - # configs = yaml.load(open(args.configs, "r").read(), Loader=yaml.FullLoader) - # configs["hpo"]["name"] = args.hpo - # configs["hpo"]["max_evals"] = args.max_eval - # autoClassifier = AutoLinkPredictor.from_config(configs) - - graph = dataset[0].to(args.device) - num_features = graph.ndata['feat'].size(1) - - autoSAGE = AutoSAGE( - num_features=num_features, - num_classes=2, - device=args.device - ) - autoSAGE.hyperparams = { - "num_layers": 3, - "hidden": [16, 16], - "dropout": 0.0, - "act": "relu", - "agg": "mean", - } - autoSAGE.initialize() - - trainer = LinkPredictionTrainer( - model = autoSAGE, - num_features = num_features, - optimizer = None, - lr = 1e-2, - max_epoch = 100, - early_stopping_round = 101, - weight_decay = 0.0, - device = "auto", - init = True, - feval = [Auc], - loss = "binary_cross_entropy_with_logits", - ) - - train_g, train_pos_g, train_neg_g, test_pos_g, test_neg_g = split_train_test(graph.cpu()) - - dataset = { - 'train': train_g.to(args.device), - 'train_pos': train_pos_g.to(args.device), - 'train_neg': train_neg_g.to(args.device), - 'test_pos': test_pos_g.to(args.device), - 'test_neg': test_neg_g.to(args.device), - } - - trainer.train(dataset, True) - pre = trainer.evaluate(dataset, mask="test", feval=Auc) - print(pre.item()) - res = trainer.predict(dataset) - print(res) - - exit(0) + else: + assert False + + res = [] + for seed in tqdm(range(1234, 1234+args.repeat)): + # set random seed + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed(seed) + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + + graph = dataset[0].to(args.device) + num_features = graph.ndata['feat'].size(1) + + if args.model == 'gcn': + model = AutoGCN + elif args.model == 'gat': + model = AutoGAT + elif args.model == 'sage': + automodel = AutoSAGE( + num_features=num_features, + num_classes=2, + device=args.device + ) + automodel.hyperparams = { + "num_layers": 3, + "hidden": [16, 16], + "dropout": 0.0, + "act": "relu", + "agg": "mean", + } + else: + assert False + + automodel.initialize() + + trainer = LinkPredictionTrainer( + model = automodel, + num_features = num_features, + optimizer = None, + lr = 1e-2, + max_epoch = 100, + 
early_stopping_round = 101,
+            weight_decay = 0.0,
+            device = "auto",
+            init = True,
+            feval = [Auc],
+            loss = "binary_cross_entropy_with_logits",
+        )
+
+        train_g, train_pos_g, train_neg_g, test_pos_g, test_neg_g = split_train_test(graph.cpu())
+
+        dataset_splitted = {
+            'train': train_g.to(args.device),
+            'train_pos': train_pos_g.to(args.device),
+            'train_neg': train_neg_g.to(args.device),
+            'test_pos': test_pos_g.to(args.device),
+            'test_neg': test_neg_g.to(args.device),
+        }
+
+        trainer.train(dataset_splitted, False)
+        pre = trainer.evaluate(dataset_splitted, mask="test", feval=Auc)
+        result = pre.item()
+        res.append(result)
+
+    print(np.mean(res), np.std(res))
+    exit(0)
 
     # train
     autoClassifier.fit(
diff --git a/test/link_prediction_trainer_dataset.py b/test/link_prediction_trainer_dataset.py
new file mode 100644
index 0000000..b8138bd
--- /dev/null
+++ b/test/link_prediction_trainer_dataset.py
@@ -0,0 +1,271 @@
+import sys
+
+sys.path.insert(0, "../")
+from tqdm import tqdm
+
+# import autogl.module.train
+# import torch_geometric
+# exit(0)
+#
+from autogl.datasets import build_dataset_from_name
+# from autogl.solver.classifier.link_predictor import AutoLinkPredictor
+from autogl.module.train.evaluation import Auc
+import yaml
+import random
+import torch
+import numpy as np
+import dgl
+import torch.nn as nn
+import torch.nn.functional as F
+import itertools
+import scipy.sparse as sp
+from autogl.module.model.dgl import AutoSAGE, AutoGAT, AutoGCN
+from autogl.datasets.utils.conversion import general_static_graphs_to_dgl_dataset
+
+
+def construct_negative_graph(graph, k):
+    src, dst = graph.edges()
+
+    neg_src = src.repeat_interleave(k)
+    neg_dst = torch.randint(0, graph.num_nodes(), (len(src) * k,))
+    # return dgl.graph((neg_src, neg_dst), num_nodes=graph.num_nodes()).edges()
+    return neg_src, neg_dst
+
+def negative_sample(data):
+    return construct_negative_graph(data, 5)
+
+import autogl.datasets.utils as tmp_utils
+tmp_utils.negative_sampling = negative_sample
+
+from dgl.data import CoraGraphDataset, PubmedGraphDataset, CiteseerGraphDataset
+from autogl.module.train.link_prediction_full import LinkPredictionTrainer
+
+def setup_seed(seed):
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
+    torch.backends.cudnn.deterministic = True
+    np.random.seed(seed)
+    random.seed(seed)
+
+
+def split_train_test(g):
+    u, v = g.edges()
+
+    eids = np.arange(g.number_of_edges())
+    eids = np.random.permutation(eids)
+    test_size = int(len(eids) * 0.1)
+    train_size = g.number_of_edges() - test_size
+    test_pos_u, test_pos_v = u[eids[:test_size]], v[eids[:test_size]]
+    train_pos_u, train_pos_v = u[eids[test_size:]], v[eids[test_size:]]
+
+    # Find all negative edges and split them for training and testing
+    adj = sp.coo_matrix((np.ones(len(u)), (u.numpy(), v.numpy())))
+    adj_neg = 1 - adj.todense() - np.eye(g.number_of_nodes())
+    neg_u, neg_v = np.where(adj_neg != 0)
+
+    neg_eids = np.random.choice(len(neg_u), g.number_of_edges())
+    test_neg_u, test_neg_v = neg_u[neg_eids[:test_size]], neg_v[neg_eids[:test_size]]
+    # Slice from test_size so training gets all remaining negatives.
+    train_neg_u, train_neg_v = neg_u[neg_eids[test_size:]], neg_v[neg_eids[test_size:]]
+
+    train_g = dgl.remove_edges(g, eids[:test_size])
+
+    train_pos_g = dgl.graph((train_pos_u, train_pos_v), num_nodes=g.number_of_nodes())
+    train_neg_g = dgl.graph((train_neg_u, train_neg_v), num_nodes=g.number_of_nodes())
+
+    test_pos_g = dgl.graph((test_pos_u, test_pos_v), num_nodes=g.number_of_nodes())
+    test_neg_g = dgl.graph((test_neg_u, test_neg_v), num_nodes=g.number_of_nodes())
+
+    return train_g, train_pos_g, train_neg_g, test_pos_g, test_neg_g
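+
+# NOTE (a sketch of the trade-off): `1 - adj.todense()` above materializes a
+# dense N x N matrix to enumerate non-edges. That is fine at Cora/CiteSeer/
+# PubMed scale but infeasible for large graphs, where sampling random
+# (src, dst) pairs and rejecting existing edges keeps memory constant.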
+def split_train_valid_test(g):
+    u, v = g.edges()
+
+    eids = np.arange(g.number_of_edges())
+    eids = np.random.permutation(eids)
+
+    valid_size = int(len(eids) * 0.1)
+    test_size = int(len(eids) * 0.1)
+    train_size = g.number_of_edges() - test_size - valid_size
+
+    test_pos_u, test_pos_v = u[eids[:test_size]], v[eids[:test_size]]
+    valid_pos_u, valid_pos_v = u[eids[test_size:test_size+valid_size]], v[eids[test_size:test_size+valid_size]]
+    train_pos_u, train_pos_v = u[eids[test_size+valid_size:]], v[eids[test_size+valid_size:]]
+
+    # Find all negative edges and split them for training, validation and testing
+    adj = sp.coo_matrix((np.ones(len(u)), (u.numpy(), v.numpy())))
+    adj_neg = 1 - adj.todense() - np.eye(g.number_of_nodes())
+    neg_u, neg_v = np.where(adj_neg != 0)
+
+    neg_eids = np.random.choice(len(neg_u), g.number_of_edges())
+    test_neg_u, test_neg_v = neg_u[neg_eids[:test_size]], neg_v[neg_eids[:test_size]]
+    valid_neg_u, valid_neg_v = neg_u[neg_eids[test_size:test_size+valid_size]], neg_v[neg_eids[test_size:test_size+valid_size]]
+    # Slice from test_size+valid_size so training gets all remaining negatives.
+    train_neg_u, train_neg_v = neg_u[neg_eids[test_size+valid_size:]], neg_v[neg_eids[test_size+valid_size:]]
+
+    train_g = dgl.remove_edges(g, eids[:test_size+valid_size])
+
+    train_pos_g = dgl.graph((train_pos_u, train_pos_v), num_nodes=g.number_of_nodes())
+    train_neg_g = dgl.graph((train_neg_u, train_neg_v), num_nodes=g.number_of_nodes())
+
+    valid_pos_g = dgl.graph((valid_pos_u, valid_pos_v), num_nodes=g.number_of_nodes())
+    valid_neg_g = dgl.graph((valid_neg_u, valid_neg_v), num_nodes=g.number_of_nodes())
+
+    test_pos_g = dgl.graph((test_pos_u, test_pos_v), num_nodes=g.number_of_nodes())
+    test_neg_g = dgl.graph((test_neg_u, test_neg_v), num_nodes=g.number_of_nodes())
+
+    return train_g, train_pos_g, train_neg_g, valid_pos_g, valid_neg_g, test_pos_g, test_neg_g
+
+if __name__ == "__main__":
+
+    from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
+
+    parser = ArgumentParser(
+        "auto link prediction", formatter_class=ArgumentDefaultsHelpFormatter
+    )
+    parser.add_argument(
+        "--dataset",
+        default="Cora",
+        type=str,
+        help="dataset to use",
+        choices=[
+            "Cora",
+            "CiteSeer",
+            "PubMed",
+        ],
+    )
+    parser.add_argument(
+        "--model",
+        default="sage",
+        type=str,
+        help="model to use",
+        choices=[
+            "gcn",
+            "gat",
+            "sage",
+        ],
+    )
+    parser.add_argument("--seed", type=int, default=0, help="random seed")
+    parser.add_argument('--repeat', type=int, default=10)
+    parser.add_argument("--device", default=0, type=int, help="GPU device")
+
+    args = parser.parse_args()
+
+    args.device = torch.device('cuda:0')
+    device = torch.device('cuda:0')
+
+    if torch.cuda.is_available():
+        torch.cuda.set_device(args.device)
+
+    if args.dataset == 'Cora':
+        dataset = CoraGraphDataset()
+    elif args.dataset == 'CiteSeer':
+        dataset = CiteseerGraphDataset()
+    elif args.dataset == 'PubMed':
+        dataset = PubmedGraphDataset()
+    else:
+        assert False
+
+    # NOTE: the DGL dataset loaded above is immediately replaced by the
+    # AutoGL copy below; only the AutoGL version is actually used.
+    dataset = build_dataset_from_name(args.dataset.lower())
+    dataset = general_static_graphs_to_dgl_dataset(dataset)
+
+    res = []
+    for seed in tqdm(range(1234, 1234+args.repeat)):
+        # set random seed
+        random.seed(seed)
+        np.random.seed(seed)
+        torch.manual_seed(seed)
+        if torch.cuda.is_available():
+            torch.cuda.manual_seed(seed)
+            torch.backends.cudnn.deterministic = True
+            torch.backends.cudnn.benchmark = False
+
+        graph = dataset[0].to(args.device)
+        num_features = graph.ndata['feat'].size(1)
+
+        if args.model == 'gcn':
+            model = AutoGCN
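+        # NOTE: only the 'sage' branch below constructs `automodel`; the
+        # 'gcn'/'gat' branches assign the Auto class without instantiating
+        # it, so `automodel.initialize()` will fail for those choices.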
+        elif args.model == 'gat':
+            model = AutoGAT
+        elif args.model == 'sage':
+            automodel = AutoSAGE(
+                num_features=num_features,
+                num_classes=2,
+                device=args.device
+            )
+            automodel.hyperparams = {
+                "num_layers": 3,
+                "hidden": [16, 16],
+                "dropout": 0.0,
+                "act": "relu",
+                "agg": "mean",
+            }
+        else:
+            assert False
+
+        automodel.initialize()
+
+        trainer = LinkPredictionTrainer(
+            model = automodel,
+            num_features = num_features,
+            optimizer = None,
+            lr = 1e-2,
+            max_epoch = 100,
+            early_stopping_round = 101,
+            weight_decay = 0.0,
+            device = "auto",
+            init = True,
+            feval = [Auc],
+            loss = "binary_cross_entropy_with_logits",
+        )
+
+        train_g, train_pos_g, train_neg_g, test_pos_g, test_neg_g = split_train_test(graph.cpu())
+
+        dataset_splitted = {
+            'train': train_g.to(args.device),
+            'train_pos': train_pos_g.to(args.device),
+            'train_neg': train_neg_g.to(args.device),
+            'test_pos': test_pos_g.to(args.device),
+            'test_neg': test_neg_g.to(args.device),
+        }
+
+        trainer.train(dataset_splitted, False)
+        pre = trainer.evaluate(dataset_splitted, mask="test", feval=Auc)
+        result = pre.item()
+        res.append(result)
+
+    print(np.mean(res), np.std(res))
+
+"""
+AUC 0.8151564430268863
+"""
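
--
Example invocations for the new test scripts (a sketch; the flags come from
each script's argparse definitions):

    python test/link_prediction_base.py --dataset Cora --model sage --repeat 10
    python test/link_prediction_trainer.py --dataset CiteSeer --model sage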