Browse Source

fix bugs

tags/v0.3.1
lihy96 4 years ago
parent
commit
b3c8a440b0
6 changed files with 1086 additions and 108 deletions
  1. +83
    -43
      autogl/module/train/link_prediction_full.py
  2. +202
    -0
      test/link_prediction_base.py
  3. +181
    -0
      test/link_prediction_model.py
  4. +261
    -0
      test/link_prediction_solver.py
  5. +88
    -65
      test/link_prediction_trainer.py
  6. +271
    -0
      test/link_prediction_trainer_dataset.py

+ 83
- 43
autogl/module/train/link_prediction_full.py View File

@@ -169,7 +169,7 @@ class LinkPredictionTrainer(BaseLinkPredictionTrainer):
# Get task name, i.e., `LinkPrediction`.
return "LinkPrediction"

def train_only(self, data, train_mask=None):
def train_only_pyg(self, data, train_mask=None):
"""
The function of training on the given dataset and mask.
Parameters
@@ -241,8 +241,7 @@ class LinkPredictionTrainer(BaseLinkPredictionTrainer):
The function of training on the given dataset and mask.
Parameters
----------
pos_data: positive links
neg_data: negative links
dataset: there are train, train_pos, train_neg graph in this dataset
Returns
-------
self: ``autogl.train.LinkPredictionTrainer``
@@ -308,7 +307,7 @@ class LinkPredictionTrainer(BaseLinkPredictionTrainer):

self.early_stopping.load_checkpoint(self.model.model)

def predict_only(self, data, test_mask=None):
def predict_only_pyg(self, data, test_mask=None):
"""
The function of predicting on the given dataset and mask.

@@ -342,9 +341,7 @@ class LinkPredictionTrainer(BaseLinkPredictionTrainer):

Parameters
----------
data: The link prediction dataset used to be predicted.
train_mask: The mask used in training stage.

dataset: The link prediction dataset used to be predicted.
Returns
-------
res: The result of predicting on the given dataset.
@@ -377,11 +374,11 @@ class LinkPredictionTrainer(BaseLinkPredictionTrainer):
if self.pyg_dgl == 'pyg':
data = dataset[0]
data.edge_index = data.train_pos_edge_index
self.train_only(data)
self.train_only_pyg(data)
if keep_valid_result:
self.valid_result = self.predict_only(data)
self.valid_result_prob = self.predict_proba(dataset, "val")
self.valid_score = self.evaluate(dataset, mask="val", feval=self.feval)
self.valid_result = self.predict_only_pyg(data)
self.valid_result_prob = self.predict_proba_pyg(dataset, "val")
self.valid_score = self.evaluate_pyg(dataset, mask="val", feval=self.feval)
elif self.pyg_dgl == 'dgl':
self.train_only_dgl(dataset)
if keep_valid_result:
@@ -405,11 +402,17 @@ class LinkPredictionTrainer(BaseLinkPredictionTrainer):
The prediction result of ``predict_proba``.
"""
if self.pyg_dgl == 'pyg':
return self.predict_proba(dataset, mask=mask, in_log_format=False)
return self.predict_proba_pyg(dataset, mask=mask, in_log_format=False)
elif self.pyg_dgl == 'dgl':
return self.predict_proba_dgl(dataset, mask=mask, in_log_format=False)

def predict_proba(self, dataset, mask=None, in_log_format=False):
    """Dispatch probability prediction to the backend-specific implementation.

    Routes to ``predict_proba_pyg`` or ``predict_proba_dgl`` depending on
    which graph library this trainer was built for (``self.pyg_dgl``).
    NOTE(review): implicitly returns None when ``self.pyg_dgl`` is neither
    'pyg' nor 'dgl' — confirm that is intended.
    """
    if self.pyg_dgl == 'pyg':
        return self.predict_proba_pyg(dataset, mask, in_log_format)
    elif self.pyg_dgl == 'dgl':
        return self.predict_proba_dgl(dataset, mask, in_log_format)

def predict_proba_pyg(self, dataset, mask=None, in_log_format=False):
"""
The function of predicting the probability on the given dataset.

@@ -443,14 +446,30 @@ class LinkPredictionTrainer(BaseLinkPredictionTrainer):

self.model.model.eval()
with torch.no_grad():
z = self.predict_only(data)
z = self.predict_only_pyg(data)
link_logits = self.model.model.lp_decode(z, pos_edge_index, neg_edge_index)
link_probs = link_logits.sigmoid()

return link_probs

def predict_proba_dgl(self, dataset, mask=None, in_log_format=False):
"""
The function of predicting the probability on the given dataset.

Parameters
----------
dataset: The link prediction dataset used to be predicted.

mask: ``train``, ``val``, or ``test``.
The dataset mask.

in_log_format: ``bool``.
If True(False), the probability will (not) be log format.

Returns
-------
The prediction result.
"""
train_graph = dataset['train']
try:
try:
@@ -547,43 +566,64 @@ class LinkPredictionTrainer(BaseLinkPredictionTrainer):

"""
if self.pyg_dgl == 'pyg':
data = dataset[0]
data = data.to(self.device)
test_mask = mask
if feval is None:
feval = self.feval
else:
feval = get_feval(feval)
return self.evaluate_pyg(self, dataset, mask, feval)
elif self.pyg_dgl == 'dgl':
return self.evaluate_dgl(dataset,mask,feval)

if mask in ["train", "val", "test"]:
pos_edge_index = data[f"{mask}_pos_edge_index"]
neg_edge_index = data[f"{mask}_neg_edge_index"]
else:
pos_edge_index = data[f"test_pos_edge_index"]
neg_edge_index = data[f"test_neg_edge_index"]
def evaluate_pyg(self, dataset, mask=None, feval=None):
data = dataset[0]
data = data.to(self.device)
test_mask = mask
if feval is None:
feval = self.feval
else:
feval = get_feval(feval)

self.model.model.eval()
with torch.no_grad():
link_probs = self.predict_proba(dataset, mask)
link_labels = self.get_link_labels(pos_edge_index, neg_edge_index)
if mask in ["train", "val", "test"]:
pos_edge_index = data[f"{mask}_pos_edge_index"]
neg_edge_index = data[f"{mask}_neg_edge_index"]
else:
pos_edge_index = data[f"test_pos_edge_index"]
neg_edge_index = data[f"test_neg_edge_index"]

if not isinstance(feval, list):
feval = [feval]
return_signle = True
else:
return_signle = False
self.model.model.eval()
with torch.no_grad():
link_probs = self.predict_proba_pyg(dataset, mask)
link_labels = self.get_link_labels(pos_edge_index, neg_edge_index)

if not isinstance(feval, list):
feval = [feval]
return_signle = True
else:
return_signle = False

res = []
for f in feval:
res.append(f.evaluate(link_probs.cpu().numpy(), link_labels.cpu().numpy()))
if return_signle:
return res[0]
return res

res = []
for f in feval:
res.append(f.evaluate(link_probs.cpu().numpy(), link_labels.cpu().numpy()))
if return_signle:
return res[0]
return res
elif self.pyg_dgl == 'dgl':
return self.evaluate_dgl(dataset,mask,feval)

def evaluate_dgl(self, dataset, mask=None, feval=None):
"""
The function of training on the given dataset and keeping valid result.

Parameters
----------
dataset: The link prediction dataset used to be evaluated.

mask: ``train``, ``val``, or ``test``.
The dataset mask.

feval: ``str``.
The evaluation method used in this function.

Returns
-------
res: The evaluation result on the given dataset.

"""
if feval is None:
feval = self.feval
else:


+ 202
- 0
test/link_prediction_base.py View File

@@ -0,0 +1,202 @@
import dgl
import torch
import torch.nn as nn
import torch.nn.functional as F
import itertools
import numpy as np
import scipy.sparse as sp
import dgl.function as fn
import random
from dgl.data import CoraGraphDataset, PubmedGraphDataset, CiteseerGraphDataset
# from autogl.module.train.link_prediction_full import LinkPredictionTrainer

import sys
sys.path.insert(0, "../")
from autogl.module.model.dgl.graphsage import GraphSAGE
import dgl.data
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
from tqdm import tqdm

from dgl.nn import SAGEConv
from dgl.nn.pytorch.conv import GraphConv
from dgl.nn import GATConv

from sklearn.metrics import roc_auc_score

# CLI: dataset / model selection, seed, number of repeated runs, GPU id.
parser = ArgumentParser(
    "auto link prediction", formatter_class=ArgumentDefaultsHelpFormatter
)
parser.add_argument("--dataset", default="Cora", type=str, help="dataset to use", choices=["Cora", "CiteSeer", "PubMed"],)
parser.add_argument("--model", default="sage", type=str, help="model to use", choices=["gcn", "gat", "sage"],)
parser.add_argument("--seed", type=int, default=0, help="random seed")
parser.add_argument('--repeat', type=int, default=10)
parser.add_argument("--device", default=0, type=int, help="GPU device")
args = parser.parse_args()

# NOTE(review): the parsed --device value is immediately overridden and
# cuda:0 is hard-coded — confirm whether --device should be honored.
args.device = torch.device('cuda:0')
device = torch.device('cuda:0')

# Load the requested citation-graph dataset from DGL.
if args.dataset == 'Cora':
    dataset = CoraGraphDataset()
elif args.dataset == 'CiteSeer':
    dataset = CiteseerGraphDataset()
elif args.dataset == 'PubMed':
    dataset = PubmedGraphDataset()
else:
    assert False

def setup_seed(seed):
    """Seed every RNG in play (python, numpy, torch CPU/CUDA) for reproducibility."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Force deterministic cuDNN kernels (slower, but repeatable).
    torch.backends.cudnn.deterministic = True

class GraphSAGE(nn.Module):
    """Two-layer GraphSAGE encoder producing node embeddings of size ``h_feats``."""

    def __init__(self, in_feats, h_feats):
        super().__init__()
        self.conv1 = SAGEConv(in_feats, h_feats, 'mean')
        self.conv2 = SAGEConv(h_feats, h_feats, 'mean')

    def forward(self, data):
        # `data` is a DGLGraph carrying node features in ndata['feat'].
        features = data.ndata['feat']
        hidden = F.relu(self.conv1(data, features))
        return self.conv2(data, hidden)

class GCN(nn.Module):
    """Two-layer GCN encoder producing node embeddings of size ``h_feats``."""

    def __init__(self, in_feats, h_feats):
        super().__init__()
        self.conv1 = GraphConv(in_feats, h_feats)
        self.conv2 = GraphConv(h_feats, h_feats)

    def forward(self, data):
        # `data` is a DGLGraph carrying node features in ndata['feat'].
        features = data.ndata['feat']
        hidden = F.relu(self.conv1(data, features))
        return self.conv2(data, hidden)

class GAT(nn.Module):
    """Two-layer GAT encoder with 4 heads (concat after layer 1, mean after layer 2)."""

    def __init__(self, in_feats, h_feats):
        super().__init__()
        self.conv1 = GATConv(in_feats, h_feats // 4, 4)
        self.conv2 = GATConv(h_feats, h_feats // 4, 4)

    def forward(self, data):
        features = data.ndata['feat']
        # Concatenate the 4 heads of layer 1 back to h_feats channels.
        hidden = F.relu(self.conv1(data, features).flatten(1))
        # Average the heads of layer 2 to keep the output at h_feats // 4 * 4.
        return self.conv2(data, hidden).mean(1)


def split_train_test(g):
    """Split the edges of ``g`` into train/test positive and negative edge sets.

    Parameters
    ----------
    g : dgl.DGLGraph
        Input graph; negatives are drawn from the complement of its adjacency.

    Returns
    -------
    (train_g, train_pos_g, train_neg_g, test_pos_g, test_neg_g)
        ``train_g`` is ``g`` with the test positive edges removed; the other
        four graphs hold the positive/negative edges of each split.
    """
    u, v = g.edges()

    # Shuffle edge ids and reserve 10% of the edges as test positives.
    eids = np.arange(g.number_of_edges())
    eids = np.random.permutation(eids)
    test_size = int(len(eids) * 0.1)
    train_size = g.number_of_edges() - test_size
    test_pos_u, test_pos_v = u[eids[:test_size]], v[eids[:test_size]]
    train_pos_u, train_pos_v = u[eids[test_size:]], v[eids[test_size:]]

    # Negative candidates: all absent, non-self-loop edges.
    adj = sp.coo_matrix((np.ones(len(u)), (u.numpy(), v.numpy())))
    adj_neg = 1 - adj.todense() - np.eye(g.number_of_nodes())
    neg_u, neg_v = np.where(adj_neg != 0)

    # Sample as many negatives as there are edges and split them like the positives.
    neg_eids = np.random.choice(len(neg_u), g.number_of_edges())
    test_neg_u, test_neg_v = neg_u[neg_eids[:test_size]], neg_v[neg_eids[:test_size]]
    # BUG FIX: training negatives are the remaining `train_size` samples
    # (neg_eids[test_size:]); the original sliced neg_eids[train_size:],
    # which yields only `test_size` training negatives.
    train_neg_u, train_neg_v = neg_u[neg_eids[test_size:]], neg_v[neg_eids[test_size:]]

    train_g = dgl.remove_edges(g, eids[:test_size])

    train_pos_g = dgl.graph((train_pos_u, train_pos_v), num_nodes=g.number_of_nodes())
    train_neg_g = dgl.graph((train_neg_u, train_neg_v), num_nodes=g.number_of_nodes())

    test_pos_g = dgl.graph((test_pos_u, test_pos_v), num_nodes=g.number_of_nodes())
    test_neg_g = dgl.graph((test_neg_u, test_neg_v), num_nodes=g.number_of_nodes())

    return train_g, train_pos_g, train_neg_g, test_pos_g, test_neg_g

def get_link_labels(pos_edge_index, neg_edge_index):
    """Build the 0/1 target vector for link prediction.

    Parameters
    ----------
    pos_edge_index, neg_edge_index : torch.Tensor
        Edge index tensors of shape (2, E_pos) and (2, E_neg).

    Returns
    -------
    torch.Tensor
        Float tensor of length E_pos + E_neg: ones for positive edges
        followed by zeros for negative ones.
    """
    E = pos_edge_index.size(1) + neg_edge_index.size(1)
    # Improvement: derive the device from the input tensor instead of the
    # module-level `device` global, so the helper works on any device.
    link_labels = torch.zeros(E, dtype=torch.float, device=pos_edge_index.device)
    link_labels[: pos_edge_index.size(1)] = 1.0
    return link_labels

def lp_decode(z, pos_edge_index, neg_edge_index):
    """Score candidate links by the dot product of their endpoint embeddings."""
    edges = torch.cat((pos_edge_index, neg_edge_index), dim=-1)
    src, dst = edges[0], edges[1]
    return (z[src] * z[dst]).sum(dim=-1)


# Run `repeat` independent trials with different seeds and report mean/std AUC.
res = []
for seed in tqdm(range(1234, 1234 + args.repeat)):
    setup_seed(seed)
    g = dataset[0].to(device)
    # The split helper works on CPU graphs; move all splits back to the GPU.
    train_g, train_pos_g, train_neg_g, test_pos_g, test_neg_g = split_train_test(g.cpu())
    train_g, train_pos_g, train_neg_g, test_pos_g, test_neg_g = train_g.to(device), train_pos_g.to(device), train_neg_g.to(device), test_pos_g.to(device), test_neg_g.to(device)

    # GCN/GAT need self-loops so nodes left isolated by edge removal still
    # receive a message.
    if args.model == 'gcn' or args.model == 'gat':
        train_g = dgl.add_self_loop(train_g)

    if args.model == 'gcn':
        model = GCN(train_g.ndata['feat'].shape[1], 16).to(device)
    elif args.model == 'gat':
        model = GAT(train_g.ndata['feat'].shape[1], 16).to(device)
    elif args.model == 'sage':
        model = GraphSAGE(train_g.ndata['feat'].shape[1], 16).to(device)
    else:
        assert False

    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

    all_logits = []  # NOTE(review): never appended to below — appears dead.
    for epoch in range(100):
        model.train()
        optimizer.zero_grad()

        # Encode nodes on the training graph and score train pos/neg edges.
        z = model(train_g)
        link_logits = lp_decode(
            z, torch.stack(train_pos_g.edges()), torch.stack(train_neg_g.edges())
        )
        link_labels = get_link_labels(
            torch.stack(train_pos_g.edges()), torch.stack(train_neg_g.edges())
        )
        loss = F.binary_cross_entropy_with_logits(link_logits, link_labels)
        loss.backward()
        optimizer.step()

    # Evaluate ROC-AUC on the held-out test edges after training.
    model.eval()
    with torch.no_grad():
        z = model(train_g)
        link_logits = lp_decode(
            z, torch.stack(test_pos_g.edges()), torch.stack(test_neg_g.edges())
        )
        link_probs = link_logits.sigmoid()
        link_labels = get_link_labels(
            torch.stack(test_pos_g.edges()), torch.stack(test_neg_g.edges())
        )

    result = roc_auc_score(link_labels.cpu().numpy(), link_probs.cpu().numpy())
    res.append(result)

print(np.mean(res), np.std(res))













+ 181
- 0
test/link_prediction_model.py View File

@@ -0,0 +1,181 @@
import dgl
import torch
import torch.nn as nn
import torch.nn.functional as F
import itertools
import numpy as np
import scipy.sparse as sp
import dgl.function as fn
import random
from dgl.data import CoraGraphDataset, PubmedGraphDataset, CiteseerGraphDataset
# from autogl.module.train.link_prediction_full import LinkPredictionTrainer

import sys
sys.path.insert(0, "../")
from autogl.module.model.dgl.graphsage import GraphSAGE
import dgl.data
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
from tqdm import tqdm
from dgl.nn import SAGEConv

from sklearn.metrics import roc_auc_score

# CLI: dataset / model selection, seed, number of repeated runs, GPU id.
parser = ArgumentParser(
    "auto link prediction", formatter_class=ArgumentDefaultsHelpFormatter
)
parser.add_argument("--dataset", default="Cora", type=str, help="dataset to use", choices=["Cora", "CiteSeer", "PubMed"],)
parser.add_argument("--model", default="sage", type=str, help="model to use", choices=["gcn", "gat", "sage"],)
parser.add_argument("--seed", type=int, default=0, help="random seed")
parser.add_argument('--repeat', type=int, default=10)
parser.add_argument("--device", default=0, type=int, help="GPU device")
args = parser.parse_args()

# NOTE(review): the parsed --device value is immediately overridden and
# cuda:0 is hard-coded — confirm whether --device should be honored.
args.device = torch.device('cuda:0')
device = torch.device('cuda:0')

# Load the requested citation-graph dataset from DGL.
if args.dataset == 'Cora':
    dataset = CoraGraphDataset()
elif args.dataset == 'CiteSeer':
    dataset = CiteseerGraphDataset()
elif args.dataset == 'PubMed':
    dataset = PubmedGraphDataset()
else:
    assert False

def setup_seed(seed):
    """Seed every RNG in play (python, numpy, torch CPU/CUDA) for reproducibility."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Force deterministic cuDNN kernels (slower, but repeatable).
    torch.backends.cudnn.deterministic = True


def split_train_test(g):
    """Split the edges of ``g`` into train/test positive and negative edge sets.

    10% of edges become test positives; negatives are sampled from the
    complement of the adjacency matrix and split to mirror the positives.

    Returns
    -------
    (train_g, train_pos_g, train_neg_g, test_pos_g, test_neg_g)
    """
    u, v = g.edges()

    eids = np.arange(g.number_of_edges())
    eids = np.random.permutation(eids)
    test_size = int(len(eids) * 0.1)
    train_size = g.number_of_edges() - test_size
    test_pos_u, test_pos_v = u[eids[:test_size]], v[eids[:test_size]]
    train_pos_u, train_pos_v = u[eids[test_size:]], v[eids[test_size:]]

    # Negative candidates: all absent, non-self-loop edges.
    adj = sp.coo_matrix((np.ones(len(u)), (u.numpy(), v.numpy())))
    adj_neg = 1 - adj.todense() - np.eye(g.number_of_nodes())
    neg_u, neg_v = np.where(adj_neg != 0)

    neg_eids = np.random.choice(len(neg_u), g.number_of_edges())
    test_neg_u, test_neg_v = neg_u[neg_eids[:test_size]], neg_v[neg_eids[:test_size]]
    # BUG FIX: training negatives are the remaining `train_size` samples
    # (neg_eids[test_size:]); the original sliced neg_eids[train_size:],
    # which yields only `test_size` training negatives.
    train_neg_u, train_neg_v = neg_u[neg_eids[test_size:]], neg_v[neg_eids[test_size:]]

    train_g = dgl.remove_edges(g, eids[:test_size])

    train_pos_g = dgl.graph((train_pos_u, train_pos_v), num_nodes=g.number_of_nodes())
    train_neg_g = dgl.graph((train_neg_u, train_neg_v), num_nodes=g.number_of_nodes())

    test_pos_g = dgl.graph((test_pos_u, test_pos_v), num_nodes=g.number_of_nodes())
    test_neg_g = dgl.graph((test_neg_u, test_neg_v), num_nodes=g.number_of_nodes())

    return train_g, train_pos_g, train_neg_g, test_pos_g, test_neg_g

class DotPredictor(nn.Module):
    """Score every edge of a graph by the dot product of its endpoint features."""

    def forward(self, g, h):
        # local_scope keeps the temporary 'h'/'score' fields off the caller's graph.
        with g.local_scope():
            g.ndata['h'] = h
            g.apply_edges(fn.u_dot_v('h', 'h', 'score'))
            # u_dot_v leaves a length-1 vector per edge; squeeze it to a scalar.
            scores = g.edata['score']
            return scores[:, 0]


def compute_loss(pos_score, neg_score):
    """Binary cross-entropy loss over positive (label 1) and negative (label 0) scores.

    Parameters
    ----------
    pos_score, neg_score : torch.Tensor
        1-D logit tensors for positive and negative edges.

    Returns
    -------
    torch.Tensor
        Scalar BCE-with-logits loss.
    """
    scores = torch.cat([pos_score, neg_score])
    # Improvement: move the labels to the scores' device instead of forcing a
    # device->CPU copy of the scores (the original called scores.cpu()).
    labels = torch.cat(
        [torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])]
    ).to(scores.device)
    return F.binary_cross_entropy_with_logits(scores, labels)


def compute_auc(pos_score, neg_score):
    """ROC-AUC of positive (label 1) versus negative (label 0) link scores."""
    labels = torch.cat(
        [torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])]
    ).numpy()
    scores = torch.cat([pos_score, neg_score]).numpy()
    return roc_auc_score(labels, scores)

def get_link_labels(pos_edge_index, neg_edge_index):
    """Build the 0/1 target vector for link prediction.

    Parameters
    ----------
    pos_edge_index, neg_edge_index : torch.Tensor
        Edge index tensors of shape (2, E_pos) and (2, E_neg).

    Returns
    -------
    torch.Tensor
        Float tensor of length E_pos + E_neg: ones for positive edges
        followed by zeros for negative ones.
    """
    E = pos_edge_index.size(1) + neg_edge_index.size(1)
    # Improvement: derive the device from the input tensor instead of the
    # module-level `device` global, so the helper works on any device.
    link_labels = torch.zeros(E, dtype=torch.float, device=pos_edge_index.device)
    link_labels[: pos_edge_index.size(1)] = 1.0
    return link_labels


# Run `repeat` independent trials with different seeds and report mean/std AUC.
res = []
for seed in tqdm(range(1234, 1234 + args.repeat)):
    setup_seed(seed)
    g = dataset[0].to(device)
    # The split helper works on CPU graphs; move all splits back to the GPU.
    train_g, train_pos_g, train_neg_g, test_pos_g, test_neg_g = split_train_test(g.cpu())
    train_g, train_pos_g, train_neg_g, test_pos_g, test_neg_g = train_g.to(device), train_pos_g.to(device), train_neg_g.to(device), test_pos_g.to(device), test_neg_g.to(device)

    if args.model == 'gcn':
        # NOTE(review): `model` is never assigned on this path; the training
        # loop below will raise NameError for --model gcn. Confirm intent.
        pass
    elif args.model == 'gat':
        # NOTE(review): same problem as the gcn branch.
        pass
    elif args.model == 'sage':
        # Build the autogl GraphSAGE from a flat hyper-parameter dict.
        para = {
            'features_num': train_g.ndata['feat'].shape[1],
            'num_class': 2,
            'num_layers': 3,
            'hidden': [16, 16],
            'dropout': 0.0,
            'act': 'relu',
            'agg': 'mean',
        }
        model = GraphSAGE(para).to(device)
    else:
        assert False

    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

    all_logits = []  # NOTE(review): never appended to below — appears dead.
    for epoch in range(100):
        model.train()
        optimizer.zero_grad()

        # Encode nodes on the training graph and score train pos/neg edges.
        z = model.lp_encode(train_g)
        link_logits = model.lp_decode(
            z, torch.stack(train_pos_g.edges()), torch.stack(train_neg_g.edges())
        )
        link_labels = get_link_labels(
            torch.stack(train_pos_g.edges()), torch.stack(train_neg_g.edges())
        )
        loss = F.binary_cross_entropy_with_logits(link_logits, link_labels)
        loss.backward()
        optimizer.step()

    # Evaluate ROC-AUC on the held-out test edges after training.
    model.eval()
    with torch.no_grad():
        z = model.lp_encode(train_g)
        link_logits = model.lp_decode(
            z, torch.stack(test_pos_g.edges()), torch.stack(test_neg_g.edges())
        )
        link_probs = link_logits.sigmoid()
        link_labels = get_link_labels(
            torch.stack(test_pos_g.edges()), torch.stack(test_neg_g.edges())
        )

    result = roc_auc_score(link_labels.cpu().numpy(), link_probs.cpu().numpy())
    res.append(result)

print(np.mean(res), np.std(res))













+ 261
- 0
test/link_prediction_solver.py View File

@@ -0,0 +1,261 @@
import sys

sys.path.insert(0, "../")
from tqdm import tqdm

# import autogl.module.train
# import torch_geometric
# exit(0)
#
from autogl.datasets import build_dataset_from_name
from autogl.solver.classifier.link_predictor import AutoLinkPredictor
from autogl.module.train.evaluation import Auc
import yaml
import random
import torch
import numpy as np
import dgl
import torch
import torch.nn as nn
import torch.nn.functional as F
import itertools
import numpy as np
import scipy.sparse as sp
from autogl.module.model.dgl import AutoSAGE, AutoGAT, AutoGCN


def construct_negative_graph(graph, k):
    """Sample ``k`` negative destinations for every edge of ``graph``.

    Each source node is repeated ``k`` times and paired with a destination
    drawn uniformly at random; returns the (src, dst) index tensors.
    """
    src, _ = graph.edges()
    neg_src = src.repeat_interleave(k)
    neg_dst = torch.randint(0, graph.num_nodes(), (len(src) * k,))
    return neg_src, neg_dst

def negative_sample(data):
    """Negative-sampling hook: draw 5 corrupted edges per positive edge."""
    return construct_negative_graph(data, 5)

import autogl.datasets.utils as tmp_utils
tmp_utils.negative_sampling = negative_sample

from dgl.data import CoraGraphDataset, PubmedGraphDataset, CiteseerGraphDataset
from autogl.module.train.link_prediction_full import LinkPredictionTrainer

def setup_seed(seed):
    """Seed every RNG in play (python, numpy, torch CPU/CUDA) for reproducibility."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Force deterministic cuDNN kernels (slower, but repeatable).
    torch.backends.cudnn.deterministic = True

def fixed(**kwargs):
    """Wrap keyword arguments as a list of FIXED hyper-parameter specs.

    Each ``name=value`` pair becomes one dict understood by the HPO module,
    pinning that parameter to a single value.
    """
    specs = []
    for name, value in kwargs.items():
        specs.append({
            'parameterName': name,
            'type': 'FIXED',
            'value': value,
        })
    return specs

def split_train_test(g):
    """Split the edges of ``g`` into train/test positive and negative edge sets.

    10% of edges become test positives; negatives are sampled from the
    complement of the adjacency matrix and split to mirror the positives.

    Returns
    -------
    (train_g, train_pos_g, train_neg_g, test_pos_g, test_neg_g)
    """
    u, v = g.edges()

    eids = np.arange(g.number_of_edges())
    eids = np.random.permutation(eids)
    test_size = int(len(eids) * 0.1)
    train_size = g.number_of_edges() - test_size
    test_pos_u, test_pos_v = u[eids[:test_size]], v[eids[:test_size]]
    train_pos_u, train_pos_v = u[eids[test_size:]], v[eids[test_size:]]

    # Negative candidates: all absent, non-self-loop edges.
    adj = sp.coo_matrix((np.ones(len(u)), (u.numpy(), v.numpy())))
    adj_neg = 1 - adj.todense() - np.eye(g.number_of_nodes())
    neg_u, neg_v = np.where(adj_neg != 0)

    neg_eids = np.random.choice(len(neg_u), g.number_of_edges())
    test_neg_u, test_neg_v = neg_u[neg_eids[:test_size]], neg_v[neg_eids[:test_size]]
    # BUG FIX: training negatives are the remaining `train_size` samples
    # (neg_eids[test_size:]); the original sliced neg_eids[train_size:],
    # which yields only `test_size` training negatives.
    train_neg_u, train_neg_v = neg_u[neg_eids[test_size:]], neg_v[neg_eids[test_size:]]

    train_g = dgl.remove_edges(g, eids[:test_size])

    train_pos_g = dgl.graph((train_pos_u, train_pos_v), num_nodes=g.number_of_nodes())
    train_neg_g = dgl.graph((train_neg_u, train_neg_v), num_nodes=g.number_of_nodes())

    test_pos_g = dgl.graph((test_pos_u, test_pos_v), num_nodes=g.number_of_nodes())
    test_neg_g = dgl.graph((test_neg_u, test_neg_v), num_nodes=g.number_of_nodes())

    return train_g, train_pos_g, train_neg_g, test_pos_g, test_neg_g

def split_train_valid_test(g):
    """Split edges of ``g`` into train/valid/test positive and negative sets.

    10% of edges go to test, another 10% to validation, and the rest to
    training; negatives are sampled from the complement of the adjacency.

    Returns
    -------
    (train_g, train_pos_g, train_neg_g, valid_pos_g, valid_neg_g,
     test_pos_g, test_neg_g)
    """
    u, v = g.edges()

    eids = np.arange(g.number_of_edges())
    eids = np.random.permutation(eids)

    valid_size = int(len(eids) * 0.1)
    test_size = int(len(eids) * 0.1)
    train_size = g.number_of_edges() - test_size - valid_size

    test_pos_u, test_pos_v = u[eids[:test_size]], v[eids[:test_size]]
    valid_pos_u, valid_pos_v = u[eids[test_size:test_size+valid_size]], v[eids[test_size:test_size+valid_size]]
    train_pos_u, train_pos_v = u[eids[test_size+valid_size:]], v[eids[test_size+valid_size:]]

    # Negative candidates: all absent, non-self-loop edges.
    adj = sp.coo_matrix((np.ones(len(u)), (u.numpy(), v.numpy())))
    adj_neg = 1 - adj.todense() - np.eye(g.number_of_nodes())
    neg_u, neg_v = np.where(adj_neg != 0)

    neg_eids = np.random.choice(len(neg_u), g.number_of_edges())
    test_neg_u, test_neg_v = neg_u[neg_eids[:test_size]], neg_v[neg_eids[:test_size]]
    valid_neg_u, valid_neg_v = neg_u[neg_eids[test_size:test_size+valid_size]], neg_v[neg_eids[test_size:test_size+valid_size]]
    # BUG FIX: training negatives are the remaining samples after the test and
    # validation slices (neg_eids[test_size+valid_size:]); the original took
    # neg_eids[train_size:], which yields only test_size + valid_size samples
    # instead of train_size.
    train_neg_u, train_neg_v = neg_u[neg_eids[test_size+valid_size:]], neg_v[neg_eids[test_size+valid_size:]]

    train_g = dgl.remove_edges(g, eids[:test_size+valid_size])

    train_pos_g = dgl.graph((train_pos_u, train_pos_v), num_nodes=g.number_of_nodes())
    train_neg_g = dgl.graph((train_neg_u, train_neg_v), num_nodes=g.number_of_nodes())

    valid_pos_g = dgl.graph((valid_pos_u, valid_pos_v), num_nodes=g.number_of_nodes())
    valid_neg_g = dgl.graph((valid_neg_u, valid_neg_v), num_nodes=g.number_of_nodes())

    test_pos_g = dgl.graph((test_pos_u, test_pos_v), num_nodes=g.number_of_nodes())
    test_neg_g = dgl.graph((test_neg_u, test_neg_v), num_nodes=g.number_of_nodes())

    return train_g, train_pos_g, train_neg_g, valid_pos_g, valid_neg_g, test_pos_g, test_neg_g

if __name__ == "__main__":

    from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter

    # CLI: dataset / model selection, seed, repetitions, GPU id.
    parser = ArgumentParser(
        "auto link prediction", formatter_class=ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        "--dataset",
        default="Cora",
        type=str,
        help="dataset to use",
        choices=[
            "Cora",
            "CiteSeer",
            "PubMed",
        ],
    )
    parser.add_argument(
        "--model",
        default="sage",
        type=str,
        help="model to use",
        choices=[
            "gcn",
            "gat",
            "sage",
        ],
    )
    parser.add_argument("--seed", type=int, default=0, help="random seed")
    parser.add_argument('--repeat', type=int, default=10)
    parser.add_argument("--device", default=0, type=int, help="GPU device")

    args = parser.parse_args()

    # NOTE(review): the parsed --device value is ignored; cuda:0 is hard-coded.
    args.device = torch.device('cuda:0')
    device = torch.device('cuda:0')

    if torch.cuda.is_available():
        torch.cuda.set_device(args.device)

    if args.dataset == 'Cora':
        dataset = CoraGraphDataset()
    elif args.dataset == 'CiteSeer':
        dataset = CiteseerGraphDataset()
    elif args.dataset == 'PubMed':
        dataset = PubmedGraphDataset()
    else:
        assert False

    # Run the full AutoML pipeline once per seed.
    res = []
    for seed in tqdm(range(1234, 1234 + args.repeat)):
        # set random seed
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

        graph = dataset[0].to(args.device)
        num_features = graph.ndata['feat'].size(1)

        if args.model == 'gcn':
            # NOTE(review): only `model` is bound here; `automodel` and
            # `model_hp` stay undefined, so `automodel.initialize()` below
            # raises NameError for gcn/gat. Confirm intended behavior.
            model = AutoGCN
        elif args.model == 'gat':
            # NOTE(review): same problem as the gcn branch.
            model = AutoGAT
        elif args.model == 'sage':
            automodel = AutoSAGE(
                num_features=num_features,
                num_classes=2,
                device=args.device
            )
            # Pin the model architecture rather than searching over it.
            automodel.hyperparams = {
                "num_layers": 3,
                "hidden": [16, 16],
                "dropout": 0.0,
                "act": "relu",
                "agg": "mean",
            }
            model_hp = {
                "num_layers": 3,
                "hidden": [16, 16],
                "dropout": 0.0,
                "act": "relu",
                "agg": "mean",
            }
        else:
            assert False

        automodel.initialize()

        # Solver with fixed (non-searched) trainer and model hyper-parameter spaces.
        # NOTE(review): graph_models is hard-coded to 'sage' — presumably it
        # should follow args.model; verify.
        autoClassifier = AutoLinkPredictor(
            feature_module=None,
            graph_models='sage',
            ensemble_module=None,
            max_evals=1,
            hpo_module='random',
            trainer_hp_space=fixed(**{
                "max_epoch": 100,
                "early_stopping_round": 100 + 1,
                "lr": 0.01,
                "weight_decay": None,
            }),
            model_hp_spaces=[fixed(**model_hp)]
        )
        autoClassifier.fit(
            dataset,
            time_limit=3600,
            evaluation_method=[Auc],
            seed=seed,
            train_split=0.85,
            val_split=0.05,
        )
        autoClassifier.get_leaderboard().show()

        # test
        predict_result = autoClassifier.predict_proba()

        # Build the 0/1 labels for the held-out test edge sets and report AUC.
        pos_edge_index, neg_edge_index = (
            dataset[0].test_pos_edge_index,
            dataset[0].test_neg_edge_index,
        )
        E = pos_edge_index.size(1) + neg_edge_index.size(1)
        link_labels = torch.zeros(E)
        link_labels[: pos_edge_index.size(1)] = 1.0

        print(
            "test auc: %.4f"
            % (Auc.evaluate(predict_result, link_labels.detach().cpu().numpy()))
        )

"""
AUC 0.8151564430268863
"""

+ 88
- 65
test/link_prediction_trainer.py View File

@@ -1,6 +1,7 @@
import sys

sys.path.insert(0, "../")
from tqdm import tqdm

# import autogl.module.train
# import torch_geometric
@@ -20,7 +21,7 @@ import torch.nn.functional as F
import itertools
import numpy as np
import scipy.sparse as sp
from autogl.module.model.dgl import AutoSAGE
from autogl.module.model.dgl import AutoSAGE, AutoGAT, AutoGCN


def construct_negative_graph(graph, k):
@@ -117,7 +118,6 @@ def split_train_valid_test(g):

if __name__ == "__main__":

setup_seed(1234)

from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter

@@ -135,21 +135,28 @@ if __name__ == "__main__":
"PubMed",
],
)
parser.add_argument(
"--model",
default="sage",
type=str,
help="model to use",
choices=[
"gcn",
"gat",
"sage",
],
)
parser.add_argument("--seed", type=int, default=0, help="random seed")
parser.add_argument('--repeat', type=int, default=10)
parser.add_argument("--device", default=0, type=int, help="GPU device")

args = parser.parse_args()

args.device = torch.device('cuda:0')
device = torch.device('cuda:0')

if torch.cuda.is_available():
torch.cuda.set_device(args.device)
seed = args.seed
# set random seed
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

if args.dataset == 'Cora':
dataset = CoraGraphDataset()
@@ -157,60 +164,76 @@ if __name__ == "__main__":
dataset = CiteseerGraphDataset()
elif args.dataset == 'PubMed':
dataset = PubmedGraphDataset()

# configs = yaml.load(open(args.configs, "r").read(), Loader=yaml.FullLoader)
# configs["hpo"]["name"] = args.hpo
# configs["hpo"]["max_evals"] = args.max_eval
# autoClassifier = AutoLinkPredictor.from_config(configs)

graph = dataset[0].to(args.device)
num_features = graph.ndata['feat'].size(1)

autoSAGE = AutoSAGE(
num_features=num_features,
num_classes=2,
device=args.device
)
autoSAGE.hyperparams = {
"num_layers": 3,
"hidden": [16, 16],
"dropout": 0.0,
"act": "relu",
"agg": "mean",
}
autoSAGE.initialize()

trainer = LinkPredictionTrainer(
model = autoSAGE,
num_features = num_features,
optimizer = None,
lr = 1e-2,
max_epoch = 100,
early_stopping_round = 101,
weight_decay = 0.0,
device = "auto",
init = True,
feval = [Auc],
loss = "binary_cross_entropy_with_logits",
)

train_g, train_pos_g, train_neg_g, test_pos_g, test_neg_g = split_train_test(graph.cpu())

dataset = {
'train': train_g.to(args.device),
'train_pos': train_pos_g.to(args.device),
'train_neg': train_neg_g.to(args.device),
'test_pos': test_pos_g.to(args.device),
'test_neg': test_neg_g.to(args.device),
}

trainer.train(dataset, True)
pre = trainer.evaluate(dataset, mask="test", feval=Auc)
print(pre.item())
res = trainer.predict(dataset)
print(res)

exit(0)
else:
assert False

res = []
for seed in tqdm(range(1234, 1234+args.repeat)):
# set random seed
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

graph = dataset[0].to(args.device)
num_features = graph.ndata['feat'].size(1)

if args.model == 'gcn':
model = AutoGCN
elif args.model == 'gat':
model = AutoGAT
elif args.model == 'sage':
automodel = AutoSAGE(
num_features=num_features,
num_classes=2,
device=args.device
)
automodel.hyperparams = {
"num_layers": 3,
"hidden": [16, 16],
"dropout": 0.0,
"act": "relu",
"agg": "mean",
}
else:
assert False

automodel.initialize()

trainer = LinkPredictionTrainer(
model = automodel,
num_features = num_features,
optimizer = None,
lr = 1e-2,
max_epoch = 100,
early_stopping_round = 101,
weight_decay = 0.0,
device = "auto",
init = True,
feval = [Auc],
loss = "binary_cross_entropy_with_logits",
)

train_g, train_pos_g, train_neg_g, test_pos_g, test_neg_g = split_train_test(graph.cpu())

dataset_splitted = {
'train': train_g.to(args.device),
'train_pos': train_pos_g.to(args.device),
'train_neg': train_neg_g.to(args.device),
'test_pos': test_pos_g.to(args.device),
'test_neg': test_neg_g.to(args.device),
}

trainer.train(dataset_splitted, False)
pre = trainer.evaluate(dataset_splitted, mask="test", feval=Auc)
result = pre.item()
res.append(result)

print(np.mean(res), np.std(res))
exit(1)

# train
autoClassifier.fit(


+ 271
- 0
test/link_prediction_trainer_dataset.py View File

@@ -0,0 +1,271 @@
import sys

sys.path.insert(0, "../")
from tqdm import tqdm

# import autogl.module.train
# import torch_geometric
# exit(0)
#
from autogl.datasets import build_dataset_from_name
# from autogl.solver.classifier.link_predictor import AutoLinkPredictor
from autogl.module.train.evaluation import Auc
import yaml
import random
import torch
import numpy as np
import dgl
import torch
import torch.nn as nn
import torch.nn.functional as F
import itertools
import numpy as np
import scipy.sparse as sp
from autogl.module.model.dgl import AutoSAGE, AutoGAT, AutoGCN
from autogl.datasets.utils.conversion import general_static_graphs_to_dgl_dataset


def construct_negative_graph(graph, k):
    """Sample ``k`` negative destinations for every edge of ``graph``.

    Each source node is repeated ``k`` times and paired with a destination
    drawn uniformly at random; returns the (src, dst) index tensors.
    """
    src, _ = graph.edges()
    neg_src = src.repeat_interleave(k)
    neg_dst = torch.randint(0, graph.num_nodes(), (len(src) * k,))
    return neg_src, neg_dst

def negative_sample(data):
    """Negative-sampling hook: draw 5 corrupted edges per positive edge."""
    return construct_negative_graph(data, 5)

import autogl.datasets.utils as tmp_utils
tmp_utils.negative_sampling = negative_sample

from dgl.data import CoraGraphDataset, PubmedGraphDataset, CiteseerGraphDataset
from autogl.module.train.link_prediction_full import LinkPredictionTrainer

def setup_seed(seed):
    """Seed every RNG in play (python, numpy, torch CPU/CUDA) for reproducibility."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Force deterministic cuDNN kernels (slower, but repeatable).
    torch.backends.cudnn.deterministic = True



def split_train_test(g):
    """Split the edges of ``g`` into train/test positive and negative edge sets.

    10% of edges become test positives; negatives are sampled from the
    complement of the adjacency matrix and split to mirror the positives.

    Returns
    -------
    (train_g, train_pos_g, train_neg_g, test_pos_g, test_neg_g)
    """
    u, v = g.edges()

    eids = np.arange(g.number_of_edges())
    eids = np.random.permutation(eids)
    test_size = int(len(eids) * 0.1)
    train_size = g.number_of_edges() - test_size
    test_pos_u, test_pos_v = u[eids[:test_size]], v[eids[:test_size]]
    train_pos_u, train_pos_v = u[eids[test_size:]], v[eids[test_size:]]

    # Negative candidates: all absent, non-self-loop edges.
    adj = sp.coo_matrix((np.ones(len(u)), (u.numpy(), v.numpy())))
    adj_neg = 1 - adj.todense() - np.eye(g.number_of_nodes())
    neg_u, neg_v = np.where(adj_neg != 0)

    neg_eids = np.random.choice(len(neg_u), g.number_of_edges())
    test_neg_u, test_neg_v = neg_u[neg_eids[:test_size]], neg_v[neg_eids[:test_size]]
    # BUG FIX: training negatives are the remaining `train_size` samples
    # (neg_eids[test_size:]); the original sliced neg_eids[train_size:],
    # which yields only `test_size` training negatives.
    train_neg_u, train_neg_v = neg_u[neg_eids[test_size:]], neg_v[neg_eids[test_size:]]

    train_g = dgl.remove_edges(g, eids[:test_size])

    train_pos_g = dgl.graph((train_pos_u, train_pos_v), num_nodes=g.number_of_nodes())
    train_neg_g = dgl.graph((train_neg_u, train_neg_v), num_nodes=g.number_of_nodes())

    test_pos_g = dgl.graph((test_pos_u, test_pos_v), num_nodes=g.number_of_nodes())
    test_neg_g = dgl.graph((test_neg_u, test_neg_v), num_nodes=g.number_of_nodes())

    return train_g, train_pos_g, train_neg_g, test_pos_g, test_neg_g

def split_train_valid_test(g):
    """Split graph ``g`` into train/valid/test positive and negative edges.

    10% of the existing edges become test positives, the next 10% become
    validation positives, and the remaining 80% are train positives.  An
    equal total number of non-edges is sampled uniformly from the complement
    of the adjacency matrix and split with the same proportions.

    Parameters
    ----------
    g: a DGL graph; only its connectivity is used.

    Returns
    -------
    train_g: ``g`` with valid+test positive edges removed (message-passing graph)
    train_pos_g, train_neg_g: training edge graphs
    valid_pos_g, valid_neg_g: validation edge graphs
    test_pos_g, test_neg_g: test edge graphs
    """
    u, v = g.edges()

    eids = np.arange(g.number_of_edges())
    eids = np.random.permutation(eids)

    valid_size = int(len(eids) * 0.1)
    test_size = int(len(eids) * 0.1)

    test_pos_u, test_pos_v = u[eids[:test_size]], v[eids[:test_size]]
    valid_pos_u, valid_pos_v = u[eids[test_size:test_size+valid_size]], v[eids[test_size:test_size+valid_size]]
    train_pos_u, train_pos_v = u[eids[test_size+valid_size:]], v[eids[test_size+valid_size:]]

    # Find all negative edges and split them for training and testing.
    # Pass an explicit shape so nodes without edges do not shrink the matrix.
    adj = sp.coo_matrix(
        (np.ones(len(u)), (u.numpy(), v.numpy())),
        shape=(g.number_of_nodes(), g.number_of_nodes()),
    )
    adj_neg = 1 - adj.todense() - np.eye(g.number_of_nodes())
    neg_u, neg_v = np.where(adj_neg != 0)

    neg_eids = np.random.choice(len(neg_u), g.number_of_edges())
    test_neg_u, test_neg_v = neg_u[neg_eids[:test_size]], neg_v[neg_eids[:test_size]]
    valid_neg_u, valid_neg_v = neg_u[neg_eids[test_size:test_size+valid_size]], neg_v[neg_eids[test_size:test_size+valid_size]]
    # BUG FIX: was ``neg_eids[train_size:]``, which keeps only the last
    # ``test_size + valid_size`` samples instead of all remaining training
    # negatives; the training split must start right after the valid slice.
    train_neg_u, train_neg_v = neg_u[neg_eids[test_size+valid_size:]], neg_v[neg_eids[test_size+valid_size:]]

    train_g = dgl.remove_edges(g, eids[:test_size+valid_size])

    train_pos_g = dgl.graph((train_pos_u, train_pos_v), num_nodes=g.number_of_nodes())
    train_neg_g = dgl.graph((train_neg_u, train_neg_v), num_nodes=g.number_of_nodes())

    valid_pos_g = dgl.graph((valid_pos_u, valid_pos_v), num_nodes=g.number_of_nodes())
    valid_neg_g = dgl.graph((valid_neg_u, valid_neg_v), num_nodes=g.number_of_nodes())

    test_pos_g = dgl.graph((test_pos_u, test_pos_v), num_nodes=g.number_of_nodes())
    test_neg_g = dgl.graph((test_neg_u, test_neg_v), num_nodes=g.number_of_nodes())

    return train_g, train_pos_g, train_neg_g, valid_pos_g, valid_neg_g, test_pos_g, test_neg_g

if __name__ == "__main__":
    # Benchmark script: trains a link-prediction model `--repeat` times with
    # different seeds and reports mean/std test AUC.

    from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter

    parser = ArgumentParser(
        "auto link prediction", formatter_class=ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        "--dataset",
        default="Cora",
        type=str,
        help="dataset to use",
        choices=[
            "Cora",
            "CiteSeer",
            "PubMed",
        ],
    )
    parser.add_argument(
        "--model",
        default="sage",
        type=str,
        help="model to use",
        choices=[
            "gcn",
            "gat",
            "sage",
        ],
    )
    parser.add_argument("--seed", type=int, default=0, help="random seed")
    parser.add_argument('--repeat', type=int, default=10)
    parser.add_argument("--device", default=0, type=int, help="GPU device")

    args = parser.parse_args()

    # BUG FIX: honour the --device argument and fall back to CPU when CUDA
    # is unavailable (previously 'cuda:0' was hard-coded, crashing on
    # CPU-only machines and ignoring the CLI flag).
    if torch.cuda.is_available():
        args.device = torch.device(f"cuda:{args.device}")
        torch.cuda.set_device(args.device)
    else:
        args.device = torch.device("cpu")
    device = args.device

    # NOTE(review): the previous version also loaded the raw DGL dataset
    # (CoraGraphDataset etc.) here and immediately overwrote it below, so
    # that dead load has been removed; argparse `choices` already validates
    # the dataset name.
    dataset = build_dataset_from_name(args.dataset.lower())
    dataset = general_static_graphs_to_dgl_dataset(dataset)

    res = []
    for seed in tqdm(range(1234, 1234 + args.repeat)):
        # Set every RNG per repetition so each run is reproducible.
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed(seed)
            torch.backends.cudnn.deterministic = True
            torch.backends.cudnn.benchmark = False

        graph = dataset[0].to(args.device)
        num_features = graph.ndata['feat'].size(1)

        # BUG FIX: the gcn/gat branches previously assigned the *class* to an
        # unused variable `model`, leaving `automodel` undefined and raising
        # NameError at `automodel.initialize()` below.
        # NOTE(review): assumes AutoGCN/AutoGAT accept the same constructor
        # arguments as AutoSAGE — confirm against the model definitions.
        if args.model == 'gcn':
            automodel = AutoGCN(
                num_features=num_features,
                num_classes=2,
                device=args.device,
            )
        elif args.model == 'gat':
            automodel = AutoGAT(
                num_features=num_features,
                num_classes=2,
                device=args.device,
            )
        elif args.model == 'sage':
            automodel = AutoSAGE(
                num_features=num_features,
                num_classes=2,
                device=args.device
            )
            automodel.hyperparams = {
                "num_layers": 3,
                "hidden": [16, 16],
                "dropout": 0.0,
                "act": "relu",
                "agg": "mean",
            }
        else:
            # `assert` is stripped under -O; raise explicitly instead.
            raise ValueError(f"unsupported model: {args.model}")

        automodel.initialize()

        trainer = LinkPredictionTrainer(
            model = automodel,
            num_features = num_features,
            optimizer = None,
            lr = 1e-2,
            max_epoch = 100,
            early_stopping_round = 101,
            weight_decay = 0.0,
            device = "auto",
            init = True,
            feval = [Auc],
            loss = "binary_cross_entropy_with_logits",
        )

        train_g, train_pos_g, train_neg_g, test_pos_g, test_neg_g = split_train_test(graph.cpu())

        dataset_splitted = {
            'train': train_g.to(args.device),
            'train_pos': train_pos_g.to(args.device),
            'train_neg': train_neg_g.to(args.device),
            'test_pos': test_pos_g.to(args.device),
            'test_neg': test_neg_g.to(args.device),
        }

        trainer.train(dataset_splitted, False)
        pre = trainer.evaluate(dataset_splitted, mask="test", feval=Auc)
        res.append(pre.item())

    print(np.mean(res), np.std(res))
    # BUG FIX: removed `exit(1)` (a non-zero exit code signalled failure even
    # on success) together with the unreachable AutoClassifier code that
    # followed it, which referenced an undefined `autoClassifier`.

"""
AUC 0.8151564430268863
"""

# (removed: stray "Loading… / Cancel / Save" page-chrome text copied in from a
# web view — not valid Python)