|
- import numpy as np
- import scipy.sparse as sp
- import typing as _typing
- import dgl
- import autogl.data.graph
- from autogl.data.graph.utils.conversion import general_static_graph_to_dgl_graph
-
-
class _SplitEdgesDGLImpl:
    """Random edge splitting of a DGL graph for link prediction.

    Positive edges are partitioned at random into train / (valid) / test
    sets, and an equal number of negative (non-existent) node pairs is
    sampled and partitioned the same way.
    """

    @staticmethod
    def __graph_from_edges(src, dst, num_nodes: int) -> dgl.DGLGraph:
        """Build a graph over ``num_nodes`` nodes holding only the given edges."""
        return dgl.graph((src, dst), num_nodes=num_nodes)

    @classmethod
    def __sample_negative_edges(
            cls, u, v, num_nodes: int, num_samples: int
    ) -> _typing.Tuple[np.ndarray, np.ndarray]:
        """Sample ``num_samples`` node pairs that are not edges of the graph.

        :param u: source node IDs of the existing edges (tensor with ``.numpy()``)
        :param v: destination node IDs of the existing edges
        :param num_nodes: number of nodes in the graph
        :param num_samples: number of negative pairs to draw
        :return: ``(neg_u, neg_v)`` arrays of length ``num_samples``
        :raises ValueError: if pairs are requested but no non-edge pair exists
        """
        src, dst = u.numpy(), v.numpy()
        # The explicit shape matters: without it the matrix is only as large
        # as the highest node ID that has an edge, which breaks on graphs
        # whose trailing nodes are isolated.
        adj = sp.coo_matrix(
            (np.ones(len(src)), (src, dst)), shape=(num_nodes, num_nodes)
        )
        # A pair is a candidate only when no edge connects it. Comparing with
        # zero (rather than testing ``1 - adj - I != 0``) keeps parallel edges
        # and existing self-loops out of the candidate set.
        # NOTE: this materialises a dense (num_nodes x num_nodes) matrix.
        is_candidate = np.asarray(adj.todense()) == 0
        np.fill_diagonal(is_candidate, False)  # never propose self-loops
        cand_u, cand_v = np.where(is_candidate)

        if num_samples > 0 and len(cand_u) == 0:
            raise ValueError(
                "Unable to sample negative edges: "
                "every pair of distinct nodes is already connected"
            )
        # Draw distinct pairs whenever enough candidates exist, so the same
        # negative pair cannot appear in more than one split; fall back to
        # sampling with replacement only on (near-)complete graphs.
        with_replacement = len(cand_u) < num_samples
        chosen = np.random.choice(
            len(cand_u), num_samples, replace=with_replacement
        )
        return cand_u[chosen], cand_v[chosen]

    @classmethod
    def __split_edges_train_val_test(
            cls, g: dgl.DGLGraph,
            train_ratio: float, val_ratio: float
    ) -> _typing.Tuple[
        dgl.DGLGraph, dgl.DGLGraph, dgl.DGLGraph,
        dgl.DGLGraph, dgl.DGLGraph, dgl.DGLGraph, dgl.DGLGraph
    ]:
        """Split the edges of ``g`` into train / valid / test sets.

        :param g: graph whose edges are split
        :param train_ratio: fraction of edges kept for training
        :param val_ratio: fraction of edges used for validation
        :return: ``(train_g, train_pos_g, train_neg_g, valid_pos_g,
            valid_neg_g, test_pos_g, test_neg_g)`` where ``train_g`` is ``g``
            with the validation and test edges removed
        """
        u, v = g.edges()
        num_edges = g.number_of_edges()
        num_nodes = g.number_of_nodes()

        eids = np.random.permutation(np.arange(num_edges))

        valid_size = int(num_edges * val_ratio)
        test_size = int(num_edges * (1 - train_ratio - val_ratio))
        held_out = test_size + valid_size  # everything that is not training

        # Permuted edge IDs are laid out as [test | valid | train].
        test_ids = eids[:test_size]
        valid_ids = eids[test_size:held_out]
        train_ids = eids[held_out:]

        # One negative pair per positive edge, partitioned the same way.
        neg_u, neg_v = cls.__sample_negative_edges(u, v, num_nodes, num_edges)

        train_g = dgl.remove_edges(g, eids[:held_out])

        train_pos_g = cls.__graph_from_edges(u[train_ids], v[train_ids], num_nodes)
        train_neg_g = cls.__graph_from_edges(
            neg_u[held_out:], neg_v[held_out:], num_nodes
        )

        valid_pos_g = cls.__graph_from_edges(u[valid_ids], v[valid_ids], num_nodes)
        valid_neg_g = cls.__graph_from_edges(
            neg_u[test_size:held_out], neg_v[test_size:held_out], num_nodes
        )

        test_pos_g = cls.__graph_from_edges(u[test_ids], v[test_ids], num_nodes)
        test_neg_g = cls.__graph_from_edges(
            neg_u[:test_size], neg_v[:test_size], num_nodes
        )

        return (
            train_g, train_pos_g, train_neg_g,
            valid_pos_g, valid_neg_g, test_pos_g, test_neg_g
        )

    @classmethod
    def __split_edges_train_test(
            cls, g: dgl.DGLGraph, train_ratio: float
    ) -> _typing.Tuple[
        dgl.DGLGraph, dgl.DGLGraph, dgl.DGLGraph,
        dgl.DGLGraph, dgl.DGLGraph,
    ]:
        """Split the edges of ``g`` into train / test sets.

        :param g: graph whose edges are split
        :param train_ratio: fraction of edges kept for training
        :return: ``(train_g, train_pos_g, train_neg_g, test_pos_g,
            test_neg_g)`` where ``train_g`` is ``g`` with the test edges removed
        """
        u, v = g.edges()
        num_edges = g.number_of_edges()
        num_nodes = g.number_of_nodes()

        eids = np.random.permutation(np.arange(num_edges))
        test_size = int(num_edges * (1 - train_ratio))

        # Permuted edge IDs are laid out as [test | train].
        test_ids = eids[:test_size]
        train_ids = eids[test_size:]

        # One negative pair per positive edge, partitioned the same way.
        neg_u, neg_v = cls.__sample_negative_edges(u, v, num_nodes, num_edges)

        train_g = dgl.remove_edges(g, test_ids)

        train_pos_g = cls.__graph_from_edges(u[train_ids], v[train_ids], num_nodes)
        train_neg_g = cls.__graph_from_edges(
            neg_u[test_size:], neg_v[test_size:], num_nodes
        )

        test_pos_g = cls.__graph_from_edges(u[test_ids], v[test_ids], num_nodes)
        test_neg_g = cls.__graph_from_edges(
            neg_u[:test_size], neg_v[:test_size], num_nodes
        )

        return train_g, train_pos_g, train_neg_g, test_pos_g, test_neg_g

    @classmethod
    def split_edges_for_dgl_graph(
            cls, graph: dgl.DGLGraph,
            train_ratio: float, val_ratio: _typing.Optional[float] = ...
    ) -> _typing.Union[
        _typing.Tuple[
            dgl.DGLGraph, dgl.DGLGraph,
            dgl.DGLGraph, dgl.DGLGraph,
            dgl.DGLGraph, dgl.DGLGraph,
            dgl.DGLGraph
        ],
        _typing.Tuple[
            dgl.DGLGraph, dgl.DGLGraph, dgl.DGLGraph,
            dgl.DGLGraph, dgl.DGLGraph,
        ]
    ]:
        """Validate the ratios and dispatch to the matching split routine.

        :param graph: graph whose edges are split
        :param train_ratio: fraction of edges for training, strictly in (0, 1)
        :param val_ratio: fraction of edges for validation. When it is a
            ``float`` a train/valid/test split is produced; any other value
            (including the default ``...`` and ``None``) selects a
            train/test split.
        :return: a 7-tuple ``(train_g, train_pos_g, train_neg_g, valid_pos_g,
            valid_neg_g, test_pos_g, test_neg_g)`` for the three-way split,
            or a 5-tuple ``(train_g, train_pos_g, train_neg_g, test_pos_g,
            test_neg_g)`` for the two-way split
        :raises ValueError: if ``train_ratio``, ``val_ratio`` or their sum
            is not strictly between 0 and 1
        """
        if not 0 < train_ratio < 1:
            raise ValueError(f"Invalid train_ratio as {train_ratio}")
        if not isinstance(val_ratio, float):
            return cls.__split_edges_train_test(graph, train_ratio)

        if not 0 < val_ratio < 1:
            raise ValueError(f"Invalid val_ratio as {val_ratio}")
        if not 0 < train_ratio + val_ratio < 1:
            raise ValueError(
                f"Invalid combination (train_ratio, val_ratio) "
                f"as ({train_ratio}, {val_ratio})"
            )
        return cls.__split_edges_train_val_test(graph, train_ratio, val_ratio)
-
-
def split_edges_for_data(
        data: _typing.Union[dgl.DGLGraph, autogl.data.graph.GeneralStaticGraph],
        train_ratio: float, val_ratio: _typing.Optional[float] = ...
) -> _typing.Union[
    _typing.Tuple[
        dgl.DGLGraph, dgl.DGLGraph,
        dgl.DGLGraph, dgl.DGLGraph,
        dgl.DGLGraph, dgl.DGLGraph,
        dgl.DGLGraph
    ],
    _typing.Tuple[
        dgl.DGLGraph, dgl.DGLGraph, dgl.DGLGraph,
        dgl.DGLGraph, dgl.DGLGraph,
    ]
]:
    """Split the edges of a homogeneous graph for link prediction.

    :param data: a DGL graph, or a ``GeneralStaticGraph`` which is converted
        to a DGL graph before splitting
    :param train_ratio: forwarded to
        ``_SplitEdgesDGLImpl.split_edges_for_dgl_graph``
    :param val_ratio: forwarded to
        ``_SplitEdgesDGLImpl.split_edges_for_dgl_graph``
    :return: whatever ``_SplitEdgesDGLImpl.split_edges_for_dgl_graph``
        returns for the resolved DGL graph
    :raises ValueError: if the provided graph is not homogeneous
    :raises TypeError: if ``data`` is neither of the supported graph types
    """
    # Normalise the input to a DGL graph first, then delegate exactly once.
    if isinstance(data, dgl.DGLGraph):
        if not data.is_homogeneous:
            raise ValueError(
                "provided DGL graph to split edges MUST be homogeneous"
            )
        dgl_graph = data
    elif isinstance(data, autogl.data.graph.GeneralStaticGraph):
        if not data.nodes.is_homogeneous or not data.edges.is_homogeneous:
            raise ValueError(
                "Provided instance of GeneralStaticGraph MUST be homogeneous"
            )
        dgl_graph = general_static_graph_to_dgl_graph(data)
    else:
        raise TypeError(f"Illegal provided data {data}")

    return _SplitEdgesDGLImpl.split_edges_for_dgl_graph(
        dgl_graph, train_ratio, val_ratio
    )
|