|
- import logging
- import numpy as np
- import torch
- from tqdm import tqdm
- import autogl
- from ._basic import BaseFeatureGenerator
- from .._feature_engineer_registry import FeatureEngineerUniversalRegistry
-
- _LOGGER = logging.getLogger("FE")
-
-
- class _Graphlet:
- def __init__(self, data, sample_error=0.1, sample_confidence=0.1):
- self._data = data
- self._init()
-
- self._sample_error = sample_error
- self._sample_confidence = sample_confidence
- self._dw = int(
- np.ceil(
- 0.5 * (self._sample_error ** -2) * np.log(2 / self._sample_confidence)
- )
- )
- _LOGGER.info(
- "sample error {} , confidence {},num {}".format(
- self._sample_error, self._sample_confidence, self._dw
- )
- )
-
- def _init(self):
- self._edges = list(self._data.edge_index)
- self._edges = [self._edges[0], self._edges[1]]
- self._num_nodes = self._data.x.shape[0]
- self._num_edges = len(self._edges[0])
- self._neighbours = [[] for _ in range(self._num_nodes)]
- for i in range(len(self._edges[0])):
- u, v = self._edges[0][i], self._edges[1][i]
- self._neighbours[u].append(v)
-
- _LOGGER.info("nodes {} , edges {}".format(self._num_nodes, self._num_edges))
-
- # sorting
- self._node_degrees = np.array([len(x) for x in self._neighbours])
- self._nodes = np.argsort(self._node_degrees)
- for i in self._nodes:
- self._neighbours[i] = [
- x
- for _, x in sorted(
- zip(self._node_degrees[self._neighbours[i]], self._neighbours[i]),
- reverse=True,
- )
- ]
- self._neighbours = [np.array(x) for x in self._neighbours]
-
- def _get_gdv(self, v, u):
- if self._node_degrees[v] >= self._node_degrees[u]:
- pass
- else:
- u, v = v, u
- Sv, Su, Te = set(), set(), set()
- sigma1, sigma2 = 0, 0
- nb = self._neighbours
- N = self._num_nodes
- M = self._num_edges
- phi = np.zeros(self._num_nodes, dtype=int)
- c1, c2, c3, c4 = 1, 2, 3, 4
- x = np.zeros(16, dtype=int)
- # p1
- for w in nb[v]:
- if w != u:
- Sv.add(w)
- phi[w] = c1
- # p2
- for w in nb[u]:
- if w != v:
- if phi[w] == c1:
- Te.add(w)
- phi[w] = c3
- Sv.remove(w)
- else:
- Su.add(w)
- phi[w] = c2
- # p3
- for w in Te:
- for r in nb[w]:
- if phi[r] == c3:
- x[5] += 1
- phi[w] = c4
- sigma2 = sigma2 + len(nb[w]) - 2
- # p4
- for w in Su:
- for r in nb[w]:
- if phi[r] == c1:
- x[8] += 1
- if phi[r] == c2:
- x[7] += 1
- if phi[r] == c4:
- sigma1 += 1
- phi[w] = 0
- sigma2 = sigma2 + len(nb[w]) - 1
- # p5
- for w in Sv:
- for r in nb[w]:
- if phi[r] == c1:
- x[7] += 1
- if phi[r] == c4:
- sigma1 += 1
- phi[w] = 0
- sigma2 = sigma2 + len(nb[w]) - 1
-
- lsv, lsu, lte, du, dv = len(Sv), len(Su), len(Te), len(nb[u]), len(nb[v])
- # 3-graphlet
- x[1] = lte
- x[2] = du + dv - 2 - 2 * x[1]
- x[3] = N - x[2] - x[1] - 2
- x[4] = N * (N - 1) * (N - 2) / 6 - (x[1] + x[2] + x[3])
- # 4 connected graphlets
- x[6] = x[1] * (x[1] - 1) / 2 - x[5]
- x[10] = lsv * lsu - x[8]
- x[9] = lsv * (lsv - 1) / 2 + lsu * (lsu - 1) / 2 - x[7]
- # 4 disconnected graphlets
- t1 = N - (lte + lsu + lsv + 2)
- x[11] = x[1] * t1
- x[12] = M - (du + dv - 1) - (sigma2 - sigma1 - x[5] - x[8] - x[7])
- x[13] = (lsu + lsv) * t1
- x[14] = t1 * (t1 - 1) / 2 - x[12]
- x[15] = N * (N - 1) * (N - 2) * (N - 3) / 24 - np.sum(x[5:15])
-
- return x
-
- def _get_gdv_sample(self, v, u):
- if self._node_degrees[v] >= self._node_degrees[u]:
- pass
- else:
- u, v = v, u
- Sv = set()
- sigma1, sigma2 = 0, 0
- nb = self._neighbours
- N = self._num_nodes
- M = self._num_edges
- phi = np.zeros(self._num_nodes, dtype=int)
- c1, c2, c3, c4 = 1, 2, 3, 4
- x = np.zeros(16)
- dw = self._dw
-
- # p1
- Sv = set(nb[v][nb[v] != u])
- phi[list(Sv)] = c1
- # p2
- p2w = nb[u][nb[u] != c1]
- p2w1 = p2w[phi[p2w] == c1]
- p2w2 = p2w[phi[p2w] != c1]
- Te = p2w1
- phi[p2w1] = c3
- Sv -= set(list(p2w1))
- Su = p2w2
- phi[p2w2] = c2
- # p3
- for w in Te:
- if dw >= len(nb[w]):
- region = nb[w]
- inc = 1
- else:
- region = np.random.choice(nb[w], dw, replace=False)
- inc = self._node_degrees[w] / dw
- phir = phi[region]
- x[5] += inc * np.sum(phir == c3)
- phi[w] = c4
- sigma2 = sigma2 + len(nb[w]) - 2
- # p4
- for w in Su:
- if dw >= len(nb[w]):
- region = nb[w]
- inc = 1
- else:
- region = np.random.choice(nb[w], dw, replace=False)
- inc = self._node_degrees[w] / dw
- phir = phi[region]
- x[8] += inc * np.sum(phir == c1)
- x[7] += inc * np.sum(phir == c2)
- sigma1 += inc * np.sum(phir == c4)
- phi[w] = 0
- sigma2 = sigma2 + len(nb[w]) - 1
- # p5
- for w in Sv:
- if dw >= len(nb[w]):
- region = nb[w]
- inc = 1
- else:
- region = np.random.choice(nb[w], dw, replace=False)
- inc = self._node_degrees[w] / dw
- phir = phi[region]
- x[7] += inc * np.sum(phir == c1)
- sigma1 += inc * np.sum(phir == c4)
- phi[w] = 0
- sigma2 = sigma2 + len(nb[w]) - 1
-
- lsv, lsu, lte, du, dv = len(Sv), len(Su), len(Te), len(nb[u]), len(nb[v])
- # 3-graphlet
- x[1] = lte
- x[2] = du + dv - 2 - 2 * x[1]
- x[3] = N - x[2] - x[1] - 2
- x[4] = N * (N - 1) * (N - 2) / 6 - (x[1] + x[2] + x[3])
- # 4 connected graphlets
- x[6] = x[1] * (x[1] - 1) / 2 - x[5]
- x[10] = lsv * lsu - x[8]
- x[9] = lsv * (lsv - 1) / 2 + lsu * (lsu - 1) / 2 - x[7]
- # 4 disconnected graphlets
- t1 = N - (lte + lsu + lsv + 2)
- x[11] = x[1] * t1
- x[12] = M - (du + dv - 1) - (sigma2 - sigma1 - x[5] - x[8] - x[7])
- x[13] = (lsu + lsv) * t1
- x[14] = t1 * (t1 - 1) / 2 - x[12]
- x[15] = N * (N - 1) * (N - 2) * (N - 3) / 24 - np.sum(x[5:15])
-
- return x
-
- def get_gdvs(self, sample=True):
- res = np.zeros((self._num_nodes, 15))
- for u in tqdm(range(self._num_nodes)):
- vs = self._neighbours[u]
- if len(vs) != 0:
- gdvs = []
- for v in tqdm(vs, disable=len(vs) < 100):
- if sample:
- gdvs.append(self._get_gdv_sample(u, v))
- else:
- gdvs.append(self._get_gdv(u, v))
- res[u, :] = np.mean(gdvs, axis=0)[1:]
- return res
-
-
@FeatureEngineerUniversalRegistry.register_feature_engineer("graph" + "let")
class GraphletGenerator(BaseFeatureGenerator):
    r"""Feature generator producing local graphlet counts for every node.

    The counting procedure follows [#]_ .

    References
    ----------
    .. [#] Ahmed, N. K., Willke, T. L., & Rossi, R. A. (2016).
        Estimation of local subgraph counts. Proceedings - 2016 IEEE International Conference on Big Data, Big Data 2016, 586–595.
        https://doi.org/10.1109/BigData.2016.7840651

    """

    def _extract_nodes_feature(self, data: autogl.data.Data) -> torch.Tensor:
        """Run ``_Graphlet`` on ``data`` with its default settings and wrap the array as a tensor."""
        counter = _Graphlet(data)
        node_features: np.ndarray = counter.get_gdvs()
        return torch.from_numpy(node_features)
|