|
- import numpy as np
- import copy
- from .base import BaseGenerator
- from tqdm import tqdm
- from ....utils import get_logger
- from .. import register_feature
-
- LOGGER = get_logger("Feature")
-
-
- class Graphlet:
- def __init__(self, data, sample_error=0.1, sample_confidence=0.1):
- self._data = data
- self._init()
-
- self._sample_error = sample_error
- self._sample_confidence = sample_confidence
- self._dw = int(
- np.ceil(
- 0.5 * (self._sample_error ** -2) * np.log(2 / self._sample_confidence)
- )
- )
- LOGGER.info(
- "sample error {} , confidence {},num {}".format(
- self._sample_error, self._sample_confidence, self._dw
- )
- )
-
- def _init(self):
- self._edges = list(self._data.edge_index)
- self._edges = [self._edges[0], self._edges[1]]
- self._num_nodes = self._data.x.shape[0]
- self._num_edges = len(self._edges[0])
- self._neighbours = [[] for _ in range(self._num_nodes)]
- for i in range(len(self._edges[0])):
- u, v = self._edges[0][i], self._edges[1][i]
- self._neighbours[u].append(v)
-
- LOGGER.info("nodes {} , edges {}".format(self._num_nodes, self._num_edges))
-
- # sorting
- self._node_degrees = np.array([len(x) for x in self._neighbours])
- self._nodes = np.argsort(self._node_degrees)
- for i in self._nodes:
- self._neighbours[i] = [
- x
- for _, x in sorted(
- zip(self._node_degrees[self._neighbours[i]], self._neighbours[i]),
- reverse=True,
- )
- ]
- self._neighbours = [np.array(x) for x in self._neighbours]
-
- def _get_gdv(self, v, u):
- if self._node_degrees[v] >= self._node_degrees[u]:
- pass
- else:
- u, v = v, u
- Sv, Su, Te = set(), set(), set()
- sigma1, sigma2 = 0, 0
- nb = self._neighbours
- N = self._num_nodes
- M = self._num_edges
- phi = np.zeros(self._num_nodes, dtype=int)
- c1, c2, c3, c4 = 1, 2, 3, 4
- x = np.zeros(16, dtype=int)
- # p1
- for w in nb[v]:
- if w != u:
- Sv.add(w)
- phi[w] = c1
- # p2
- for w in nb[u]:
- if w != v:
- if phi[w] == c1:
- Te.add(w)
- phi[w] = c3
- Sv.remove(w)
- else:
- Su.add(w)
- phi[w] = c2
- # p3
- for w in Te:
- for r in nb[w]:
- if phi[r] == c3:
- x[5] += 1
- phi[w] = c4
- sigma2 = sigma2 + len(nb[w]) - 2
- # p4
- for w in Su:
- for r in nb[w]:
- if phi[r] == c1:
- x[8] += 1
- if phi[r] == c2:
- x[7] += 1
- if phi[r] == c4:
- sigma1 += 1
- phi[w] = 0
- sigma2 = sigma2 + len(nb[w]) - 1
- # p5
- for w in Sv:
- for r in nb[w]:
- if phi[r] == c1:
- x[7] += 1
- if phi[r] == c4:
- sigma1 += 1
- phi[w] = 0
- sigma2 = sigma2 + len(nb[w]) - 1
-
- lsv, lsu, lte, du, dv = len(Sv), len(Su), len(Te), len(nb[u]), len(nb[v])
- # 3-graphlet
- x[1] = lte
- x[2] = du + dv - 2 - 2 * x[1]
- x[3] = N - x[2] - x[1] - 2
- x[4] = N * (N - 1) * (N - 2) / 6 - (x[1] + x[2] + x[3])
- # 4 connected graphlets
- x[6] = x[1] * (x[1] - 1) / 2 - x[5]
- x[10] = lsv * lsu - x[8]
- x[9] = lsv * (lsv - 1) / 2 + lsu * (lsu - 1) / 2 - x[7]
- # 4 diconnected graphlets
- t1 = N - (lte + lsu + lsv + 2)
- x[11] = x[1] * t1
- x[12] = M - (du + dv - 1) - (sigma2 - sigma1 - x[5] - x[8] - x[7])
- x[13] = (lsu + lsv) * t1
- x[14] = t1 * (t1 - 1) / 2 - x[12]
- x[15] = N * (N - 1) * (N - 2) * (N - 3) / 24 - np.sum(x[5:15])
-
- return x
-
- def _get_gdv_sample(self, v, u):
- if self._node_degrees[v] >= self._node_degrees[u]:
- pass
- else:
- u, v = v, u
- Sv = set()
- sigma1, sigma2 = 0, 0
- nb = self._neighbours
- N = self._num_nodes
- M = self._num_edges
- phi = np.zeros(self._num_nodes, dtype=int)
- c1, c2, c3, c4 = 1, 2, 3, 4
- x = np.zeros(16)
- dw = self._dw
-
- # p1
- Sv = set(nb[v][nb[v] != u])
- phi[list(Sv)] = c1
- # p2
- p2w = nb[u][nb[u] != c1]
- p2w1 = p2w[phi[p2w] == c1]
- p2w2 = p2w[phi[p2w] != c1]
- Te = p2w1
- phi[p2w1] = c3
- Sv -= set(list(p2w1))
- Su = p2w2
- phi[p2w2] = c2
- # p3
- for w in Te:
- if dw >= len(nb[w]):
- region = nb[w]
- inc = 1
- else:
- region = np.random.choice(nb[w], dw, replace=False)
- inc = self._node_degrees[w] / dw
- phir = phi[region]
- x[5] += inc * np.sum(phir == c3)
- phi[w] = c4
- sigma2 = sigma2 + len(nb[w]) - 2
- # p4
- for w in Su:
- if dw >= len(nb[w]):
- region = nb[w]
- inc = 1
- else:
- region = np.random.choice(nb[w], dw, replace=False)
- inc = self._node_degrees[w] / dw
- phir = phi[region]
- x[8] += inc * np.sum(phir == c1)
- x[7] += inc * np.sum(phir == c2)
- sigma1 += inc * np.sum(phir == c4)
- phi[w] = 0
- sigma2 = sigma2 + len(nb[w]) - 1
- # p5
- for w in Sv:
- if dw >= len(nb[w]):
- region = nb[w]
- inc = 1
- else:
- region = np.random.choice(nb[w], dw, replace=False)
- inc = self._node_degrees[w] / dw
- phir = phi[region]
- x[7] += inc * np.sum(phir == c1)
- sigma1 += inc * np.sum(phir == c4)
- phi[w] = 0
- sigma2 = sigma2 + len(nb[w]) - 1
-
- lsv, lsu, lte, du, dv = len(Sv), len(Su), len(Te), len(nb[u]), len(nb[v])
- # 3-graphlet
- x[1] = lte
- x[2] = du + dv - 2 - 2 * x[1]
- x[3] = N - x[2] - x[1] - 2
- x[4] = N * (N - 1) * (N - 2) / 6 - (x[1] + x[2] + x[3])
- # 4 connected graphlets
- x[6] = x[1] * (x[1] - 1) / 2 - x[5]
- x[10] = lsv * lsu - x[8]
- x[9] = lsv * (lsv - 1) / 2 + lsu * (lsu - 1) / 2 - x[7]
- # 4 diconnected graphlets
- t1 = N - (lte + lsu + lsv + 2)
- x[11] = x[1] * t1
- x[12] = M - (du + dv - 1) - (sigma2 - sigma1 - x[5] - x[8] - x[7])
- x[13] = (lsu + lsv) * t1
- x[14] = t1 * (t1 - 1) / 2 - x[12]
- x[15] = N * (N - 1) * (N - 2) * (N - 3) / 24 - np.sum(x[5:15])
-
- return x
-
- def get_gdvs(self, sample=True):
- res = np.zeros((self._num_nodes, 15))
- for u in tqdm(range(self._num_nodes)):
- vs = self._neighbours[u]
- if len(vs) != 0:
- gdvs = []
- for v in tqdm(vs, disable=len(vs) < 100):
- if sample:
- gdvs.append(self._get_gdv_sample(u, v))
- else:
- gdvs.append(self._get_gdv(u, v))
- res[u, :] = np.mean(gdvs, axis=0)[1:]
- return res
-
- def get_gdvs_cp(self, workers="max"):
- r"""
- c++ parallel , same function as get_gdvs
- """
- tmpfile = "tmp.mtx"
- tmpmicro = "tmp.micro"
- self._save(tmpfile)
- os.system(
- "{} -f {} --micro {} -w {}".format(pgd_path, tmpfile, tmpmicro, workers)
- )
- return self._load(tmpmicro)
-
- def _save(self, filename):
- with open(filename, "w") as file:
- file.write(
- "{} {} {}\n".format(self._num_nodes, self._num_nodes, self._num_edges)
- )
- for u in self._nodes:
- for v in self._neighbours[u]:
- file.write("{} {}\n".format(u + 1, v + 1))
-
- def _load(self, filename):
- df = pd.read_csv(filename)
- edges = df[["% src", "dst"]].values
- egdvs = df.values[:, 2:]
-
- num_nodes = np.max(edges)
- ngdvs = np.zeros((num_nodes, 8))
-
- nbs = [[] for _ in range(num_nodes)]
- for i, (u, v) in enumerate(edges):
- u -= 1
- v -= 1
- nbs[u].append(i)
- nbs[v].append(i)
-
- for i in range(num_nodes):
- if len(nbs[i]) != 0:
- ngdvs[i, :] = np.mean(egdvs[nbs[i]], axis=0)
- return ngdvs
-
-
- @register_feature("graphlet")
- class GeGraphlet(BaseGenerator):
- r"""generate local graphlet numbers as features. The implementation refers to [#]_ .
-
- References
- ----------
- .. [#] Ahmed, N. K., Willke, T. L., & Rossi, R. A. (2016).
- Estimation of local subgraph counts. Proceedings - 2016 IEEE International Conference on Big Data, Big Data 2016, 586–595.
- https://doi.org/10.1109/BigData.2016.7840651
-
- """
-
- def __init__(self, workers=1):
- super(GeGraphlet, self).__init__()
- self.workers = workers
-
- def _transform(self, data):
- r"""graphlet degree vectors"""
- gl = Graphlet(data)
- # res=gl.get_gdvs_cp(self.workers)
- res = gl.get_gdvs()
- data.x = np.concatenate([data.x, res], axis=1)
- return data
|