from sklearn import preprocessing
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity as cos_sim
import copy
from tqdm import tqdm
from tabulate import tabulate
import time

from .base import BaseFeature, BaseFeatureEngineer
from .selectors import SeGBDT
from . import register_feature

from ...utils import get_logger
import torch

LOGGER = get_logger("Feature")


- @register_feature("identity")
- class FeIdentity(BaseFeatureEngineer):
- r"""it is a dummy feature engineer , which directly returns identical data"""
-
- def __init__(self, *args, **kwargs):
- super(FeIdentity, self).__init__(multigraph=True, *args, **kwargs)
-
-
- @register_feature("onlyconst")
- class Onlyconst(BaseFeatureEngineer):
- r"""it is a dummy feature engineer , which directly returns identical data"""
-
- def __init__(self, *args, **kwargs):
- super(Onlyconst, self).__init__(
- data_t="tensor", multigraph=True, *args, **kwargs
- )
-
- def _transform(self, data):
- if "x" in data:
- data.x = torch.ones((data.x.shape[0], 1))
- else:
- data.x = torch.ones((torch.unique(data.edge_index).shape[0], 1))
- return data
-
-
def op_sum(x, nbs):
    """For every node, sum the feature vectors of its neighbours."""
    res = np.zeros_like(x)
    for u in range(len(nbs)):
        nb = nbs[u]
        if len(nb) != 0:
            res[u] = np.sum(x[nb], axis=0)
    return res


def op_mean(x, nbs):
    """For every node, average the feature vectors of its neighbours."""
    res = np.zeros_like(x)
    for u in range(len(nbs)):
        nb = nbs[u]
        if len(nb) != 0:
            res[u] = np.mean(x[nb], axis=0)
    return res


def op_max(x, nbs):
    """For every node, take the element-wise maximum over its neighbours' features."""
    res = np.zeros_like(x)
    for u in range(len(nbs)):
        nb = nbs[u]
        if len(nb) != 0:
            res[u] = np.max(x[nb], axis=0)
    return res


def op_min(x, nbs):
    """For every node, take the element-wise minimum over its neighbours' features."""
    res = np.zeros_like(x)
    for u in range(len(nbs)):
        nb = nbs[u]
        if len(nb) != 0:
            res[u] = np.min(x[nb], axis=0)
    return res


def op_prod(x, nbs):
    """For every node, take the element-wise product over its neighbours' features."""
    res = np.zeros_like(x)
    for u in range(len(nbs)):
        nb = nbs[u]
        if len(nb) != 0:
            res[u] = np.prod(x[nb], axis=0)
    return res

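
# A minimal, self-contained sketch of what the aggregation operators above compute.
# Illustrative only: the toy graph, feature values and this helper's name are
# hypothetical and not part of the pipeline.
def _demo_neighbour_ops():
    # path graph 0 - 1 - 2 with one scalar feature per node
    x = np.array([[1.0], [2.0], [4.0]])
    nbs = [np.array([1]), np.array([0, 2]), np.array([1])]
    summed = op_sum(x, nbs)    # node 1 gets 1 + 4 = 5
    averaged = op_mean(x, nbs)  # node 1 gets (1 + 4) / 2 = 2.5
    return summed, averaged
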

mms = preprocessing.MinMaxScaler()
ss = preprocessing.StandardScaler()


def scale(x):
    """Standardize features column-wise (zero mean, unit variance)."""
    return ss.fit_transform(x)


class Timer:
    """Soft time-budget tracker based on a smoothed estimate of per-iteration cost."""

    def __init__(self, timebudget=None):
        self._timebudget = timebudget
        self._esti_time = 0
        self._g_start = time.time()

    def start(self):
        self._start = time.time()

    def end(self):
        # keep a (crudely) smoothed estimate of how long one iteration takes
        time_use = time.time() - self._start
        self._esti_time = (self._esti_time + time_use) / 2

    def is_timeout(self):
        # time out when the remaining budget is smaller than the estimated cost
        # of one more iteration; no budget (None) means never time out
        timebudget = self._timebudget
        if timebudget:
            timebudget = self._timebudget - (time.time() - self._g_start)
            if timebudget < self._esti_time:
                return True
        return False

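
# A minimal usage sketch of ``Timer`` (illustrative only: the loop and the sleeping
# workload below are hypothetical and not part of the feature engineering pipeline).
def _timer_usage_example(timebudget=1.0, n_iters=100):
    timer = Timer(timebudget)
    done = 0
    for _ in range(n_iters):
        timer.start()
        time.sleep(0.01)  # stand-in for one iteration of real work
        timer.end()
        done += 1
        # stop early once one more iteration is estimated not to fit in the budget
        if timer.is_timeout():
            break
    return done
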

@register_feature("deepgl")
class AutoFeatureEngineer(BaseFeatureEngineer):
    r"""
    Notes
    -----
    An implementation of the automatic feature engineering method DeepGL [#]_, which
    iteratively generates new features by aggregating neighbour features and, in each
    epoch, selects a fixed number of them, so that important graph-aware features are
    added automatically.

    References
    ----------
    .. [#] Rossi, R. A., Zhou, R., & Ahmed, N. K. (2020).
        Deep Inductive Graph Representation Learning.
        IEEE Transactions on Knowledge and Data Engineering, 32(3), 438–452.
        https://doi.org/10.1109/TKDE.2018.2878247

    Parameters
    ----------
    fixlen : int
        Number of features kept in every epoch. The final number of features added is
        ``fixlen * max_epoch`` (``200 * 5`` by default).
    max_epoch : int
        Number of epochs in the whole process.
    timebudget : int
        Time budget (in seconds) for the feature engineering process; ``None`` means no
        budget. Note that this is a soft budget: it relies on a rough estimate of the
        per-iteration cost obtained from previous iterations, so the actual running
        time may exceed it.
    y_sel_func : Callable
        Feature selector class used for selection at each iteration; a LightGBM-based
        selector (``SeGBDT``) by default. Note that the original paper selects features
        via connected components of a feature graph, which you may implement yourself
        if you want.
    verbosity : int
        Hide all information except errors and fatal messages if ``verbosity`` < 1.
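
    Examples
    --------
    A minimal usage sketch. The ``data`` object below is assumed, not constructed
    here: any graph with a numpy feature matrix ``x`` of shape
    ``(num_nodes, num_features)``, an ``edge_index`` of shape ``(2, num_edges)`` and
    whatever labels the selector requires; the public ``fit``/``transform`` entry
    points are assumed to be provided by ``BaseFeatureEngineer``.

    >>> fe = AutoFeatureEngineer(fixlen=100, max_epoch=3, timebudget=600)
    >>> data = fe._fit(data)  # generates, selects and appends features; the
    ...                       # per-epoch selections are kept in ``fe._sel`` and
    ...                       # replayed by ``_transform`` on later calls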
- """

    def __init__(
        self,
        fixlen=200,
        max_epoch=5,
        timebudget=None,
        y_sel_func=SeGBDT,
        verbosity=-1,
        *args,
        **kwargs
    ):
        super(AutoFeatureEngineer, self).__init__(multigraph=False, *args, **kwargs)
        # neighbour aggregation operators used to generate candidate features
        self._ops = [op_sum, op_mean, op_max, op_min]
        self._sim = cos_sim
        self._fixlen = fixlen
        self._max_epoch = max_epoch
        self._timebudget = timebudget
        self._y_sel_func = y_sel_func(
            fixlen, verbose_eval=verbosity >= 1, *args, **kwargs
        )
        self._verbosity = verbosity

    def _init(self, data):
        # self._data = copy.deepcopy(data)
        self._data = data
        self._num_nodes = data.x.shape[0]
        self._x = data.x
        self._edges = data.edge_index
        # build a neighbour (adjacency) list for every node from the edge index
        self._neighbours = [[] for _ in range(self._num_nodes)]
        for u, v in self._edges.T:
            self._neighbours[u].append(v)
        self._neighbours = [np.array(v) for v in self._neighbours]

    def _gen(self, x):
        # apply every aggregation operator and concatenate the results column-wise,
        # multiplying the number of candidate features by ``len(self._ops)``
        res = []
        for op in self._ops:
            res.append(op(x, self._neighbours))
        res = np.concatenate(res, axis=1)
        return res

    def _fit(self, data):
        self._init(data)
        x = self._x.copy()
        gx = x.copy()
        verbs = []
        data = self._data
        max_epoch = self._max_epoch
        timebudget = self._timebudget
        y_sel_func = self._y_sel_func
        soft_timer = Timer(timebudget)
        self._sel = []
        for epoch in tqdm(range(max_epoch), disable=self._verbosity <= 0):
            soft_timer.start()
            verb = [epoch, gx.shape[1]]
            # generate candidate features by neighbour aggregation, then standardize
            gx = self._gen(gx)
            gx = scale(gx)
            verb.append(gx.shape[1])
            data.x = gx
            # data = feat_diffuse(data)
            # select the ``fixlen`` most useful candidates and remember the selection
            y_sel_func._fit(data)
            self._sel.append(y_sel_func._sel)
            data = y_sel_func._transform(data)
            gx = data.x
            verb.append(gx.shape[1])
            # append the selected features to the running feature matrix
            x = np.concatenate([x, gx], axis=1)
            verbs.append(verb)
            soft_timer.end()
            if soft_timer.is_timeout():
                break
        if self._verbosity >= 1:
            LOGGER.info(
                tabulate(verbs, headers="epoch origin after-gen after-sel".split())
            )
        data.x = x
        return data

    def _transform(self, data):
        x = data.x
        gx = x.copy()
        # replay the per-epoch generation and selection recorded during fitting
        for sel in self._sel:
            gx = self._gen(gx)
            gx = scale(gx)
            gx = gx[:, sel]
            x = np.concatenate([x, gx], axis=1)
        data.x = x
        return data
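

# A minimal, self-contained sketch of the DeepGL-style loop implemented in
# ``_fit``/``_transform`` above: generate neighbour aggregations, standardize,
# keep a fixed number of columns, and append them to the running feature matrix.
# Everything below is hypothetical; in particular, keeping the first ``fixlen``
# columns merely stands in for the label-aware ``SeGBDT`` selector.
def _demo_deepgl_loop(num_nodes=8, num_feats=3, fixlen=4, max_epoch=2, seed=0):
    rng = np.random.RandomState(seed)
    x = rng.rand(num_nodes, num_feats)
    # hypothetical toy graph: random neighbour lists
    nbs = [
        np.array([v for v in range(num_nodes) if v != u and rng.rand() < 0.4])
        for u in range(num_nodes)
    ]
    gx = x.copy()
    sels = []
    for _ in range(max_epoch):
        gx = np.concatenate(
            [op(gx, nbs) for op in [op_sum, op_mean, op_max, op_min]], axis=1
        )                                          # generate candidates
        gx = scale(gx)                             # standardize
        sel = np.arange(min(fixlen, gx.shape[1]))  # stand-in for SeGBDT selection
        sels.append(sel)
        gx = gx[:, sel]                            # keep ``fixlen`` columns
        x = np.concatenate([x, gx], axis=1)        # append to the feature matrix
    return x, sels  # x ends with num_feats + fixlen * max_epoch columns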