From 2fd9237e375914def421a7d2d94fc7d1851f1bbd Mon Sep 17 00:00:00 2001 From: CoreLeader Date: Sat, 16 Oct 2021 05:00:00 +0800 Subject: [PATCH] Revision for Refactor of Dataset and Feature Engineer Debug for Default Implementation of GeneralStaticGraph Debug for OGB nodes datasets Improvement for feature generators and graph feature extractors to support any generic provided conventional data --- ...ral_static_graph_default_implementation.py | 2 +- .../utils/conversion/_nx.py | 2 +- autogl/datasets/_ogb.py | 29 ++--- .../module/feature/_base_feature_engineer.py | 100 +++++++----------- .../feature/_composed_feature_engineer.py | 37 ------- autogl/module/feature/_generators/_basic.py | 80 ++++++-------- autogl/module/feature/_graph/_netlsd.py | 65 +++++++++--- autogl/module/feature/_graph/_networkx.py | 65 ++++++++++-- autogl/module/hpo/autone.py | 2 +- 9 files changed, 193 insertions(+), 189 deletions(-) delete mode 100644 autogl/module/feature/_composed_feature_engineer.py diff --git a/autogl/data/graph/_general_static_graph/_general_static_graph_default_implementation.py b/autogl/data/graph/_general_static_graph/_general_static_graph_default_implementation.py index 469fa9b..b47de07 100644 --- a/autogl/data/graph/_general_static_graph/_general_static_graph_default_implementation.py +++ b/autogl/data/graph/_general_static_graph/_general_static_graph_default_implementation.py @@ -154,7 +154,7 @@ class HeterogeneousNodesContainerImplementation(HeterogeneousNodesContainer): if data.size(0) != obsolete_data.size(0): raise ValueError elif len(self.__nodes_data.get(__node_t)) > 0: - num_nodes: int = self.__nodes_data.get(__node_t).get(list(self.node_types)[0]).size(0) + num_nodes: int = self.__nodes_data[__node_t][list(self.__nodes_data[__node_t].keys())[0]].size(0) if data.size(0) != num_nodes: raise ValueError self.__nodes_data[__node_t][data_key] = data diff --git a/autogl/data/graph/_general_static_graph/utils/conversion/_nx.py b/autogl/data/graph/_general_static_graph/utils/conversion/_nx.py index 2fa440f..d98cf09 100644 --- a/autogl/data/graph/_general_static_graph/utils/conversion/_nx.py +++ b/autogl/data/graph/_general_static_graph/utils/conversion/_nx.py @@ -61,7 +61,7 @@ class HomogeneousStaticGraphToNetworkX: homogeneous_static_graph.edges.data[data_key].squeeze().tolist() ) for i, (u, v) in enumerate(homogeneous_static_graph.edges.connections.t().tolist()): - if (__remove_self_loops and v == u) or (__to_undirected and v > u): + if __remove_self_loops and v == u: continue g.add_edge(u, v) for data_key in edges_data: diff --git a/autogl/datasets/_ogb.py b/autogl/datasets/_ogb.py index a4abb26..016fe75 100644 --- a/autogl/datasets/_ogb.py +++ b/autogl/datasets/_ogb.py @@ -57,11 +57,11 @@ class _OGBNDatasetUtil(_OGBDatasetUtil): homogeneous_static_graph.nodes.data['train_mask'] = index_to_mask( torch.from_numpy(train_index), ogbn_data['num_nodes'] ) - if val_index not in (Ellipsis, None) and isinstance(val_index, np.ndarray): + if isinstance(val_index, np.ndarray): homogeneous_static_graph.nodes.data['val_mask'] = index_to_mask( torch.from_numpy(val_index), ogbn_data['num_nodes'] ) - if test_index not in (Ellipsis, None) and isinstance(test_index, np.ndarray): + if isinstance(test_index, np.ndarray): homogeneous_static_graph.nodes.data['test_mask'] = index_to_mask( torch.from_numpy(test_index), ogbn_data['num_nodes'] ) @@ -105,8 +105,7 @@ class OGBNProductsDataset(InMemoryStaticGraphSet): super(OGBNProductsDataset, self).__init__([ _OGBNDatasetUtil.ogbn_dataset_to_general_static_graph( ogbn_dataset, "y", - {"node_feat": "x"}, - {"edge_feat": "edge_feat"} + {"node_feat": "x"} ) ]) @@ -119,10 +118,7 @@ class OGBNProteinsDataset(InMemoryStaticGraphSet): super(OGBNProteinsDataset, self).__init__([ _OGBNDatasetUtil.ogbn_dataset_to_general_static_graph( ogbn_dataset, "label", - { - "node_feat": "feat", - "node_species": "species" - }, + {"node_species": "species"}, {"edge_feat": "edge_feat"} ) ]) @@ -130,10 +126,7 @@ class OGBNProteinsDataset(InMemoryStaticGraphSet): super(OGBNProteinsDataset, self).__init__([ _OGBNDatasetUtil.ogbn_dataset_to_general_static_graph( ogbn_dataset, "y", - { - "node_feat": "x", - "node_species": "species" - }, + {"node_species": "species"}, {"edge_feat": "edge_feat"} ) ]) @@ -150,8 +143,7 @@ class OGBNArxivDataset(InMemoryStaticGraphSet): { "node_feat": "feat", "node_year": "year" - }, - {"edge_feat": "edge_feat"} + } ) ]) elif _backend.DependentBackend.is_pyg(): @@ -161,8 +153,7 @@ class OGBNArxivDataset(InMemoryStaticGraphSet): { "node_feat": "x", "node_year": "year" - }, - {"edge_feat": "edge_feat"} + } ) ]) @@ -178,8 +169,7 @@ class OGBNPapers100MDataset(InMemoryStaticGraphSet): { "node_feat": "feat", "node_year": "year" - }, - {"edge_feat": "edge_feat"} + } ) ]) elif _backend.DependentBackend.is_pyg(): @@ -189,8 +179,7 @@ class OGBNPapers100MDataset(InMemoryStaticGraphSet): { "node_feat": "x", "node_year": "year" - }, - {"edge_feat": "edge_feat"} + } ) ]) diff --git a/autogl/module/feature/_base_feature_engineer.py b/autogl/module/feature/_base_feature_engineer.py index 87031dd..9f77f3e 100644 --- a/autogl/module/feature/_base_feature_engineer.py +++ b/autogl/module/feature/_base_feature_engineer.py @@ -1,28 +1,27 @@ import copy +import logging import torch import typing as _typing -from autogl.data.graph import GeneralStaticGraph -from autogl.data import InMemoryStaticGraphSet +from autogl.data import Dataset -from ...utils import get_logger - -LOGGER = get_logger("FeatureEngineer") +LOGGER = logging.getLogger("FeatureEngineer") class _BaseFeatureEngineer: def __and__(self, other): raise NotImplementedError - def fit( - self, in_memory_static_graph_set: InMemoryStaticGraphSet, - inplace: bool = True - ): + def fit_transform(self, dataset: Dataset, inplace=True) -> Dataset: + """ + Fit and transform dataset inplace or not w.r.t bool argument ``inplace`` + """ + dataset = self.fit(dataset) + return self.transform(dataset, inplace=inplace) + + def fit(self, dataset: Dataset) -> Dataset: raise NotImplementedError - def transform( - self, in_memory_static_graph_set: InMemoryStaticGraphSet, - inplace: bool = True - ) -> InMemoryStaticGraphSet: + def transform(self, dataset: Dataset, inplace: bool = True) -> Dataset: raise NotImplementedError @@ -42,73 +41,50 @@ class _ComposedFeatureEngineer(_BaseFeatureEngineer): def __and__(self, other: _BaseFeatureEngineer): return _ComposedFeatureEngineer((self, other)) - def fit(self, in_memory_static_graph_set, inplace: bool = True): + def fit(self, dataset) -> Dataset: for fe in self.fe_components: - fe.fit(in_memory_static_graph_set, inplace) + dataset = fe.fit(dataset) + return dataset - def transform( - self, in_memory_static_graph_set, - inplace: bool = True - ): + def transform(self, dataset: Dataset, inplace: bool = True) -> Dataset: for fe in self.fe_components: - in_memory_static_graph_set = fe.transform( - in_memory_static_graph_set, inplace - ) - return in_memory_static_graph_set + dataset = fe.transform(dataset, inplace) + return dataset -class BaseFeatureEngineer: +class BaseFeature(_BaseFeatureEngineer): def __init__(self, multi_graph: bool = True, subgraph=False): self._multi_graph: bool = multi_graph def __and__(self, other): return _ComposedFeatureEngineer((self, other)) - @classmethod - def __reset_graph_set( - cls, graphs: _typing.Sequence[GeneralStaticGraph], - in_memory_static_graph_set: InMemoryStaticGraphSet - ): - in_memory_static_graph_set.reset_dataset(graphs) - - def _preprocess(self, static_graph: GeneralStaticGraph) -> GeneralStaticGraph: - return static_graph + def _preprocess(self, data: _typing.Any) -> _typing.Any: + return data - def _fit(self, static_graph: GeneralStaticGraph) -> GeneralStaticGraph: - return static_graph + def _fit(self, data: _typing.Any) -> _typing.Any: + return data - def _transform(self, static_graph: GeneralStaticGraph) -> GeneralStaticGraph: - return static_graph + def _transform(self, data: _typing.Any) -> _typing.Any: + return data - def _postprocess(self, static_graph: GeneralStaticGraph) -> GeneralStaticGraph: - return static_graph + def _postprocess(self, data: _typing.Any) -> _typing.Any: + return data - def fit( - self, in_memory_static_graph_set: InMemoryStaticGraphSet, - inplace: bool = True - ): - if not inplace: - in_memory_static_graph_set = copy.deepcopy(in_memory_static_graph_set) + def fit(self, dataset: Dataset) -> Dataset: with torch.no_grad(): - __graphs: _typing.Sequence[GeneralStaticGraph] = [ - self._postprocess(self._transform(self._fit(self._preprocess(g)))) - for g in in_memory_static_graph_set - ] - self.__reset_graph_set(__graphs, in_memory_static_graph_set) - - def transform( - self, in_memory_static_graph_set: InMemoryStaticGraphSet, - inplace: bool = True - ) -> InMemoryStaticGraphSet: + for i, data in enumerate(dataset): + dataset[i] = self._postprocess(self._transform(self._fit(self._preprocess(data)))) + return dataset + + def transform(self, dataset: Dataset, inplace: bool = True) -> Dataset: if not inplace: - in_memory_static_graph_set = copy.deepcopy(in_memory_static_graph_set) + dataset = copy.deepcopy(dataset) with torch.no_grad(): - __graphs: _typing.Sequence[GeneralStaticGraph] = [ - self._postprocess(self._transform(self._preprocess(g))) - for g in in_memory_static_graph_set - ] - return in_memory_static_graph_set + for i, data in enumerate(dataset): + dataset[i] = self._postprocess(self._transform(self._preprocess(data))) + return dataset -class BaseFeature(BaseFeatureEngineer): +class BaseFeatureEngineer(BaseFeature): ... diff --git a/autogl/module/feature/_composed_feature_engineer.py b/autogl/module/feature/_composed_feature_engineer.py deleted file mode 100644 index 1d99198..0000000 --- a/autogl/module/feature/_composed_feature_engineer.py +++ /dev/null @@ -1,37 +0,0 @@ -import typing as _typing -from . import _base_feature_engineer - - -class _ComposedFeatureEngineer(_base_feature_engineer.BaseFeatureEngineer): - ... - - -class ComposedFeatureEngineer(_ComposedFeatureEngineer): - @property - def fe_components(self) -> _typing.Iterable[_base_feature_engineer.BaseFeatureEngineer]: - raise NotImplementedError # todo - - def __init__(self, feature_engineers: _typing.Iterable[_base_feature_engineer.BaseFeatureEngineer]): - super(ComposedFeatureEngineer, self).__init__() - self.__fe_components: _typing.List[_base_feature_engineer.BaseFeatureEngineer] = [] - for fe in feature_engineers: - if isinstance(fe, ComposedFeatureEngineer): - self.__fe_components.extend(fe.fe_components) - elif isinstance(fe, _base_feature_engineer.BaseFeatureEngineer): - self.__fe_components.append(fe) - else: - raise TypeError - - def fit(self, in_memory_static_graph_set, inplace: bool = True): - for fe in self.fe_components: - fe.fit(in_memory_static_graph_set, inplace) - - def transform( - self, in_memory_static_graph_set, - inplace: bool = True - ): - for fe in self.fe_components: - in_memory_static_graph_set = fe.transform( - in_memory_static_graph_set, inplace - ) - return in_memory_static_graph_set diff --git a/autogl/module/feature/_generators/_basic.py b/autogl/module/feature/_generators/_basic.py index 674a851..b92866b 100644 --- a/autogl/module/feature/_generators/_basic.py +++ b/autogl/module/feature/_generators/_basic.py @@ -7,29 +7,29 @@ from .._feature_engineer_registry import FeatureEngineerUniversalRegistry class BaseFeatureGenerator(BaseFeatureEngineer): - def _preprocess(self, static_graph: GeneralStaticGraph) -> GeneralStaticGraph: - if not ( - static_graph.nodes.is_homogeneous and - static_graph.edges.is_homogeneous - ): - raise ValueError("Provided static graph must be homogeneous") - else: - return static_graph - def _extract_nodes_feature(self, data: autogl.data.Data) -> torch.Tensor: raise NotImplementedError - @classmethod - def __to_data(cls, homogeneous_static_graph: GeneralStaticGraph) -> autogl.data.Data: + def __transform_homogeneous_static_graph( + self, homogeneous_static_graph: GeneralStaticGraph + ) -> GeneralStaticGraph: + if not ( + homogeneous_static_graph.nodes.is_homogeneous and + homogeneous_static_graph.edges.is_homogeneous + ): + raise ValueError("Provided static graph must be homogeneous") if 'x' in homogeneous_static_graph.nodes.data: + feature_key: _typing.Optional[str] = 'x' features: _typing.Optional[torch.Tensor] = ( homogeneous_static_graph.nodes.data['x'] ) elif 'feat' in homogeneous_static_graph.nodes.data: + feature_key: _typing.Optional[str] = 'feat' features: _typing.Optional[torch.Tensor] = ( homogeneous_static_graph.nodes.data['feat'] ) else: + feature_key: _typing.Optional[str] = None features: _typing.Optional[torch.Tensor] = None if 'y' in homogeneous_static_graph.nodes.data: label: _typing.Optional[torch.Tensor] = ( @@ -57,47 +57,37 @@ class BaseFeatureGenerator(BaseFeatureEngineer): x=features, y=label ) setattr(data, "edge_weight", edge_weight) - return data - - def _transform(self, homogeneous_static_graph: GeneralStaticGraph) -> GeneralStaticGraph: - nodes_features: torch.Tensor = self._extract_nodes_feature( - self.__to_data(homogeneous_static_graph) - ) - if not isinstance(nodes_features, torch.Tensor): - raise TypeError - elif nodes_features.dim() == 0: - raise ValueError - elif nodes_features.dim() == 1: - nodes_features = nodes_features.view(-1, 1) - if 'x' in homogeneous_static_graph.nodes.data: - x: torch.Tensor = ( - homogeneous_static_graph.nodes.data['x'].view(-1, 1) - if homogeneous_static_graph.nodes.data['x'].dim() == 1 - else homogeneous_static_graph.nodes.data['x'] - ) - assert nodes_features.size(0) == x.size(0) - assert nodes_features.dim() == x.dim() == 2 - homogeneous_static_graph.nodes.data['x'] = torch.cat( - [x, nodes_features.to(x.dtype)], dim=-1 - ) - elif 'feat' in homogeneous_static_graph.nodes.data: - x: torch.Tensor = ( - homogeneous_static_graph.nodes.data['feat'].view(-1, 1) - if homogeneous_static_graph.nodes.data['feat'].dim() == 1 - else homogeneous_static_graph.nodes.data['feat'] + extracted_features: torch.Tensor = self._extract_nodes_feature(data) + if isinstance(feature_key, str): + nodes_features: torch.Tensor = ( + homogeneous_static_graph.nodes.data[feature_key].view(-1, 1) + if homogeneous_static_graph.nodes.data[feature_key].dim() == 1 + else homogeneous_static_graph.nodes.data[feature_key] ) - assert nodes_features.size(0) == x.size(0) - assert nodes_features.dim() == x.dim() == 2 - homogeneous_static_graph.nodes.data['feat'] = torch.cat( - [x, nodes_features.to(x.dtype)], dim=-1 + assert extracted_features.size(0) == nodes_features.size(0) + assert extracted_features.dim() == nodes_features.dim() == 2 + homogeneous_static_graph.nodes.data[feature_key] = torch.cat( + [ + nodes_features, + extracted_features.to(nodes_features.device) + ], + dim=-1 ) else: if autogl.backend.DependentBackend.is_pyg(): - homogeneous_static_graph.nodes.data['x'] = nodes_features + homogeneous_static_graph.nodes.data['x'] = extracted_features elif autogl.backend.DependentBackend.is_dgl(): - homogeneous_static_graph.nodes.data['feat'] = nodes_features + homogeneous_static_graph.nodes.data['feat'] = extracted_features return homogeneous_static_graph + def _transform(self, data: _typing.Any) -> _typing.Any: + if isinstance(data, GeneralStaticGraph): + return self.__transform_homogeneous_static_graph(data) + else: + raise NotImplementedError( + f"Feature Generator only support instance of {GeneralStaticGraph} as provided data" + ) + @FeatureEngineerUniversalRegistry.register_feature_engineer("OneHot".lower()) class OneHotFeatureGenerator(BaseFeatureGenerator): diff --git a/autogl/module/feature/_graph/_netlsd.py b/autogl/module/feature/_graph/_netlsd.py index a5a6f80..0c94ea2 100644 --- a/autogl/module/feature/_graph/_netlsd.py +++ b/autogl/module/feature/_graph/_netlsd.py @@ -1,6 +1,6 @@ import netlsd +import networkx import torch - from autogl.data.graph import GeneralStaticGraph from autogl.data.graph.utils import conversion from .._base_feature_engineer import BaseFeatureEngineer @@ -25,17 +25,58 @@ class NetLSD(BaseFeatureEngineer): self.__kwargs = kwargs super(NetLSD, self).__init__() - def _transform(self, static_graph: GeneralStaticGraph) -> GeneralStaticGraph: - temp = netlsd.heat( + def __extract(self, nx_g: networkx.Graph) -> torch.Tensor: + return torch.tensor(netlsd.heat(nx_g, *self.__args, **self.__kwargs)).view(-1) + + def __transform_homogeneous_static_graph( + self, homogeneous_static_graph: GeneralStaticGraph + ) -> GeneralStaticGraph: + if not ( + homogeneous_static_graph.nodes.is_homogeneous and + homogeneous_static_graph.edges.is_homogeneous + ): + raise ValueError("Provided static graph must be homogeneous") + dsc: torch.Tensor = self.__extract( conversion.HomogeneousStaticGraphToNetworkX(to_undirected=True).__call__( - static_graph, to_undirected=True - ), - *self.__args, **self.__kwargs + homogeneous_static_graph, to_undirected=True + ) ) - dsc: torch.Tensor = torch.tensor([temp]).view(-1) - if 'gf' in static_graph.data: - gf = static_graph.data['gf'].view(-1) - static_graph.data['gf'] = torch.cat([gf, dsc]) + if 'gf' in homogeneous_static_graph.data: + gf = homogeneous_static_graph.data['gf'].view(-1) + homogeneous_static_graph.data['gf'] = torch.cat([gf, dsc]) + else: + homogeneous_static_graph.data['gf'] = dsc + return homogeneous_static_graph + + @classmethod + def __edge_index_to_nx_graph(cls, edge_index: torch.Tensor) -> networkx.Graph: + g: networkx.Graph = networkx.Graph() + for u, v in edge_index.t().tolist(): + if u == v: + continue + else: + g.add_edge(u, v) + return g + + def __transform_data(self, data): + if not ( + hasattr(data, "edge_index") and + torch.is_tensor(data.edge_index) and + isinstance(data.edge_index, torch.Tensor) and + data.edge_index.dim() == data.edge_index.size(0) == 2 and + data.edge_index.dtype == torch.long + ): + raise TypeError("Unsupported provided data") + dsc: torch.Tensor = self.__extract(self.__edge_index_to_nx_graph(data.edge_index)) + if hasattr(data, 'gf') and isinstance(data.gf, torch.Tensor): + gf = data.gf.view(-1) + data.gf = torch.cat([gf, dsc]) + else: + data.gf = dsc + return data + + def _transform(self, data): + if isinstance(data, GeneralStaticGraph): + return self.__transform_homogeneous_static_graph(data) else: - static_graph.data['gf'] = dsc - return static_graph + return self.__transform_data(data) diff --git a/autogl/module/feature/_graph/_networkx.py b/autogl/module/feature/_graph/_networkx.py index 88ea885..92cb42e 100644 --- a/autogl/module/feature/_graph/_networkx.py +++ b/autogl/module/feature/_graph/_networkx.py @@ -28,17 +28,62 @@ class _NetworkXGraphFeatureEngineer(BaseFeatureEngineer): self.__feature_extractor: _typing.Callable[[networkx.Graph], _typing.Any] = feature_extractor super(_NetworkXGraphFeatureEngineer, self).__init__() - def _transform(self, static_graph: GeneralStaticGraph) -> GeneralStaticGraph: - dsc = self.__feature_extractor( - conversion.HomogeneousStaticGraphToNetworkX(to_undirected=True)(static_graph) - ) - dsc: torch.Tensor = torch.tensor([dsc]).view(-1) - if 'gf' in static_graph.data: - gf = static_graph.data['gf'].view(-1) - static_graph.data['gf'] = torch.cat([gf, dsc]) + def __transform_homogeneous_static_graph( + self, homogeneous_static_graph: GeneralStaticGraph + ) -> GeneralStaticGraph: + if not ( + homogeneous_static_graph.nodes.is_homogeneous and + homogeneous_static_graph.edges.is_homogeneous + ): + raise ValueError("Provided static graph must be homogeneous") + dsc: torch.Tensor = torch.tensor( + [ + self.__feature_extractor( + conversion.HomogeneousStaticGraphToNetworkX(to_undirected=True)(homogeneous_static_graph) + ) + ] + ).view(-1) + if 'gf' in homogeneous_static_graph.data: + gf = homogeneous_static_graph.data['gf'].view(-1) + homogeneous_static_graph.data['gf'] = torch.cat([gf, dsc]) else: - static_graph.data['gf'] = dsc - return static_graph + homogeneous_static_graph.data['gf'] = dsc + return homogeneous_static_graph + + @classmethod + def __edge_index_to_nx_graph(cls, edge_index: torch.Tensor) -> networkx.Graph: + g: networkx.Graph = networkx.Graph() + for u, v in edge_index.t().tolist(): + if u == v: + continue + else: + g.add_edge(u, v) + return g + + def __transform_data(self, data): + if not ( + hasattr(data, "edge_index") and + torch.is_tensor(data.edge_index) and + isinstance(data.edge_index, torch.Tensor) and + data.edge_index.dim() == data.edge_index.size(0) == 2 and + data.edge_index.dtype == torch.long + ): + raise TypeError("Unsupported provided data") + dsc: torch.Tensor = torch.tensor( + [self.__feature_extractor(self.__edge_index_to_nx_graph(data.edge_index))] + ).view(-1) + if hasattr(data, 'gf') and isinstance(data.gf, torch.Tensor): + gf = data.gf.view(-1) + data.gf = torch.cat([gf, dsc]) + else: + data.gf = dsc + return data + + def _transform(self, data): + if isinstance(data, GeneralStaticGraph): + return self.__transform_homogeneous_static_graph(data) + else: + return self.__transform_data(data) @FeatureEngineerUniversalRegistry.register_feature_engineer("NXLargeCliqueSize") diff --git a/autogl/module/hpo/autone.py b/autogl/module/hpo/autone.py index bc6d6d8..3f9625c 100644 --- a/autogl/module/hpo/autone.py +++ b/autogl/module/hpo/autone.py @@ -12,7 +12,7 @@ from .autone_file import utils from torch_geometric.data import GraphSAINTRandomWalkSampler -from ..feature.graph import SgNetLSD +from ..feature import NetLSD as SgNetLSD from torch_geometric.data import InMemoryDataset