From b6965053af783fa85c8ee5b63ea67cbd9f381482 Mon Sep 17 00:00:00 2001 From: BeiniXie Date: Tue, 12 Apr 2022 15:08:33 +0800 Subject: [PATCH 01/16] model tutorial --- docs/docfile/tutorial/t_model_chinese.rst | 215 ++++++++++++++++++++++ 1 file changed, 215 insertions(+) create mode 100644 docs/docfile/tutorial/t_model_chinese.rst diff --git a/docs/docfile/tutorial/t_model_chinese.rst b/docs/docfile/tutorial/t_model_chinese.rst new file mode 100644 index 0000000..4e438ce --- /dev/null +++ b/docs/docfile/tutorial/t_model_chinese.rst @@ -0,0 +1,215 @@ +.. _model: + +AutoGL 模型 +============ +在AutoGL中,我们使用``model``和``automodel``类定义图神经网络模型,并让它们和超参数优化(hyper parameter optimization, HPO)模块兼容。 + +当前版本下,我们支持节点分类、图分类和链接预测三种任务任务,支持的具体模型如下: + ++----------------------+----------------------------+ +| 任务 | 模型 | ++======================+============================+ +| 节点分类 | ``gcn``, ``gat``, ``sage`` | ++----------------------+----------------------------+ +| 图分类 | ``gin``, ``topk`` | ++----------------------+----------------------------+ +| 链接预测 | ``gcn``, ``gat``, ``sage`` | ++----------------------+----------------------------+ + +自定义模型和自动模型 +------------------ +我们强烈建议您同时定义``model``类和``automodel``类。 +其中,``model``类来管理参数的初始化与模型前向传播逻辑,``automodel``类组织超参数相关的搜索。 +``automodel``在``solver``和``trainer``模块会被调用。 + +示例 +^^^^ +以一个用于节点分类任务的多层感知机(MLP)为例。您可以使用AutoGL来帮您找到最合适的超参数。 + +首先,您可以定义一个MLP模型,并假设所有超参数已经给定。 + +.. 
code-block:: python + + import torch + + class MyMLP(torch.nn.Module): + # 假定所有超参数可获得 + def __init__(self, args): + super().__init__() + in_channels, num_classes = args['in_channels'], args['num_classes'] + layer_num, dim = args['layer_num'], int(args['dim']) + + if layer_num == 1: + ops = [torch.nn.Linear(in_channels, num_classes)] + else: + ops = [torch.nn.Linear(in_channels, dim)] + for i in range(layer_num - 2): + ops.append(torch.nn.Linear(dim, dim)) + ops.append(torch.nn.Linear(dim, num_classes)) + + self.core = torch.nn.Sequential(*ops) + + # 必须利用forward函数定义模型的前向传播逻辑 + def forward(self, data): + assert hasattr(data, 'x'), 'MLP only support graph data with features' + x = data.x + return torch.nn.functional.log_softmax(self.core(x)) + + +接下来,您可以定义自动模型``automodel``类以更好管理您的超参数。 +对于来自于数据集的参数如输入维度与输出维度,可以直接传入``automodel``类中的初始化函数中``__init__()``。 +而对于需要搜索的其他超参数,需要自定义搜索空间。 + +.. code-block:: python + + from autogl.module.model import BaseAutoModel + + # 定义自动模型类,需要从BaseAutoModel类继承 + class MyAutoMLP(BaseAutoModel): + def __init__(self, num_features=None, num_classes=None, device=None, **args + ): + super().__init__(num_features, num_classes, device, **args) + + # (required) 需要定义搜索空间(包含超参数、超参数的类型以及搜索范围) + self.space = [ + {'parameterName': 'layer_num', 'type': 'INTEGER', 'minValue': 1, 'maxValue': 5, 'scalingType': 'LINEAR'}, + {'parameterName': 'dim', 'type': 'INTEGER', 'minValue': 64, 'maxValue': 128, 'scalingType': 'LINEAR'} + ] + + # 设置默认超参数 + self.hyper_parameters = { + "layer_num": 2, + "dim": 72, + } + + + # # (required) since we don't know the num_classes and num_features until we see the dataset, + # # we cannot initialize the models when instantiated. the initialized will be set to False. 
+ # self.initialized = False + + + # (required) instantiate the core MLP model using corresponding hyper-parameters + def _initialize(self): + # (required) you need to make sure the core model is named as `self.model` + self.model = MyMLP({ + "in_channels": self.input_dimension, + "num_classes": self.output_dimension, + **self.hyper_parameters + } + ).to(self.device) + + + +接着,只需要将定义好的自动图模型输入自动图分类任务的``solver``中,就可以利用它完成节点分类任务。 +具体代码示例如下: +.. code-block :: python + + from autogl.solver import AutoNodeClassifier + + solver = AutoNodeClassifier(graph_models=(MyAutoMLP(num_features, num_classes,device=torch.device('cuda')),)) + + + +图分类任务的模型定义和整个流程和节点分类任务相似。详情参考图分类模型的tutorial。 + + +用于链接预测任务的模型 +^^^^^^^^^^^^^^^^^^^^ + +对于链接预测任务,模型的定义在``forward()``函数中略有不同。 +为了更好地和链接预测训练器``LinkPredictionTrainer``与自动链接预测器``AutoLinkPredictor``交互,您需要定义编码函数``lp_encode(self, data)``与解码函数``lp_decode(self, x, pos_edge_index, neg_edge_index)`` + +用同样的多层感知机作为示例,如果您想要将其用于链接预测任务,那么您不必再定义``forward()``函数,而是定义``lp_encode(self, data)``与``lp_decode(self, x, pos_edge_index, neg_edge_index)``两个函数。具体代码示例如下: + +.. 
code-block:: python + + class MyMLPForLP(torch.nn.Module): + def __init__(self, in_channels, layer_num, dim): + super().__init__() + ops = [torch.nn.Linear(in_channels, dim)] + for i in range(layer_num - 1): + ops.append(torch.nn.Linear(dim, dim)) + + self.core = torch.nn.Sequential(*ops) + + # (required) 和trainer与solver模块交互 + def lp_encode(self, data): + return self.core(data.x) + + # (required) 和trainer与solver模块交互 + def lp_decode(self, x, pos_edge_index, neg_edge_index): + # 首先得到所有需要的正样本边与负样本边集合 + edge_index = torch.cat([pos_edge_index, neg_edge_index], dim=-1) + # 利用点积计算logits,或者使用其他decode方法 + logits = (x[edge_index[0]] * x[edge_index[1]]).sum(dim=-1) + return logits + + class MyAutoMLPForLP(MyAutoMLP): + def initialize(self): + self.model = MyMLPForLP( + in_channels = self.num_features, + layer_num = self.layer_num, + dim = self.dim + ).to(self.device) + + + +支持采样的模型 +^^^^^^^^^^^^^ +为了高效地实现大规模图上表示学习,AutoGL目前支持使用节点级别(node-wise)的采样、层级别(layer-wise)的采样和子图级别(subgraph-wise)的采样等采样技术进行节点分类。 +有关采样的更多信息,请参阅::ref:`trainer`。 + +根据图神经网络中的消息传递机制,一个节点的表达由它多跳邻居构成的子图决定。 +但是,节点的邻居数量随着神经网络层数的增加呈现指数级增长,计算并储存所有节点的表达会占用许多的计算资源。 +因此,在得到节点表达时,我们可以在每层神经网络输入不同的采样后的子图以达到高效计算的目的。 +以torch_geometric的data为例,一个图包含节点特征x和边集合edge_index,在AutoGL的采样技巧中,我们会为data提供edge_indexes属性以表示不同的图卷积层采样出来的不同子图。 + +.. 
code-block:: python + + import autogl + from autogl.module.model import ClassificationSupportedSequentialModel + + # 重新定义接收图作为输入的Linear类 + class Linear(torch.nn.Linear): + def forward(self, data): + return super().forward(data.x) + + class MyMLPSampling(ClassificationSupportedSequentialModel): + def __init__(self, in_channels, num_classes, layer_num, dim): + super().__init__() + if layer_num == 1: + ops = [Linear(in_channels, num_classes)] + else: + ops = [Linear(in_channels, dim)] + for i in range(layer_num - 2): + ops.append(Linear(dim, dim)) + ops.append(Linear(dim, num_classes)) + + self.core = torch.nn.ModuleList(ops) + + # (required) 覆盖序列编码层sequential_encoding_layers(),和sampling交互 + @property + def sequential_encoding_layers(self) -> torch.nn.ModuleList: + return self.core + + # (required) define the encode logic of classification for sampling + def cls_encode(self, data): + if hasattr(data, 'edge_indexes'): + # edge_indexes是由edge_index组成的列表,每个edge_index代表每层图卷积所使用的边 + edge_indexes = data.edge_indexes + edge_weights = [None] * len(self.core) if getattr(data, 'edge_weights', None) is None else data.edge_weights + else: + # 默认edge_index和edge_weight是相同的 + edge_indexes = [data.edge_index] * len(self.core) + edge_weights = [getattr(data, 'edge_weight', None)] * len(self.core) + + x = data.x + for i in range(len(self.core)): + data = autogl.data.Data(x=x, edge_index=edge_indexes[i]) + data.edge_weight = edge_weights[i] + x = self.sequential_encoding_layers[i](data) + return x + + def cls_decode(self, x): + return torch.nn.functional.log_softmax(x) + From dca18c2ca2eb17fd1fb068c4a1dc9a64642df183 Mon Sep 17 00:00:00 2001 From: BeiniXie Date: Tue, 12 Apr 2022 20:44:02 +0800 Subject: [PATCH 02/16] model_cn --- docs/docfile/tutorial/t_model_chinese.rst | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/docs/docfile/tutorial/t_model_chinese.rst b/docs/docfile/tutorial/t_model_chinese.rst index 4e438ce..5a76e8f 100644 --- 
a/docs/docfile/tutorial/t_model_chinese.rst +++ b/docs/docfile/tutorial/t_model_chinese.rst @@ -2,7 +2,7 @@ AutoGL 模型 ============ -在AutoGL中,我们使用``model``和``automodel``类定义图神经网络模型,并让它们和超参数优化(hyper parameter optimization, HPO)模块兼容。 +在AutoGL中,我们使用 ``model``和 ``automodel``类定义图神经网络模型,并让它们和超参数优化(hyper parameter optimization, HPO)模块兼容。 当前版本下,我们支持节点分类、图分类和链接预测三种任务任务,支持的具体模型如下: @@ -18,8 +18,8 @@ AutoGL 模型 自定义模型和自动模型 ------------------ -我们强烈建议您同时定义``model``类和``automodel``类。 -其中,``model``类来管理参数的初始化与模型前向传播逻辑,``automodel``类组织超参数相关的搜索。 +我们强烈建议您同时定义 ``model``类和 ``automodel``类。 +其中, ``model`` 类来管理参数的初始化与模型前向传播逻辑, ``automodel``类组织超参数相关的搜索。 ``automodel``在``solver``和``trainer``模块会被调用。 示例 @@ -56,8 +56,8 @@ AutoGL 模型 return torch.nn.functional.log_softmax(self.core(x)) -接下来,您可以定义自动模型``automodel``类以更好管理您的超参数。 -对于来自于数据集的参数如输入维度与输出维度,可以直接传入``automodel``类中的初始化函数中``__init__()``。 +接下来,您可以定义自动模型 ``automodel`` 类以更好管理您的超参数。 +对于来自于数据集的参数如输入维度与输出维度,可以直接传入 ``automodel`` 类中的初始化函数中 ``__init__()`` 。 而对于需要搜索的其他超参数,需要自定义搜索空间。 .. code-block:: python @@ -100,7 +100,7 @@ AutoGL 模型 -接着,只需要将定义好的自动图模型输入自动图分类任务的``solver``中,就可以利用它完成节点分类任务。 +接着,只需要将定义好的自动图模型输入自动图分类任务的 ``solver`` 中,就可以利用它完成节点分类任务。 具体代码示例如下: .. 
code-block :: python @@ -116,10 +116,10 @@ AutoGL 模型 用于链接预测任务的模型 ^^^^^^^^^^^^^^^^^^^^ -对于链接预测任务,模型的定义在``forward()``函数中略有不同。 -为了更好地和链接预测训练器``LinkPredictionTrainer``与自动链接预测器``AutoLinkPredictor``交互,您需要定义编码函数``lp_encode(self, data)``与解码函数``lp_decode(self, x, pos_edge_index, neg_edge_index)`` +对于链接预测任务,模型的定义在 ``forward()`` 函数中略有不同。 +为了更好地和链接预测训练器 ``LinkPredictionTrainer`` 与自动链接预测器 ``AutoLinkPredictor`` 交互,您需要定义编码函数 ``lp_encode(self, data)`` 与解码函数 ``lp_decode(self, x, pos_edge_index, neg_edge_index)`` 。 -用同样的多层感知机作为示例,如果您想要将其用于链接预测任务,那么您不必再定义``forward()``函数,而是定义``lp_encode(self, data)``与``lp_decode(self, x, pos_edge_index, neg_edge_index)``两个函数。具体代码示例如下: +用同样的多层感知机作为示例,如果您想要将其用于链接预测任务,那么您不必再定义 ``forward()`` 函数,而是定义 ``lp_encode(self, data)`` 与 ``lp_decode(self, x, pos_edge_index, neg_edge_index)`` 两个函数。具体代码示例如下: .. code-block:: python From 67b0bb05f813386dff3b589cde9bf923b828576c Mon Sep 17 00:00:00 2001 From: BeiniXie Date: Tue, 12 Apr 2022 20:49:21 +0800 Subject: [PATCH 03/16] model_cn --- docs/docfile/tutorial/t_model_chinese.rst | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/docs/docfile/tutorial/t_model_chinese.rst b/docs/docfile/tutorial/t_model_chinese.rst index 5a76e8f..7d58fe8 100644 --- a/docs/docfile/tutorial/t_model_chinese.rst +++ b/docs/docfile/tutorial/t_model_chinese.rst @@ -2,10 +2,11 @@ AutoGL 模型 ============ -在AutoGL中,我们使用 ``model``和 ``automodel``类定义图神经网络模型,并让它们和超参数优化(hyper parameter optimization, HPO)模块兼容。 +在AutoGL中,我们使用 ``model`` 和 ``automodel`` 类定义图神经网络模型,并让它们和超参数优化(hyper parameter optimization, HPO)模块兼容。 当前版本下,我们支持节点分类、图分类和链接预测三种任务任务,支持的具体模型如下: + +----------------------+----------------------------+ | 任务 | 模型 | +======================+============================+ @@ -16,11 +17,12 @@ AutoGL 模型 | 链接预测 | ``gcn``, ``gat``, ``sage`` | +----------------------+----------------------------+ + 自定义模型和自动模型 ------------------ -我们强烈建议您同时定义 ``model``类和 ``automodel``类。 -其中, ``model`` 类来管理参数的初始化与模型前向传播逻辑, ``automodel``类组织超参数相关的搜索。 
-``automodel``在``solver``和``trainer``模块会被调用。 +我们强烈建议您同时定义 ``model`` 类和 ``automodel`` 类。 +其中, ``model`` 类来管理参数的初始化与模型前向传播逻辑, ``automodel`` 类组织超参数相关的搜索。 +``automodel`` 在 ``solver`` 和 ``trainer`` 模块会被调用。 示例 ^^^^ From 73e6b87ee985caf0871de0dfc92098c6a3436105 Mon Sep 17 00:00:00 2001 From: BeiniXie Date: Tue, 12 Apr 2022 20:52:02 +0800 Subject: [PATCH 04/16] model_cn --- docs/docfile/tutorial/t_model_chinese.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/docfile/tutorial/t_model_chinese.rst b/docs/docfile/tutorial/t_model_chinese.rst index 7d58fe8..801347a 100644 --- a/docs/docfile/tutorial/t_model_chinese.rst +++ b/docs/docfile/tutorial/t_model_chinese.rst @@ -8,13 +8,13 @@ AutoGL 模型 +----------------------+----------------------------+ -| 任务 | 模型 | +|任务 | 模型 | +======================+============================+ -| 节点分类 | ``gcn``, ``gat``, ``sage`` | +|节点分类 | ``gcn``, ``gat``, ``sage`` | +----------------------+----------------------------+ -| 图分类 | ``gin``, ``topk`` | +|图分类 | ``gin``, ``topk`` | +----------------------+----------------------------+ -| 链接预测 | ``gcn``, ``gat``, ``sage`` | +|链接预测 | ``gcn``, ``gat``, ``sage`` | +----------------------+----------------------------+ From 9184df249386efc36f19f1a082c5e84921be8d6b Mon Sep 17 00:00:00 2001 From: ZW-ZHANG Date: Wed, 13 Apr 2022 11:19:04 +0800 Subject: [PATCH 05/16] add README_cn.md --- README.md | 2 + README_cn.md | 170 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 172 insertions(+) create mode 100644 README_cn.md diff --git a/README.md b/README.md index aee4d74..a9f57df 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,7 @@ # Auto Graph Learning +[Chinese Introduction](https://github.com/THUMNLab/AutoGL/blob/main/README_cn.md) + An autoML framework & toolkit for machine learning on graphs. 
*Actively under development by @THUMNLab* diff --git a/README_cn.md b/README_cn.md new file mode 100644 index 0000000..db08340 --- /dev/null +++ b/README_cn.md @@ -0,0 +1,170 @@ +# 智图 (AutoGL) +[English Introduction](https://github.com/THUMNLab/AutoGL) + +用于图数据的自动机器学习框架和工具包。 + +*由清华大学媒体与网络实验室进行开发与维护* + +若有任何意见或建议,欢迎通过issues 或邮件autogl@tsinghua.edu.cn与我们联系。 + + + +## 最新消息 + +- 2021.12.31 v0.3.0-pre版本更新! + - 智图目前支持[__Deep Graph Library (DGL)__](https://www.dgl.ai/)作为后端,以方便DGL的用户使用。目前在DGL后端已经支持同构图的节点分类、链接预测以及图分类等任务。智图现在也可兼容PyG 2.0版本。 + - 智图可以支持**异构图**节点分类任务!详情请参考[异构图教程](http://mn.cs.tsinghua.edu.cn/autogl/documentation/docfile/tutorial/t_hetero_node_clf.html)。 + - 为了使智图算法更灵活,`model`模块目前支持**解耦**为两个子模块,即编码器`encoder`和解码器`decoder`。在**解耦**设计中,一个`encoder`可以被用来处理不同任务,以减少重复开发的负担。 + - 我们扩展了支持的[神经架构搜索算法](http://mn.cs.tsinghua.edu.cn/autogl/documentation/docfile/tutorial/t_nas.html),例如[AutoAttend](https://proceedings.mlr.press/v139/guan21a.html),[GASSO](https://proceedings.neurips.cc/paper/2021/hash/8c9f32e03aeb2e3000825c8c875c4edd-Abstract.html), [硬件感知算法](http://mn.cs.tsinghua.edu.cn/autogl/documentation/docfile/documentation/nas.html#autogl.module.nas.estimator.OneShotEstimator_HardwareAware)等。 +- 2021.07.11 智图更新v0.2.0-pre版本! 在新版本中,智图支持[神经架构搜索(NAS)](http://mn.cs.tsinghua.edu.cn/autogl/documentation/docfile/tutorial/t_nas.html),可以对给定的数据集和架构定制化神经网络架构。智图也支持了[采样](http://mn.cs.tsinghua.edu.cn/autogl/documentation/docfile/tutorial/t_trainer.html#node-classification-with-sampling)功能以处理大规模图数据集,包括节点采样、层采样和子图采样。链接预测任务也已经支持。详情请参考我们的[教程](http://mn.cs.tsinghua.edu.cn/autogl/documentation/index.html)。 +- 2021.04.16 我们关于图自动机器学习的综述文章已经被IJCAI 2021接受! 详情见[这里](http://arxiv.org/abs/2103.00742)。 +- 2021.04.10 我们的论文[__AutoGL: A Library for Automated Graph Learning__](https://arxiv.org/abs/2104.04987)已经被 _ICLR 2021 Workshop on Geometrical and Topological Representation Learning_ 接受! 
+ +## 介绍 + +智图的设计目标是可以简单、快速地对图数据集和任务进行自动机器学习,可供研究者和开发者使用。更多详细信息,可以参阅我们的文档。 + +下图是智图的整体框架。 + + + +智图通过 `datasets` 类以支持图数据集,其基于 PyTorch Geometric 和 Deep Graph Library 的数据集,并添加了一些函数以支持自动机器学习框架。 + +智图通过 `AutoGL solvers` 以处理不同的图机器学习任务,利用五个主要模块自动解决给定的任务,即自动特征工程 `auto feature engineer`,神经架构搜索 `neural architecture search`,自动模型 `auto model`,超参数优化 `hyperparameter optimization`,和自动模型集成 `auto ensemble`。 + +目前,智图支持以下算法: + + + + + + + + + + + + + + + + + + +
特征工程图模型神经架构搜索超参数优化模型集成
生成器
Graphlets
EigenGNN
更多 ...

选择器
SeFilterConstant
gbdt

全图特征
Netlsd
NxAverageClustering
更多 ...
同构图编码器
GCNEncoder
GATEncoder
SAGEEncoder
GINEncoder

解码器
LogSoftmaxDecoder
DotProductDecoder
SumPoolMLPDecoder
JKSumPoolDecoder
+ 搜索算法
+ Random
+ RL
+ Evolution
+ GASSO
+ 更多 ...

+ 搜索空间
+ SinglePath
+ GraphNas
+ AutoAttend
+ 更多 ...

+ 模型评估
+ Oneshot
+ Scratch
+
Grid
Random
Anneal
Bayes
CMAES
MOCMAES
Quasi random
TPE
AutoNE
Voting
Stacking
+ +此工具包还可作为一个框架供用户实现和测试自己的自动机器学习或图机器学习模型。 + +## 安装 + +### 依赖 + +在安装智图之前,请首先安装以下依赖项。 + +1. Python >= 3.6.0 + +2. PyTorch (>=1.6.0) + + 详细信息请参考。 + +3. 图机器学习工具包 + + 智图需要 PyTorch Geometric(PyG)或 Deep Graph Library(DGL)作为后端。若两者均安装,可在运行时选择任一后端,参考[这里](http://mn.cs.tsinghua.edu.cn/autogl/documentation/docfile/tutorial/t_backend.html)。 + + 3.1 PyTorch Geometric (>=1.7.0) + + 详细信息请参考。 + + 3.2 Deep Graph Library (>=0.7.0) + + 详细信息请参考。 + + +### 安装 + +#### 通过pip进行安装 + +运行以下命令以通过`pip`安装智图。 + +``` +pip install autogl +``` + +#### 从源代码安装 + +运行以下命令以从源安装智图。 + +``` +git clone https://github.com/THUMNLab/AutoGL.git +cd AutoGL +python setup.py install +``` + +#### 开发者安装 + +如果您想以开发者方式安装智图,请运行以下命令以创建软链接,然后即可修改本地程序后而无需重复安装。 +``` +pip install -e . +``` + +## 文档 + +您可参考文档页面 以参阅我们的详细文档。 + +文档也可以进行本地编译。首先,请安装 sphinx 和 sphinx-rtd-theme: +``` +pip install -U Sphinx +pip install sphinx-rtd-theme +``` + +然后,通过以下方式创建 html 文档: +``` +cd docs +make clean && make html +``` +文档将在如下路径自动生成:`docs/_build/html` + +## 引用 + +如果您使用了智图代码,请按如下方式引用我们的[论文](https://openreview.net/forum?id=0yHwpLeInDn): +``` +@inproceedings{guan2021autogl, + title={Auto{GL}: A Library for Automated Graph Learning}, + author={Chaoyu Guan and Ziwei Zhang and Haoyang Li and Heng Chang and Zeyang Zhang and Yijian Qin and Jiyan Jiang and Xin Wang and Wenwu Zhu}, + booktitle={ICLR 2021 Workshop on Geometrical and Topological Representation Learning}, + year={2021}, + url={https://openreview.net/forum?id=0yHwpLeInDn} +} +``` + +或许您也会发现我们的[综述](http://arxiv.org/abs/2103.00742)有帮助: +``` +@article{zhang2021automated, + title={Automated Machine Learning on Graphs: A Survey}, + author={Zhang, Ziwei and Wang, Xin and Zhu, Wenwu}, + booktitle = {Proceedings of the Thirtieth International Joint Conference on Artificial Intelligence, {IJCAI-21}}, + year={2021}, + note={Survey track} +} +``` + +## 版权相关 +从v0.2版本开始,智图的所有代码采用[Apache license](LICENSE)。 + From 566a34af30bbe37dc862be47d4f3e52abe328ec5 Mon Sep 17 00:00:00 2001 From: ZW-ZHANG Date: Wed, 13 
Apr 2022 11:27:10 +0800 Subject: [PATCH 06/16] update readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index a9f57df..81d5d8f 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # Auto Graph Learning -[Chinese Introduction](https://github.com/THUMNLab/AutoGL/blob/main/README_cn.md) +[Chinese Introduction](README_cn.md) An autoML framework & toolkit for machine learning on graphs. From bc6bb59cf5d6f87de61d6668f5f8bbccd5aefcf6 Mon Sep 17 00:00:00 2001 From: Generall <465383637@qq.com> Date: Wed, 13 Apr 2022 11:31:53 +0800 Subject: [PATCH 07/16] Update README_cn.md --- README_cn.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README_cn.md b/README_cn.md index db08340..f87fc49 100644 --- a/README_cn.md +++ b/README_cn.md @@ -1,5 +1,5 @@ # 智图 (AutoGL) -[English Introduction](https://github.com/THUMNLab/AutoGL) +[English Introduction]() 用于图数据的自动机器学习框架和工具包。 From 2f576e6fc3e59e89e305a8d85331747e410f747c Mon Sep 17 00:00:00 2001 From: Generall <465383637@qq.com> Date: Wed, 13 Apr 2022 11:33:15 +0800 Subject: [PATCH 08/16] Update README_cn.md --- README_cn.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README_cn.md b/README_cn.md index f87fc49..6f9c9d5 100644 --- a/README_cn.md +++ b/README_cn.md @@ -1,5 +1,5 @@ # 智图 (AutoGL) -[English Introduction]() +[English Introduction](.) 用于图数据的自动机器学习框架和工具包。 From 19bb6f58647a4d3009c064a60bf2b83faf4a9dfd Mon Sep 17 00:00:00 2001 From: Generall <465383637@qq.com> Date: Wed, 13 Apr 2022 11:33:35 +0800 Subject: [PATCH 09/16] Update README_cn.md --- README_cn.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README_cn.md b/README_cn.md index 6f9c9d5..8549812 100644 --- a/README_cn.md +++ b/README_cn.md @@ -1,5 +1,5 @@ # 智图 (AutoGL) -[English Introduction](.) +[English Introduction](..) 
用于图数据的自动机器学习框架和工具包。 From b2a27888a96142aac9bf127ac10d1be8b770a0a7 Mon Sep 17 00:00:00 2001 From: wondergo2017 Date: Wed, 13 Apr 2022 14:28:31 +0800 Subject: [PATCH 10/16] ensemble cn --- docs/docfile/tutorial/t_ensemble_cn.rst | 54 +++++++++++++++++++++++++ 1 file changed, 54 insertions(+) create mode 100644 docs/docfile/tutorial/t_ensemble_cn.rst diff --git a/docs/docfile/tutorial/t_ensemble_cn.rst b/docs/docfile/tutorial/t_ensemble_cn.rst new file mode 100644 index 0000000..3b8ec8f --- /dev/null +++ b/docs/docfile/tutorial/t_ensemble_cn.rst @@ -0,0 +1,54 @@ +.. _ensemble: + +Ensemble +======== + +我们现在支持 voting 和 stacking 方法。 + +Voting +------ + +Voter本质上构建了base learner预测的加权和。给定一个评估指标,Voter以某种方式确定base learner的权重,使得验证集指标分数最大化。 + +我们采用Rich Caruana的权重确定方法。该方法首先通过贪婪搜索找到权重相等的(可能是冗余的)base learner集合,然后通过集合中出现的次数指定Voter中的权重。 + +您可以通过重写 ``_specify_weights`` 方法来定制自己的权重确定方法。 + +.. code-block :: python + + # 例子: 对所有base learner 使用同样的权重 + class EqualWeightVoting(Voting): + def _specify_weights(self, predictions, label, feval): + return np.ones(self.n_models)/self.n_models + # 对所有base learner 赋予相同的权重 + +Stacking +-------- + +Stacker将Base Learner的预测作为输入来训练元模型,以找到这些base learner的最佳组合。 + +目前我们支持广义线性模型(GLM)和梯度提升模型(GBM)作为元模型。 + +创建一个新的ensemble +---------------------- + +您可以通过继承base ensembler,重载 ``fit`` 和 ``ensemble`` 方法来创建自己的ensembler。 + +.. 
code-block :: python + + # 例子 : 使用当前可用的最佳模型 + from autogl.module.ensemble.base import BaseEnsembler + import numpy as np + class BestModel(BaseEnsembler): + def fit(self, predictions, label, identifiers, feval): + if not isinstance(feval, list): + feval = [feval] + scores = np.array([feval[0].evaluate(pred, label) for pred in predictions]) * (1 if feval[0].is_higher_better else -1) + self.scores = dict(zip(identifiers, scores)) # record validation score of base learners + ensemble_pred = predictions[np.argmax(scores)] + return [fx.evaluate(ensemble_pred, label) for fx in feval] + + def ensemble(self, predictions, identifiers): + best_idx = np.argmax([self.scores[model_name] for model_name in identifiers]) # choose the currently best model in the identifiers + return predictions[best_idx] + From e87e7b39aa56f41b3be61f88081c61fe22bc0f46 Mon Sep 17 00:00:00 2001 From: wondergo2017 Date: Wed, 13 Apr 2022 15:19:35 +0800 Subject: [PATCH 11/16] add fe_tutorial unit test --- test/fe/fe_tutorial.py | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 test/fe/fe_tutorial.py diff --git a/test/fe/fe_tutorial.py b/test/fe/fe_tutorial.py new file mode 100644 index 0000000..eac51f3 --- /dev/null +++ b/test/fe/fe_tutorial.py @@ -0,0 +1,35 @@ +# 1. Choose a dataset. +from autogl.datasets import build_dataset_from_name +data = build_dataset_from_name('cora') + +# 2. Compose a feature engineering pipeline +from autogl.module.feature._base_feature_engineer._base_feature_engineer import _ComposedFeatureEngineer +from autogl.module.feature import EigenFeatureGenerator +from autogl.module.feature import NetLSD + +# you may compose feature engineering bases through autogl.module.feature._base_feature_engineer +fe = _ComposedFeatureEngineer([ + EigenFeatureGenerator(size=32), + NetLSD() +]) + +# 3. 
Fit and transform the data +fe.fit(data) +data1=fe.transform(data,inplace=False) + +import autogl +import torch +from autogl.module.feature._generators._basic import BaseFeatureGenerator + +class OneHotFeatureGenerator(BaseFeatureGenerator): + # if overrider_features==False , concat the features with original features; otherwise override. + def __init__(self, override_features: bool = False): + super(BaseFeatureGenerator, self).__init__(override_features) + + def _extract_nodes_feature(self, data: autogl.data.Data) -> torch.Tensor: + num_nodes: int = ( + data.x.size(0) + if data.x is not None and isinstance(data.x, torch.Tensor) + else (data.edge_index.max().item() + 1) + ) + return torch.eye(num_nodes) \ No newline at end of file From c7a06ceac8b5087d452adef7c5f16532e6eee53c Mon Sep 17 00:00:00 2001 From: wondergo2017 Date: Wed, 13 Apr 2022 15:20:27 +0800 Subject: [PATCH 12/16] update fe tutorial doc --- docs/docfile/tutorial/t_fe.rst | 76 ++++++++++++++++------------------ 1 file changed, 36 insertions(+), 40 deletions(-) diff --git a/docs/docfile/tutorial/t_fe.rst b/docs/docfile/tutorial/t_fe.rst index df92809..bf76e79 100644 --- a/docs/docfile/tutorial/t_fe.rst +++ b/docs/docfile/tutorial/t_fe.rst @@ -4,8 +4,7 @@ AutoGL Feature Engineering ========================== We provide a series of node and graph feature engineers for -you to compose within a feature engineering pipeline. An automatic -feature engineering algorithm is also provided. +you to compose within a feature engineering pipeline. Quick Start ----------- @@ -16,18 +15,15 @@ Quick Start data = build_dataset_from_name('cora') # 2. 
Compose a feature engineering pipeline - from autogl.module.feature import BaseFeature,AutoFeatureEngineer - from autogl.module.feature.generators import GeEigen - from autogl.module.feature.selectors import SeGBDT - from autogl.module.feature.graph import SgNetLSD - # you may compose feature engineering bases through BaseFeature.compose - fe = BaseFeature.compose([ - GeEigen(size=32) , - SeGBDT(fixlen=100), - SgNetLSD() + from autogl.module.feature._base_feature_engineer._base_feature_engineer import _ComposedFeatureEngineer + from autogl.module.feature import EigenFeatureGenerator + from autogl.module.feature import NetLSD + + # you may compose feature engineering bases through autogl.module.feature._base_feature_engineer + fe = _ComposedFeatureEngineer([ + EigenFeatureGenerator(size=32), + NetLSD() ]) - # or just through '&' operator - fe = fe & AutoFeatureEngineer(fixlen=200,max_epoch=3) # 3. Fit and transform the data fe.fit(data) @@ -44,19 +40,19 @@ in configurations or as arguments of the autogl solver. +---------------------------+-------------------------------------------------+ | Base | Description | +===========================+=================================================+ -| ``graphlet`` | concatenate local graphlet numbers as features. | +| ``GraphletGenerator`` | concatenate local graphlet numbers as features. | +---------------------------+-------------------------------------------------+ -| ``eigen`` | concatenate Eigen features. | +| ``EigenFeatureGenerator`` | concatenate Eigen features. | +---------------------------+-------------------------------------------------+ -| ``pagerank`` | concatenate Pagerank scores. | +| ``PageRankFeatureGenerator`` | concatenate Pagerank scores. | +---------------------------+-------------------------------------------------+ -| ``PYGLocalDegreeProfile`` | concatenate Local Degree Profile features. | +| `` LocalDegreeProfileGenerator `` | concatenate Local Degree Profile features. 
| +---------------------------+-------------------------------------------------+ -| ``PYGNormalizeFeatures`` | Normalize all node features | +| ``NormalizeFeatures`` | Normalize all node features | +---------------------------+-------------------------------------------------+ -| ``PYGOneHotDegree`` | concatenate degree one-hot encoding. | +| ``OneHotDegreeGenerator`` | concatenate degree one-hot encoding. | +---------------------------+-------------------------------------------------+ -| ``onehot`` | concatenate node id one-hot encoding. | +| ``OneHotFeatureGenerator`` | concatenate node id one-hot encoding. | +---------------------------+-------------------------------------------------+ 2. ``selectors`` @@ -64,14 +60,14 @@ in configurations or as arguments of the autogl solver. +----------------------+--------------------------------------------------------------------------------+ | Base | Description | +======================+================================================================================+ -| ``SeFilterConstant`` | delete all constant and one-hot encoding node features. | +| ``FilterConstant`` | delete all constant and one-hot encoding node features. | +----------------------+--------------------------------------------------------------------------------+ -| ``gbdt`` | select top-k important node features ranked by Gradient Descent Decision Tree. | +| ``GBDTFeatureSelector`` | select top-k important node features ranked by Gradient Descent Decision Tree. | +----------------------+--------------------------------------------------------------------------------+ 3. ``graph`` -``netlsd`` is a graph feature generation method. please refer to the according document. +``NetLSD`` is a graph feature generation method. please refer to the according document. A set of graph feature extractors implemented in NetworkX are wrapped, please refer to NetworkX for details. 
(``NxLargeCliqueSize``, ``NxAverageClusteringApproximate``, ``NxDegreeAssortativityCoefficient``, ``NxDegreePearsonCorrelationCoefficient``, ``NxHasBridge`` ,``NxGraphCliqueNumber``, ``NxGraphNumberOfCliques``, ``NxTransitivity``, ``NxAverageClustering``, ``NxIsConnected``, ``NxNumberConnectedComponents``, @@ -87,24 +83,24 @@ Of course, you can directly inherit the ``BaseFeature`` as well. Create Your Own FE ------------------ You can create your own feature engineering object by simply inheriting one of feature engineering base types ,namely ``generators``, ``selectors`` , ``graph``, -and overloading methods ``_fit`` and ``_transform``. +and overloading methods ``extract_xx_features``. .. code-block :: python # for example : create a node one-hot feature. - from autogl.module.feature.generators.base import BaseGenerator - import numpy as np - class GeOnehot(BaseGenerator): - def __init__(self): - super(GeOnehot,self).__init__(data_t='np',multigraph=True,subgraph=False) - # data type in mid is 'numpy', - # and it can be used for multigraph, - # but not suitable for subgraph feature extraction. - - def _fit(self): - pass # nothing to train or memorize - - def _transform(self, data): - fe=np.eye(data.x.shape[0]) - data.x=np.concatenate([data.x,fe],axis=1) - return data + import autogl + import torch + from autogl.module.feature._generators._basic import BaseFeatureGenerator + + class OneHotFeatureGenerator(BaseFeatureGenerator): + # if overrider_features==False , concat the features with original features; otherwise override. 
+ def __init__(self, override_features: bool = False): + super(BaseFeatureGenerator, self).__init__(override_features) + + def _extract_nodes_feature(self, data: autogl.data.Data) -> torch.Tensor: + num_nodes: int = ( + data.x.size(0) + if data.x is not None and isinstance(data.x, torch.Tensor) + else (data.edge_index.max().item() + 1) + ) + return torch.eye(num_nodes) From 7cce7cc535816fe5de1f56821eed8b266ec69134 Mon Sep 17 00:00:00 2001 From: wondergo2017 Date: Wed, 13 Apr 2022 15:42:09 +0800 Subject: [PATCH 13/16] update fe_cn tutorial doc --- docs/docfile/tutorial/t_fe_cn.rst | 100 ++++++++++++++++++++++++++++++ 1 file changed, 100 insertions(+) create mode 100644 docs/docfile/tutorial/t_fe_cn.rst diff --git a/docs/docfile/tutorial/t_fe_cn.rst b/docs/docfile/tutorial/t_fe_cn.rst new file mode 100644 index 0000000..9b1b246 --- /dev/null +++ b/docs/docfile/tutorial/t_fe_cn.rst @@ -0,0 +1,100 @@ +.. _fe: + +AutoGL 特征工程 +========================== + +我们提供了一系列的节点和图的特征工程方法。您可以挑选需要的特征工程方法,并在一个特征工程管道中组合使用。 + +快速开始 +----------- +.. code-block :: python + + # 1. 选择一个数据集. + from autogl.datasets import build_dataset_from_name + data = build_dataset_from_name('cora') + + # 2. 选择特征工程方法 + from autogl.module.feature._base_feature_engineer._base_feature_engineer import _ComposedFeatureEngineer + from autogl.module.feature import EigenFeatureGenerator + from autogl.module.feature import NetLSD + + # 可以通过以下方式将多个特征工程方法组合起来 + fe = _ComposedFeatureEngineer([ + EigenFeatureGenerator(size=32), + NetLSD() + ]) + + # 3.拟合变换数据 + fe.fit(data) + data1=fe.transform(data,inplace=False) + + +特征工程方法 +--------------------- +现在支持3种类型的特征工程方法, 分别是 ``generators``, ``selectors`` , ``graph``。您可以像在 ``快速开始`` 部分一样引入对应的模块,或者可以直接在Config或者Solver中传入需要的方法名称。 + +1. 
``generators`` + ++---------------------------+-------------------------------------------------+ +| 方法名 | 描述 | ++===========================+=================================================+ +| ``GraphletGenerator`` | 生成local graphlet 数量作为节点特征 | ++---------------------------+-------------------------------------------------+ +| ``EigenFeatureGenerator`` | 生成特征向量作为节点特征 | ++---------------------------+-------------------------------------------------+ +| ``PageRankFeatureGenerator`` | 生成Pagerank 分数作为节点特征 | ++---------------------------+-------------------------------------------------+ +| `` LocalDegreeProfileGenerator `` | 生成Local Degree Profile作为节点特征 | ++---------------------------+-------------------------------------------------+ +| ``NormalizeFeatures`` | 归一化所有节点特征 | ++---------------------------+-------------------------------------------------+ +| ``OneHotDegreeGenerator`` | 生成节点度的独热编码作为节点特征 | ++---------------------------+-------------------------------------------------+ +| ``OneHotFeatureGenerator`` | 生成节点ID的独热编码作为节点特征 | ++---------------------------+-------------------------------------------------+ + +2. ``selectors`` + ++----------------------+--------------------------------------------------------------------------------+ +| 方法名 | 描述 | ++======================+================================================================================+ +| ``FilterConstant`` | 删除所有常量和独热编码节点特征 | ++----------------------+--------------------------------------------------------------------------------+ +| ``GBDTFeatureSelector`` | 通过梯度下降决策树对节点特征进行重要性排序,选择最重要的K个重要的节点特征 | ++----------------------+--------------------------------------------------------------------------------+ + +3. 
``graph`` + +``NetLSD`` 是一种图特征生成方法。 + +一系列NetworkX中的图特征生成方法被集成到库中, 若想了解详情,请查阅NetworkX的相关文档。 (``NxLargeCliqueSize``, ``NxAverageClusteringApproximate``, ``NxDegreeAssortativityCoefficient``, ``NxDegreePearsonCorrelationCoefficient``, ``NxHasBridge``, +``NxGraphCliqueNumber``, ``NxGraphNumberOfCliques``, ``NxTransitivity``, ``NxAverageClustering``, ``NxIsConnected``, ``NxNumberConnectedComponents``, +``NxIsDistanceRegular``, ``NxLocalEfficiency``, ``NxGlobalEfficiency``, ``NxIsEulerian``) + +特征工程方法根据其变换特征的方式进行分类。 ``generators`` 生成新特征并拼接或覆盖原始的特征。 而 ``selectors`` 选择原始特征中有用的部分。 +前两种可以在节点或者边的层级使用(更改节点或边的特征), 而 ``graph`` 关注图级别的特征工程(在图特征上进行修改)。 +如果您需要进一步开发使用,可以通过继承其中一种基础类进行修改;或者可以直接继承更加底层的 ``BaseFeature`` 类。 + +构建您自己的特征工程方法 +------------------------ +您可以继承其中一种特征工程基础类 ``BaseFeatureGenerator`` 或 ``BaseFeatureSelector`` 进行修改, 重载方法 ``extract_xx_features``。对于图层级特征工程,可以参考 ``_NetworkXGraphFeatureEngineer`` 的实现。 + +.. code-block :: python + + # 例子:创建节点ID独热编码特征 + import autogl + import torch + from autogl.module.feature._generators._basic import BaseFeatureGenerator + + class OneHotFeatureGenerator(BaseFeatureGenerator): + # 将 override_features 设置为 False 时,新特征会与原始特征拼接;否则直接覆盖原始特征。 + def __init__(self, override_features: bool = False): + super(BaseFeatureGenerator, self).__init__(override_features) + + def _extract_nodes_feature(self, data: autogl.data.Data) -> torch.Tensor: + num_nodes: int = ( + data.x.size(0) + if data.x is not None and isinstance(data.x, torch.Tensor) + else (data.edge_index.max().item() + 1) + ) + return torch.eye(num_nodes) From 925c517828a7d3fd2c122ef914d4abee2648e6cb Mon Sep 17 00:00:00 2001 From: Generall <465383637@qq.com> Date: Wed, 13 Apr 2022 16:58:36 +0800 Subject: [PATCH 14/16] Update README_cn.md --- README_cn.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README_cn.md b/README_cn.md index 8549812..8c0cc07 100644 --- a/README_cn.md +++ b/README_cn.md @@ -1,5 +1,5 @@ # 智图 (AutoGL) -[English Introduction](..) +[English Introduction](...) 
用于图数据的自动机器学习框架和工具包。 From 3e78ec6ddf6f9a16810616abd511a36a38327f8b Mon Sep 17 00:00:00 2001 From: Generall <465383637@qq.com> Date: Wed, 13 Apr 2022 16:58:53 +0800 Subject: [PATCH 15/16] Update README_cn.md --- README_cn.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README_cn.md b/README_cn.md index 8c0cc07..0def3f0 100644 --- a/README_cn.md +++ b/README_cn.md @@ -1,5 +1,5 @@ # 智图 (AutoGL) -[English Introduction](...) +[English Introduction](../..) 用于图数据的自动机器学习框架和工具包。 From dde3d3aef5dba695f80d27c191758ad23eb5e2f4 Mon Sep 17 00:00:00 2001 From: CoreLeader Date: Wed, 13 Apr 2022 18:05:00 +0800 Subject: [PATCH 16/16] Revise tutorial for backend and datasets --- docs/docfile/tutorial/t_backend-cn.rst | 33 ++++ docs/docfile/tutorial/t_backend.rst | 4 +- docs/docfile/tutorial/t_dataset-cn.rst | 100 ++++++++++++ docs/docfile/tutorial/t_dataset.rst | 218 ++++++++++--------------- 4 files changed, 221 insertions(+), 134 deletions(-) create mode 100644 docs/docfile/tutorial/t_backend-cn.rst create mode 100644 docs/docfile/tutorial/t_dataset-cn.rst diff --git a/docs/docfile/tutorial/t_backend-cn.rst b/docs/docfile/tutorial/t_backend-cn.rst new file mode 100644 index 0000000..574abe8 --- /dev/null +++ b/docs/docfile/tutorial/t_backend-cn.rst @@ -0,0 +1,33 @@ +.. _backend: + +Backend Support +=============== + +目前,AutoGL支持使用PyTorch-Geometric或Deep Graph Library作为后端,以便熟悉两者之一的用户均可受益于自动图学习。 + +为指定特定的后端,用户可以使用环境变量 ``AUTOGL_BACKEND`` 进行声明,例如: + +.. code-block:: shell + + AUTOGL_BACKEND=pyg python xxx.py + +或 + +.. code-block:: python + + import os + os.environ["AUTOGL_BACKEND"] = "pyg" + import autogl + + ... + + +如果环境变量 ``AUTOGL_BACKEND`` 未声明,AutoGL会根据用户的Python运行环境中所安装的图学习库自动选择。 +如果PyTorch-Geometric和Deep Graph Library均已安装,则Deep Graph Library将被作为默认的后端。 + +可以以编程方式获得当前使用的后端: + +.. 
code-block:: python + + from autogl.backend import DependentBackend + print(DependentBackend.get_backend_name()) diff --git a/docs/docfile/tutorial/t_backend.rst b/docs/docfile/tutorial/t_backend.rst index ae68c6e..4ec39ec 100644 --- a/docs/docfile/tutorial/t_backend.rst +++ b/docs/docfile/tutorial/t_backend.rst @@ -9,13 +9,13 @@ enable users from both end benifiting the automation of graph learning. To specify one specific backend, you can declare the backend using environment variables ``AUTOGL_BACKEND``. For example: -.. code-block :: shell +.. code-block:: shell AUTOGL_BACKEND=pyg python xxx.py or -.. code-block :: python +.. code-block:: python import os os.environ["AUTOGL_BACKEND"] = "pyg" diff --git a/docs/docfile/tutorial/t_dataset-cn.rst b/docs/docfile/tutorial/t_dataset-cn.rst new file mode 100644 index 0000000..a2106b5 --- /dev/null +++ b/docs/docfile/tutorial/t_dataset-cn.rst @@ -0,0 +1,100 @@ +.. _dataset: + +AutoGL 数据集 +============== + +我们基于PyTorch-Geometric (PyG),Deep Graph Library (DGL)及Open Graph Benchmark (OGB)等图学习库提供了多种多样的常用数据集。 +同时,用户可以使用AutoGL所提供的统一静态图容器 ``GeneralStaticGraph`` 自定义静态同构图及异构图,例如: + +.. 
code-block:: python + from autogl.data.graph import GeneralStaticGraph, GeneralStaticGraphGenerator + + ''' 创建同构图 ''' + custom_static_homogeneous_graph = GeneralStaticGraphGenerator.create_homogeneous_static_graph( + {'x': torch.rand(2708, 3), 'y': torch.rand(2708, 1)}, torch.randint(0, 1024, (2, 10556)) + ) + + ''' 创建异构图 ''' + custom_static_heterogeneous_graph = GeneralStaticGraphGenerator.create_heterogeneous_static_graph( + { + 'author': {'x': torch.rand(1024, 3), 'y': torch.rand(1024, 1)}, + 'paper': {'feat': torch.rand(2048, 10), 'z': torch.rand(2048, 13)} + }, + { + ('author', 'writing', 'paper'): (torch.randint(0, 1024, (2, 5120)), torch.rand(5120, 10)), + ('author', 'reading', 'paper'): torch.randint(0, 1024, (2, 3840)), + } + ) + + +提供的常用数据集 +---------------- +AutoGL目前提供如下多种常用基准数据集: + +半监督节点分类: + ++------------------+------------+-----------+--------------------------------+ +| 数据集 | PyG | DGL | 默认train/val/test划分 | ++==================+============+===========+================================+ +| Cora | ✓ | ✓ | ✓ | ++------------------+------------+-----------+--------------------------------+ +| Citeseer | ✓ | ✓ | ✓ | ++------------------+------------+-----------+--------------------------------+ +| Pubmed | ✓ | ✓ | ✓ | ++------------------+------------+-----------+--------------------------------+ +| Amazon Computers | ✓ | ✓ | | ++------------------+------------+-----------+--------------------------------+ +| Amazon Photo | ✓ | ✓ | | ++------------------+------------+-----------+--------------------------------+ +| Coauthor CS | ✓ | ✓ | | ++------------------+------------+-----------+--------------------------------+ +| Coauthor Physics | ✓ | ✓ | | ++------------------+------------+-----------+--------------------------------+ +| Reddit | ✓ | ✓ | ✓ | ++------------------+------------+-----------+--------------------------------+ +| ogbn-products | ✓ | ✓ | ✓ | ++------------------+------------+-----------+--------------------------------+ +| 
ogbn-proteins | ✓ | ✓ | ✓ | ++------------------+------------+-----------+--------------------------------+ +| ogbn-arxiv | ✓ | ✓ | ✓ | ++------------------+------------+-----------+--------------------------------+ +| ogbn-papers100M | ✓ | ✓ | ✓ | ++------------------+------------+-----------+--------------------------------+ + + +图分类任务: MUTAG, IMDB-Binary, IMDB-Multi, PROTEINS, COLLAB等 + ++-------------+------------+------------+--------------+------------+--------------------+ +| 数据集 | PyG | DGL | 节点特征 | 标签 | 边特征 | ++=============+============+============+==============+============+====================+ +| MUTAG | ✓ | ✓ | ✓ | ✓ | ✓ | ++-------------+------------+------------+--------------+------------+--------------------+ +| IMDB-Binary | ✓ | ✓ | | ✓ | | ++-------------+------------+------------+--------------+------------+--------------------+ +| IMDB-Multi | ✓ | ✓ | | ✓ | | ++-------------+------------+------------+--------------+------------+--------------------+ +| PROTEINS | ✓ | ✓ | ✓ | ✓ | | ++-------------+------------+------------+--------------+------------+--------------------+ +| COLLAB | ✓ | ✓ | | ✓ | | ++-------------+------------+------------+--------------+------------+--------------------+ +| ogbg-molhiv | ✓ | ✓ | ✓ | ✓ | ✓ | ++-------------+------------+------------+--------------+------------+--------------------+ +| ogbg-molpcba| ✓ | ✓ | ✓ | ✓ | ✓ | ++-------------+------------+------------+--------------+------------+--------------------+ +| ogbg-ppa | ✓ | ✓ | | ✓ | ✓ | ++-------------+------------+------------+--------------+------------+--------------------+ +| ogbg-code2 | ✓ | ✓ | ✓ | ✓ | ✓ | ++-------------+------------+------------+--------------+------------+--------------------+ + + +链接预测任务:目前AutoGL可以使用针对节点分类任务的多种图数据进行自动链接预测。 + +通过GeneralStaticGraph序列构建自定义数据集 +---------------------------------------------------------------- +如下代码片段展示了通过一个由``GeneralStaticGraph``序列构建自定义数据集的方法。 + +.. 
code-block:: python + from autogl.data import InMemoryDataset + ''' graphs变量是一个由GeneralStaticGraph实例所构成的序列 ''' + graphs = [ ... ] + custom_dataset = InMemoryDataset(graphs) diff --git a/docs/docfile/tutorial/t_dataset.rst b/docs/docfile/tutorial/t_dataset.rst index 6cb6bd4..7ec0df7 100644 --- a/docs/docfile/tutorial/t_dataset.rst +++ b/docs/docfile/tutorial/t_dataset.rst @@ -3,144 +3,98 @@ AutoGL Dataset ============== -We import the module of datasets from `CogDL` and `PyTorch Geometric` and add support for datasets from `OGB`. One can refer to the usage of creating and building datasets via the tutorial of `CogDL`_, `PyTorch Geometric`_, and `OGB`_. +We provide various common datasets based on ``PyTorch-Geometric``, ``Deep Graph Library`` and ``OGB``. +Besides, users can leverage ``GeneralStaticGraph``, a unified abstraction provided by AutoGL that supports both static homogeneous graphs and static heterogeneous graphs. -.. _CogDL: https://cogdl.readthedocs.io/en/latest/tutorial.html -.. _PyTorch Geometric: https://pytorch-geometric.readthedocs.io/en/latest/notes/create_dataset.html -.. _OGB: https://ogb.stanford.edu/docs/dataset_overview/ +A basic example of constructing an instance of ``GeneralStaticGraph`` is shown below. + +.. 
code-block:: python + from autogl.data.graph import GeneralStaticGraph, GeneralStaticGraphGenerator + + ''' Construct a custom homogeneous graph ''' + custom_static_homogeneous_graph: GeneralStaticGraph = GeneralStaticGraphGenerator.create_homogeneous_static_graph( + {'x': torch.rand(2708, 3), 'y': torch.rand(2708, 1)}, torch.randint(0, 1024, (2, 10556)) + ) + + ''' Construct a custom heterogeneous graph ''' + custom_static_heterogeneous_graph: GeneralStaticGraph = GeneralStaticGraphGenerator.create_heterogeneous_static_graph( + { + 'author': {'x': torch.rand(1024, 3), 'y': torch.rand(1024, 1)}, + 'paper': {'feat': torch.rand(2048, 10), 'z': torch.rand(2048, 13)} + }, + { + ('author', 'writing', 'paper'): (torch.randint(0, 1024, (2, 5120)), torch.rand(5120, 10)), + ('author', 'reading', 'paper'): torch.randint(0, 1024, (2, 3840)), + } + ) Supporting datasets ------------------- AutoGL now supports the following benchmarks for different tasks: -Semi-supervised node classification: Cora, Citeseer, Pubmed, Amazon Computers\*, Amazon Photo\*, Coauthor CS\*, Coauthor Physics\*, Reddit (\*: using `utils.random_splits_mask_class` for splitting dataset is recommended.). -For detailed information for supporting datasets, please kindly refer to `PyTorch Geometric Dataset`_. - -.. 
_PyTorch Geometric Dataset: https://pytorch-geometric.readthedocs.io/en/latest/modules/datasets.html - -+------------------+------------+-----------+------------+------------+-----------+------------+--------------------+---------------------+ -| Dataset | PyG | CogDL | x | y | edge_index| edge_attr | train/val/test node | train/val/test mask | -+==================+============+===========+============+============+===========+============+====================+=====================+ -| Cora | ✓ | | ✓ | ✓ | ✓ | ✓ | | ✓ | -+------------------+------------+-----------+------------+------------+-----------+------------+--------------------+---------------------+ -| Citeseer | ✓ | | ✓ | ✓ | ✓ | ✓ | | ✓ | -+------------------+------------+-----------+------------+------------+-----------+------------+--------------------+---------------------+ -| Pubmed | ✓ | | ✓ | ✓ | ✓ | ✓ | | ✓ | -+------------------+------------+-----------+------------+------------+-----------+------------+--------------------+---------------------+ -| Amazon Computers | ✓ | | ✓ | ✓ | ✓ | ✓ | | | -+------------------+------------+-----------+------------+------------+-----------+------------+--------------------+---------------------+ -| Amazon Photo | ✓ | | ✓ | ✓ | ✓ | ✓ | | | -+------------------+------------+-----------+------------+------------+-----------+------------+--------------------+---------------------+ -| Coauthor CS | ✓ | | ✓ | ✓ | ✓ | ✓ | | | -+------------------+------------+-----------+------------+------------+-----------+------------+--------------------+---------------------+ -| Coauthor Physics | ✓ | | ✓ | ✓ | ✓ | ✓ | | | -+------------------+------------+-----------+------------+------------+-----------+------------+--------------------+---------------------+ -| Reddit | ✓ | | ✓ | ✓ | ✓ | ✓ | | ✓ | -+------------------+------------+-----------+------------+------------+-----------+------------+--------------------+---------------------+ - -Graph classification: MUTAG, IMDB-B, 
IMDB-M, PROTEINS, COLLAB - -+-----------+------------+------------+-----------+------------+------------+-----------+ -| Dataset | PyG | CogDL | x | y | edge_index | edge_attr | -+===========+============+============+===========+============+============+===========+ -| MUTAG | ✓ | | ✓ | ✓ | ✓ | ✓ | -+-----------+------------+------------+-----------+------------+------------+-----------+ -| IMDB-B | ✓ | | | ✓ | ✓ | | -+-----------+------------+------------+-----------+------------+------------+-----------+ -| IMDB-M | ✓ | | | ✓ | ✓ | | -+-----------+------------+------------+-----------+------------+------------+-----------+ -| PROTEINS | ✓ | | ✓ | ✓ | ✓ | | -+-----------+------------+------------+-----------+------------+------------+-----------+ -| COLLAB | ✓ | | | ✓ | ✓ | | -+-----------+------------+------------+-----------+------------+------------+-----------+ - -TODO: Supporting all datasets from `PyTorch Geometric`. - -OGB datasets ------------- -AutoGL also supports the popular benchmark on `OGB` for node classification and graph classification tasks. For the summary of `OGB` datasets, please kindly refer to the their `docs`_. - -.. 
_docs: https://ogb.stanford.edu/docs/nodeprop/ - -Since the loss and evaluation metric used for `OGB` datasets vary among different tasks, we also add `string` properties of datasets for identification: - -+-----------------+----------------+-------------------+ -| Dataset | dataset.metric | datasets.loss | -+=================+================+===================+ -| ogbn-products | Accuracy | nll_loss | -+-----------------+----------------+-------------------+ -| ogbn-proteins | ROC-AUC | BCEWithLogitsLoss | -+-----------------+----------------+-------------------+ -| ogbn-arxiv | Accuracy | nll_loss | -+-----------------+----------------+-------------------+ -| ogbn-papers100M | Accuracy | nll_loss | -+-----------------+----------------+-------------------+ -| ogbn-mag | Accuracy | nll_loss | -+-----------------+----------------+-------------------+ -| ogbg-molhiv | ROC-AUC | BCEWithLogitsLoss | -+-----------------+----------------+-------------------+ -| ogbg-molpcba | AP | BCEWithLogitsLoss | -+-----------------+----------------+-------------------+ -| ogbg-ppa | Accuracy | CrossEntropyLoss | -+-----------------+----------------+-------------------+ -| ogbg-code | F1 score | CrossEntropyLoss | -+-----------------+----------------+-------------------+ - - -Create a dataset via URL ------------------------- - -If your dataset is the same as the 'ppi' dataset, which contains two matrices: 'network' and 'group', you can register your dataset directly use the above code. The default root for downloading dataset is `~/.cache-autogl`, you can also specify the root by passing the string to the `path` in `build_dataset(args, path)` or `build_dataset_from_name(dataset, path)`. - -.. 
code-block:: python - - # following code-snippet is from autogl/datasets/matlab_matrix.py - - @register_dataset("ppi") - class PPIDataset(MatlabMatrix): - def __init__(self, path): - dataset, filename = "ppi", "Homo_sapiens" - url = "http://snap.stanford.edu/node2vec/" - super(PPIDataset, self).__init__(path, filename, url) - -You should declare the name of the dataset, the name of the file, and the URL, where our script can download the resource. Then you can use either `build_dataset(args, path)` or `build_dataset_from_name(dataset, path)` in your task to build a dataset with corresponding parameters. - -Create a dataset locally ------------------------- - -If you want to test your local dataset, we recommend you to refer to the docs on `creating PyTorch Geometric dataset`_. - -.. _creating PyTorch Geometric dataset: https://pytorch-geometric.readthedocs.io/en/latest/notes/create_dataset.html - - -You can simply inherit from `torch_geometric.data.InMemoryDataset` to create an empty `dataset`, then create some `torch_geometric.data.Data` objects for your data and pass a regular python list holding them, then pass them to `torch_geometric.data.Dataset` or `torch_geometric.data.DataLoader`. -Let’s see this process in a simplified example: +Semi-supervised node classification: Cora, Citeseer, Pubmed, Amazon Computers, Amazon Photo, Coauthor CS, Coauthor Physics, Reddit, etc. 
+ ++------------------+------------+-----------+--------------------------------+ +| Dataset | PyG | DGL | default train/val/test split | ++==================+============+===========+================================+ +| Cora | ✓ | ✓ | ✓ | ++------------------+------------+-----------+--------------------------------+ +| Citeseer | ✓ | ✓ | ✓ | ++------------------+------------+-----------+--------------------------------+ +| Pubmed | ✓ | ✓ | ✓ | ++------------------+------------+-----------+--------------------------------+ +| Amazon Computers | ✓ | ✓ | | ++------------------+------------+-----------+--------------------------------+ +| Amazon Photo | ✓ | ✓ | | ++------------------+------------+-----------+--------------------------------+ +| Coauthor CS | ✓ | ✓ | | ++------------------+------------+-----------+--------------------------------+ +| Coauthor Physics | ✓ | ✓ | | ++------------------+------------+-----------+--------------------------------+ +| Reddit | ✓ | ✓ | ✓ | ++------------------+------------+-----------+--------------------------------+ +| ogbn-products | ✓ | ✓ | ✓ | ++------------------+------------+-----------+--------------------------------+ +| ogbn-proteins | ✓ | ✓ | ✓ | ++------------------+------------+-----------+--------------------------------+ +| ogbn-arxiv | ✓ | ✓ | ✓ | ++------------------+------------+-----------+--------------------------------+ +| ogbn-papers100M | ✓ | ✓ | ✓ | ++------------------+------------+-----------+--------------------------------+ + +Graph classification: MUTAG, IMDB-Binary, IMDB-Multi, PROTEINS, COLLAB, etc. 
+ ++-------------+------------+------------+--------------+------------+--------------------+ +| Dataset | PyG | DGL | Node Feature | Label | Edge Features | ++=============+============+============+==============+============+====================+ +| MUTAG | ✓ | ✓ | ✓ | ✓ | ✓ | ++-------------+------------+------------+--------------+------------+--------------------+ +| IMDB-Binary | ✓ | ✓ | | ✓ | | ++-------------+------------+------------+--------------+------------+--------------------+ +| IMDB-Multi | ✓ | ✓ | | ✓ | | ++-------------+------------+------------+--------------+------------+--------------------+ +| PROTEINS | ✓ | ✓ | ✓ | ✓ | | ++-------------+------------+------------+--------------+------------+--------------------+ +| COLLAB | ✓ | ✓ | | ✓ | | ++-------------+------------+------------+--------------+------------+--------------------+ +| ogbg-molhiv | ✓ | ✓ | ✓ | ✓ | ✓ | ++-------------+------------+------------+--------------+------------+--------------------+ +| ogbg-molpcba| ✓ | ✓ | ✓ | ✓ | ✓ | ++-------------+------------+------------+--------------+------------+--------------------+ +| ogbg-ppa | ✓ | ✓ | | ✓ | ✓ | ++-------------+------------+------------+--------------+------------+--------------------+ +| ogbg-code2 | ✓ | ✓ | ✓ | ✓ | ✓ | ++-------------+------------+------------+--------------+------------+--------------------+ + +Link Prediction: At present, AutoGL utilizes various homogeneous graphs towards node classification to conduct automatic link prediction. + +Construct custom dataset by instances of GeneralStaticGraph +------------------------------------------------------------ +The following example shows the way to compose a custom dataset by a sequence of instances of ``GeneralStaticGraph``. .. 
code-block:: python - - from typing import Iterable - from torch_geometric.data.data import Data - from autogl.datasets import build_dataset_from_name - from torch_geometric.data import InMemoryDataset - - class MyDataset(InMemoryDataset): - def __init__(self, datalist) -> None: - super().__init__() - self.data, self.slices = self.collate(datalist) - - # Create your own Data objects - - # for example, if you have edge_index, features and labels - # you can create a Data as follows - # See pytorch geometric more info of Data - data = Data() - data.edge_index = edge_index - data.x = features - data.y = labels - - # create a list of Data object - data_list = [data, Data(...), ..., Data(...)] - - # Initialize AutoGL Dataset with your own data - myData = MyDataset(data_list) + from autogl.data import InMemoryDataset + ''' Suppose the graphs is a sequence of instances of GeneralStaticGraph ''' + graphs = [ ... ] + custom_dataset = InMemoryDataset(graphs)