diff --git a/autogl/module/train/node_classification_trainer/node_classification_sampled_trainer.py b/autogl/module/train/node_classification_trainer/node_classification_sampled_trainer.py
index 9ccdfce..415e9bd 100644
--- a/autogl/module/train/node_classification_trainer/node_classification_sampled_trainer.py
+++ b/autogl/module/train/node_classification_trainer/node_classification_sampled_trainer.py
@@ -166,7 +166,7 @@ class NodeClassificationGraphSAINTTrainer(BaseNodeClassificationTrainer):
         self.__num_graphs_per_epoch: int = num_graphs_per_epoch
 
         " Set sampled_budget "
-        sampled_budget: int = kwargs.get("sampled_budget")
+        sampled_budget: int = kwargs.get("sampled_budget", 1e4)
         # todo: This is a version caused by current unreasonable initialization process
         # todo: Refactor the framework for trainer to fix in future version
         # if type(sampled_budget) != int:
@@ -197,11 +197,16 @@ class NodeClassificationGraphSAINTTrainer(BaseNodeClassificationTrainer):
             __cpu_count: _typing.Optional[int] = os.cpu_count()
             return __cpu_count if __cpu_count else 0
 
-        self.__training_sampler_num_workers: int = kwargs.get(
-            "training_sampler_num_workers", _cpu_count()
-        )
-        if not 0 <= self.__training_sampler_num_workers <= _cpu_count():
-            self.__training_sampler_num_workers: int = _cpu_count()
+        # self.__training_sampler_num_workers: int = kwargs.get(
+        #     "training_sampler_num_workers", _cpu_count()
+        # )
+        
+        # if not 0 <= self.__training_sampler_num_workers <= _cpu_count():
+        #     self.__training_sampler_num_workers: int = _cpu_count()
+
+        # Force to 0 to stay compatible with the current PyG solution.
+        self.__training_sampler_num_workers: int = 0
+        
         super(NodeClassificationGraphSAINTTrainer, self).__init__(
             model, num_features, num_classes, device, init, feval, loss
         )
diff --git a/docs/docfile/tutorial/t_model.rst b/docs/docfile/tutorial/t_model.rst
index b1ee81e..e46a94c 100644
--- a/docs/docfile/tutorial/t_model.rst
+++ b/docs/docfile/tutorial/t_model.rst
@@ -3,68 +3,266 @@
 AutoGL Model
 ============
 
-AutoGL project uses ``model`` to define the common graph nerual networks and ``automodel`` to denote the relative class that includes some auto functions. Currently, we support the following models and automodels:
+In AutoGL, we use ``model`` and ``automodel`` to define the logic of graph neural networks and make it compatible with hyper-parameter optimization. Currently, we support the following models for the given tasks.
 
-* ``GCN`` and ``AutoGCN`` : graph convolutional network from https://arxiv.org/abs/1609.02907
-* ``GAT`` and ``AutoGAT`` : graph attentional network from https://arxiv.org/abs/1710.10903
-* ``GraphSAGE`` and ``AutoGraphSAGE`` : from the "Inductive Representation Learning on Large Graphs" https://arxiv.org/abs/1706.02216
++----------------------+----------------------------+
+| Tasks                | Models                     |
++======================+============================+
+| Node Classification  | ``gcn``, ``gat``, ``sage`` |
++----------------------+----------------------------+
+| Graph Classification | ``gin``, ``topk``          |
++----------------------+----------------------------+
+| Link Prediction      | ``gcn``, ``gat``, ``sage`` |
++----------------------+----------------------------+
 
-And we also support the following models and automodels for graph classification tasks:
-* ``GIN`` and ``AutoGIN`` : graph isomorphism network from https://arxiv.org/abs/1810.00826
-* ``Topkpool`` and ``AutoTopkpool`` : graph U-Net from https://arxiv.org/abs/1905.05178, https://arxiv.org/abs/1905.02850
+Lazy Initialization
+-------------------
+
+In the current AutoGL pipeline, some important model-related hyper-parameters cannot be set before the pipeline runs (e.g. the input dimension, which can only be calculated at run time after feature engineering). Therefore, ``automodel`` uses lazy initialization to initialize the core ``model``. When the ``automodel`` initialization method ``__init__()`` is called with the argument ``init`` set to ``False``, only (part of) the hyper-parameters will be set. The ``automodel`` will have its core ``model`` only after ``initialize()`` is explicitly called, which is done automatically in ``solver`` and ``from_hyper_parameter()`` after all the hyper-parameters are set properly.
 
 Define your own model and automodel
 -----------------------------------
 
-If you want to add your own model and automodel for some task, the only thing you should do is add a new model where the forward function should be fulfilled and a new automodel inherited from the basemodel.
+We highly recommend that you define both ``model`` and ``automodel``, although only your ``automodel`` needs to communicate with ``solver`` and ``trainer``. The ``model`` is responsible for parameter initialization and the forward logic, while the ``automodel`` is responsible for defining and organizing the hyper-parameters.
+
+General customization
+^^^^^^^^^^^^^^^^^^^^^
 
-For new models used in link prediction tasks, you should fulfill the lp_encode and lp_decode function. The difference between lp_encode and forward function is that there is not classification layer in lp_encode.
+Let's say you want to implement a simple MLP for node classification and let AutoGL find the best hyper-parameters for you. You can first define the logic, assuming all the hyper-parameters are given.
+
+.. code-block:: python
 
+    import torch
+
+    # define mlp model, need to inherit from torch.nn.Module
+    class MyMLP(torch.nn.Module):
+        # assume you already get all the hyper-parameters
+        def __init__(self, in_channels, num_classes, layer_num, dim):
+            super().__init__()
+            if layer_num == 1:
+                ops = [torch.nn.Linear(in_channels, num_classes)]
+            else:
+                ops = [torch.nn.Linear(in_channels, dim)]
+                for i in range(layer_num - 2):
+                    ops.append(torch.nn.Linear(dim, dim))
+                ops.append(torch.nn.Linear(dim, num_classes))
+        
+            self.core = torch.nn.Sequential(*ops)
+        
+        # this method is required
+        def forward(self, data):
+            # data: torch_geometric.data.Data
+            assert hasattr(data, 'x'), 'MLP only support graph data with features'
+            x = data.x
+            return torch.nn.functional.log_softmax(self.core(x))
 
-Firstly, you should define your model if it does not belong to the models above.
 
-Secondly, you should define your corresponding automodel.
+After you define the logic of ``model``, you can now define your ``automodel`` to manage the hyper-parameters.
 
 .. code-block:: python
 
-    # 1. define your search space to self.space of your automodel instance
-    [
-        {'parameterName': 'num_layers', 'type': 'DISCRETE', 'feasiblePoints': '2,3,4'},
-        {"parameterName": 'hidden', "type": "NUMERICAL_LIST", "numericalType": "INTEGER", "length": 3, "minValue": [8, 8, 8], "maxValue": [64, 64, 64], "scalingType": "LOG"},
-        {'parameterName': 'dropout', 'type': 'DOUBLE', 'maxValue': 0.9, 'minValue': 0.1, 'scalingType': 'LINEAR'},
-        {'parameterName': 'act', 'type': 'CATEGORICAL_LIST', "feasiblePoints": ['leaky_relu', 'relu', 'elu', 'tanh']},
-    ]
-    # 2. define the default point to self.hyperparams of your automodel instance
-    {
-        'num_layers': 2,
-        'hidden': [16],
-        'dropout': 0.2,
-        'act': 'leaky_relu'
-    }
-
-Where ``self.space`` is a list of dictionary indicating the name, type, feasible point, min/max value and some properties of the parameter. ``self.hyperparams`` is a dictionary indicating the hyper-parameters used in this model.
-
-Finally, you can use the defined model and automodel for the specific need.
+    from autogl.module.model import BaseModel
+    
+    # define your automodel, need to inherit from BaseModel
+    class MyAutoMLP(BaseModel):
+        def __init__(self):
+            # (required) make sure you call __init__ of super with init argument properly set.
+            # if you do not want to initialize inside __init__, please pass False.
+            super().__init__(init=False)
+
+            # (required) define the search space
+            self.space = [
+                {'parameterName': 'layer_num', 'type': 'INTEGER', 'minValue': 1, 'maxValue': 5, 'scalingType': 'LINEAR'},
+                {'parameterName': 'dim', 'type': 'INTEGER', 'minValue': 64, 'maxValue': 128, 'scalingType': 'LINEAR'}
+            ]
+
+            # set default hyper-parameters
+            self.layer_num = 2
+            self.dim = 72
+
+            # for the hyper-parameters that are related with dataset, you can just set them to None
+            self.num_classes = None
+            self.num_features = None
+
+            # (required) since we don't know the num_classes and num_features until we see the dataset,
+            # we cannot initialize the models when instantiated. the initialized will be set to False.
+            self.initialized = False
+
+            # (required) set the device of current auto model
+            self.device = torch.device('cuda')
+
+        # (required) get current hyper-parameters of this automodel
+        # need to return a dictionary whose keys are the same with self.space
+        def get_hyper_parameter(self):
+            return {
+                'layer_num': self.layer_num,
+                'dim': self.dim
+            }
+        
+        # (required) override to interact with num_classes
+        def get_num_classes(self):
+            return self.num_classes
+        
+        # (required) override to interact with num_classes
+        def set_num_classes(self, n_classes):
+            self.num_classes = n_classes
+        
+        # (required) override to interact with num_features
+        def get_num_features(self):
+            return self.num_features
+        
+        # (required) override to interact with num_features
+        def set_num_features(self, n_features):
+            self.num_features = n_features
+
+        # (required) instantiate the core MLP model using corresponding hyper-parameters
+        def initialize(self):
+            # (required) you need to make sure the core model is named as `self.model`
+            self.model = MyMLP(
+                in_channels = self.num_features,
+                num_classes = self.num_classes,
+                layer_num = self.layer_num,
+                dim = self.dim
+            ).to(self.device)
+
+            self.initialized = True
+        
+        # (required) override to create a copy of model using provided hyper-parameters
+        def from_hyper_parameter(self, hp):
+            # hp is a dictionary that contains keys and values corresponding to your self.space
+            # in this case, it will be in form {'layer_num': XX, 'dim': XX}
+            
+            # create a new instance
+            ret = self.__class__()
+
+            # set the hyper-parameters related to dataset and device
+            ret.num_classes = self.num_classes
+            ret.num_features = self.num_features
+            ret.device = self.device
+
+            # set the hyper-parameters according to hp
+            ret.layer_num = hp['layer_num']
+            ret.dim = hp['dim']
+
+            # initialize it before returning
+            ret.initialize()
+
+            return ret
+        
+
+Then, you can use this node classification model as part of AutoNodeClassifier ``solver``.
 
 .. code-block :: python
 
-    # for example
-    import torch
-    from .base import BaseModel
-    class YourGNN(torch.nn.Module):
+    from autogl.solver import AutoNodeClassifier
+
+    solver = AutoNodeClassifier(graph_models=(MyAutoMLP(),))
+
+
+The model for graph classification is generally the same, except that you can now also receive ``num_graph_features`` (the dimension of the graph-level feature) by overriding ``set_num_graph_features(self, n_graph_features)`` of ``BaseModel``. Also, please remember to return graph-level logits instead of node-level ones in ``forward()`` of ``model``.
+
+Model for link prediction
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+For link prediction, the definition of the model is a bit different from the common forward definition. You need to implement ``lp_encode(self, data)`` and ``lp_decode(self, x, pos_edge_index, neg_edge_index)`` to interact with ``LinkPredictionTrainer`` and ``AutoLinkPredictor``. Taking the class ``MyMLP`` defined above as an example, if you want to perform link prediction:
+
+.. code-block:: python
+
+    class MyMLPForLP(torch.nn.Module):
+        # num_classes is removed since it is invalid for link prediction
+        def __init__(self, in_channels, layer_num, dim):
+            super().__init__()
+            ops = [torch.nn.Linear(in_channels, dim)]
+            for i in range(layer_num - 1):
+                ops.append(torch.nn.Linear(dim, dim))
+        
+            self.core = torch.nn.Sequential(*ops)
+
+        # (required) for interaction with link prediction trainer and solver
+        def lp_encode(self, data):
+            return self.core(data.x)
+
+        # (required) for interaction with link prediction trainer and solver
+        def lp_decode(self, x, pos_edge_index, neg_edge_index):
+            # first, get all the edge_index need calculated
+            edge_index = torch.cat([pos_edge_index, neg_edge_index], dim=-1)
+            # then, use dot-products to calculate logits, you can use whatever decode method you want
+            logits = (x[edge_index[0]] * x[edge_index[1]]).sum(dim=-1)
+            return logits
+
+    class MyAutoMLPForLP(MyAutoMLP):
+        def initialize(self):
+            # init MyMLPForLP instead of MyMLP
+            self.model = MyMLPForLP(
+                in_channels = self.num_features,
+                layer_num = self.layer_num,
+                dim = self.dim
+            ).to(self.device)
+
+            self.initialized = True
+
+
+Model with sampling support
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Towards efficient representation learning on large-scale graphs, AutoGL currently supports node classification using sampling techniques including node-wise sampling, layer-wise sampling, and graph-wise sampling. See more about sampling in :ref:`trainer`.
+
+To conduct node classification with a sampling technique using your custom model, further adaptation and modification are generally required.
+According to the message-passing mechanism of Graph Neural Networks (GNNs), numerous nodes in the multi-hop neighborhood of the evaluation set or test set are potentially involved when evaluating a GNN model on a large-scale graph dataset.
+As the representations of those numerous nodes are likely to occupy a large amount of computational resources, the common forwarding process is generally infeasible for model evaluation on large-scale graphs.
+An iterative representation learning mechanism is a practical and feasible way to evaluate a **Sequential Model**,
+which consists only of multiple sequential layers, with each layer taking a ``Data`` aggregate as input. The input ``Data`` has the same functionality as ``torch_geometric.data.Data``, which conventionally provides the properties ``x``, ``edge_index``, and optionally ``edge_weight``.
+If your custom model is composed of concatenated layers, you should make your model inherit from ``ClassificationSupportedSequentialModel`` so that the layer-wise representation learning mechanism can efficiently conduct representation learning for your custom sequential model.
+
+.. code-block:: python
+
+    import autogl
+    from autogl.module.model.base import ClassificationSupportedSequentialModel
+
+    # override Linear so that it can take graph data as input
+    class Linear(torch.nn.Linear):
         def forward(self, data):
-            pass  # Your forward function
-
-    class YourAutoGNN(BaseModel):
-        def __init__(self, num_features=None, num_classes=None, device=None, init=True, **args):
-            """
-            num_features: the number of features
-            num_classes: the number of classes
-            device: your device to run code
-            init: if True, the model will be initialize
-            """
-            self.space = XXX  # Define your search space
-            self.hyperparams = XXX  # Define your hyper-parameters
-            self.initialized = False
-            if init is True:
-                self.initialize()
+            return super().forward(data.x)
+
+    class MyMLPSampling(ClassificationSupportedSequentialModel):
+        def __init__(self, in_channels, num_classes, layer_num, dim):
+            super().__init__()
+            if layer_num == 1:
+                ops = [Linear(in_channels, num_classes)]
+            else:
+                ops = [Linear(in_channels, dim)]
+                for i in range(layer_num - 2):
+                    ops.append(Linear(dim, dim))
+                ops.append(Linear(dim, num_classes))
+
+            self.core = torch.nn.ModuleList(ops)
+
+        # (required) override sequential_encoding_layers property to interact with sampling
+        @property
+        def sequential_encoding_layers(self) -> torch.nn.ModuleList:
+            return self.core
+        
+        # (required) define the encode logic of classification for sampling
+        def cls_encode(self, data):
+            # if you use sampling, the data will be passed in one of two possible ways;
+            # you can tell them apart using the following rule
+            if hasattr(data, 'edge_indexes'):
+                # the edge_indexes are a list of edge_index, one for each layer
+                edge_indexes = data.edge_indexes
+                edge_weights = [None] * len(self.core) if getattr(data, 'edge_weights', None) is None else data.edge_weights
+            else:
+                # the edge_index and edge_weight will stay the same as default
+                edge_indexes = [data.edge_index] * len(self.core)
+                edge_weights = [getattr(data, 'edge_weight', None)] * len(self.core)
+
+            x = data.x
+            for i in range(len(self.core)):
+                data = autogl.data.Data(x=x, edge_index=edge_indexes[i])
+                data.edge_weight = edge_weights[i]
+                x = self.sequential_encoding_layers[i](data)
+            return x
+
+        # (required) define the decode logic of classification for sampling
+        def cls_decode(self, x):
+            return torch.nn.functional.log_softmax(x)
+
diff --git a/docs/docfile/tutorial/t_trainer.rst b/docs/docfile/tutorial/t_trainer.rst
index 86979e5..36f9888 100644
--- a/docs/docfile/tutorial/t_trainer.rst
+++ b/docs/docfile/tutorial/t_trainer.rst
@@ -10,34 +10,10 @@ AutoGL project use ``trainer`` to handle the auto-training of tasks. Currently,
 * ``LinkPredictionTrainer`` for link prediction
 
 
-Initialization
---------------
+Lazy Initialization
+-------------------
+For a reason similar to :ref:`model`, we also use lazy initialization for all trainers. Only (part of) the hyper-parameters will be set when ``__init__()`` is called. The ``trainer`` will have its core ``model`` only after ``initialize()`` is explicitly called, which is done automatically in ``solver`` and ``duplicate_from_hyper_parameter()`` after all the hyper-parameters are set properly.
 
-A trainer can either be initialized from its ``__init__()``. If you want to build a trainer by ``__init__()``, you need to pass the following parameters to it, namely as ``model``, ``num_features``, and ``num_classes`` and ``auto ensemble``. You can also define some parameters alternatively, including ``optimizer``, ``lr``, ``max_epoch``, ``early_stopping_round``, ``weight_decay`` and etc.
-
-In the ``__init__()``, you need to define the space and hyperparameter of your trainer:  
-
-.. code-block:: python
-
-    # 1. define your search space of trainer
-    self.space = [
-        {'parameterName': 'max_epoch', 'type': 'INTEGER', 'maxValue': 300, 'minValue': 10, 'scalingType': 'LINEAR'},
-        {'parameterName': 'early_stopping_round', 'type': 'INTEGER', 'maxValue': 30, 'minValue': 10,
-             'scalingType': 'LINEAR'},
-        {'parameterName': 'lr', 'type': 'DOUBLE', 'maxValue': 1e-3, 'minValue': 1e-4, 'scalingType': 'LOG'},
-        {'parameterName': 'weight_decay', 'type': 'DOUBLE', 'maxValue': 5e-3, 'minValue': 5e-4,
-             'scalingType': 'LOG'},
-    ]
-
-    # 2. define the initial point of hyperparameter search of your trainer
-    self.hyperparams = {
-        'max_epoch': self.max_epoch,
-        'early_stopping_round': self.early_stopping_round,
-        'lr': self.lr,
-        'weight_decay': self.weight_decay
-    }
-
-Where ``self.space`` is a list of dictionary indicating the name, type, and some properties of the parameter. ``self.hyperparams`` is a dictionary indicating the hyper-parameters used in this trainer.
 
 Train and Predict
 -----------------
@@ -48,7 +24,7 @@ We have given the training and testing functions for the tasks of node classific
 The evaluation function is defined in ``evaluate()``, you can use your our evaluation metrics and methods.
 
 Node Classification with Sampling
-------------------------------------
+---------------------------------
 According to various present studies, training with spatial sampling has been demonstrated
 as an efficient technique for representation learning on large-scale graph.
 We provide implementations for various representative sampling mechanisms including
@@ -84,14 +60,131 @@ The sampling techniques can be utilized by adopting corresponding trainer
 and ``NodeClassificationNeighborSamplingTrainer``.
 You can either specify the corresponding name of trainer in YAML configuration file
 or instantiate the solver ``AutoNodeClassifier``
-with the instance of specific trainer as ``model`` argument.
+with the instance of a specific trainer. However, please make sure to manage some key
+hyper-parameters properly inside the hyper-parameter space. Specifically:
+
+For ``NodeClassificationLayerDependentImportanceSamplingTrainer``, you need to set the
+hyper-parameter ``sampled_node_sizes`` properly. The space of ``sampled_node_sizes`` should
+be a list whose length equals the number of layers of your **Sequential Model**. For example, if you have a
+model with 4 layers, you need to pass the hyper-parameter space as follows:
+
+.. code-block:: python
+
+    solver = AutoNodeClassifier(
+        graph_models=(A_MODEL_WITH_4_LAYERS,),
+        default_trainer='NodeClassificationLayerDependentImportanceSamplingTrainer',
+        trainer_hp_space=[
+            # (required) you need to set the trainer_hp_space properly.
+            {
+                'parameterName': 'sampled_node_sizes',
+                'type': 'NUMERICAL_LIST', 
+                "numericalType": "INTEGER",
+                "length": 4,                    # same with the layer number of your model
+                "minValue": [200,200,200,200],
+                "maxValue": [1000,1000,1000,1000],
+                "scalingType": "LOG"
+            },
+            ...
+        ]
+    )
+
+If the layer number of your model is a searchable hyper-parameter, you can also set ``cutPara``
+and ``cutFunc`` properly to link it with the layer-number hyper-parameter of your model.
+
+.. code-block:: python
+
+    '''
+    Suppose the layer number of your model is of the following forms:
+    {
+        'parameterName': 'layer_number',
+        'type': 'INTEGER',
+        'minValue': 2,
+        'maxValue': 4,
+        'scalingType': 'LOG'
+    }
+    '''
+
+    solver = AutoNodeClassifier(
+        graph_models=(A_MODEL_WITH_DYNAMIC_LAYERS,),
+        default_trainer='NodeClassificationLayerDependentImportanceSamplingTrainer',
+        trainer_hp_space=[
+            # (required) you need to set the trainer_hp_space properly.
+            {
+                'parameterName': 'sampled_node_sizes',
+                'type': 'NUMERICAL_LIST', 
+                "numericalType": "INTEGER",
+                "length": 4,                    # max length
+                "cutPara": ("layer_number", ),  # link with layer_number
+                "cutFunc": lambda x:x[0],       # link with layer_number
+                "minValue": [200,200,200,200],
+                "maxValue": [1000,1000,1000,1000],
+                "scalingType": "LOG"
+            },
+            ...
+        ]
+    )
+
+
+Similarly, if you want to use ``NodeClassificationNeighborSamplingTrainer``, you need to
+make sure that the hyper-parameter ``sampling_sizes`` has the same length as the number of layers
+of your model. For example:
+
+.. code-block:: python
+
+    '''
+    Suppose the layer number of your model is of the following forms:
+    {
+        'parameterName': 'layer_number',
+        'type': 'INTEGER',
+        'minValue': 2,
+        'maxValue': 4,
+        'scalingType': 'LOG'
+    }
+    '''
+
+    solver = AutoNodeClassifier(
+        graph_models=(A_MODEL_WITH_DYNAMIC_LAYERS,),
+        default_trainer='NodeClassificationNeighborSamplingTrainer',
+        trainer_hp_space=[
+            # (required) you need to set the trainer_hp_space properly.
+            {
+                'parameterName': 'sampling_sizes',
+                'type': 'NUMERICAL_LIST', 
+                "numericalType": "INTEGER",
+                "length": 4,                    # max length
+                "cutPara": ("layer_number", ),  # link with layer_number
+                "cutFunc": lambda x:x[0],       # link with layer_number
+                "minValue": [20,20,20,20],
+                "maxValue": [100,100,100,100],
+                "scalingType": "LOG"
+            },
+            ...
+        ]
+    )
 
-A brief example is demonstrated as follows:
+
+You can also pass a trainer inside the model list directly. A brief example is demonstrated as follows:
 
 .. code-block:: python
 
     ladies_sampling_trainer = NodeClassificationLayerDependentImportanceSamplingTrainer(
-        model='gcn', num_features=dataset.num_features, num_classes=dataset.num_classes,
-        ...
+        model='gcn', num_features=dataset.num_features, num_classes=dataset.num_classes, ...
     )
+
+    ladies_sampling_trainer.hyper_parameter_space = [
+        # (required) you need to set the trainer_hp_space properly.
+        {
+            'parameterName': 'sampled_node_sizes',
+            'type': 'NUMERICAL_LIST', 
+            "numericalType": "INTEGER",
+            "length": 4,                    # max length
+            "cutPara": ("num_layers", ),    # link with layer_number
+            "cutFunc": lambda x:x[0],       # link with layer_number
+            "minValue": [200,200,200,200],
+            "maxValue": [1000,1000,1000,1000],
+            "scalingType": "LOG"
+        },
+        ...
+    ]
+
     AutoNodeClassifier(graph_models=(ladies_sampling_trainer,), ...)