[DOC] Update docstrings to unify their format

2 years ago · 81afc16efc
--- a/learnware/market/heterogeneous/organizer/hetero_map/feature_extractor.py
+++ b/learnware/market/heterogeneous/organizer/hetero_map/feature_extractor.py
@@ -10,9 +10,7 @@ from transformers import BertTokenizerFast


 class WordEmbedding(nn.Module):
    """
    Encode tokens drawn from column names
    """
    """Encode tokens drawn from column names"""

    def __init__(
        self,
@@ -36,9 +34,7 @@ class WordEmbedding(nn.Module):


 class NumEmbedding(nn.Module):
    """
    Encode tokens drawn from column names and the corresponding numerical features.
    """
    """Encode tokens drawn from column names and the corresponding numerical features."""

    def __init__(self, hidden_dim):
        super().__init__()
@@ -47,9 +43,13 @@ class NumEmbedding(nn.Module):
        nn_init.uniform_(self.num_bias, a=-1 / math.sqrt(hidden_dim), b=1 / math.sqrt(hidden_dim))

    def forward(self, col_emb, x_ts) -> Tensor:
        """args:
        col_emb: numerical column embedding, (# numerical columns, emb_dim)
        x_ts: numerical features, (bs, emb_dim)
        """
        Parameters
        ----------
        col_emb : Any
            numerical column embedding, (# numerical columns, emb_dim)
        x_ts : Any
            numerical features, (bs, emb_dim)
        """
        col_emb = col_emb.unsqueeze(0).expand((x_ts.shape[0], -1, -1))
        feat_emb = col_emb * x_ts.unsqueeze(-1).float() + self.num_bias
@@ -57,10 +57,7 @@ class NumEmbedding(nn.Module):


 class FeatureTokenizer:
    """
    Process input dataframe to input indices towards encoder,
    usually used to build dataloader for paralleling loading.
    """
    """Process input dataframe to input indices towards encoder, usually used to build dataloader for paralleling loading."""

    def __init__(
        self,
@@ -68,8 +65,11 @@ class FeatureTokenizer:
        cache_dir=None,
        **kwargs,
    ):
        """args:
        disable_tokenizer_parallel: true if use extractor for collator function in torch.DataLoader
        """
        Parameters
        ----------
        disable_tokenizer_parallel : bool, optional
            true if use extractor for collator function in torch.DataLoader
        """
        self.tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased", cache_dir=cache_dir)
        self.tokenizer.__dict__["model_max_length"] = 512
@@ -95,10 +95,7 @@ class FeatureTokenizer:
                'num_col_input_ids': tensor contains numerical column tokenized ids,
            }
        """
        encoded_inputs = {
            "x_num": None,
            "num_col_input_ids": None
        }
        encoded_inputs = {"x_num": None, "num_col_input_ids": None}
        num_cols = x.columns.tolist() if not shuffle else np.random.shuffle(x.columns.tolist())
        x_num = x[num_cols].fillna(0)

@@ -194,11 +191,6 @@ class FeatureProcessor(nn.Module):
        num_att_mask=None,
        **kwargs,
    ) -> Tensor:
        """args:
        x: pd.DataFrame with column names and features.
        shuffle: if shuffle column order during the training.
        num_mask: indicate the NaN place of numerical features, 0: NaN 1: normal.
        """
        x_num = x_num.to(self.device)

        num_col_emb = self.word_embedding(num_col_input_ids.to(self.device))
--- a/learnware/market/heterogeneous/organizer/hetero_map/trainer.py
+++ b/learnware/market/heterogeneous/organizer/hetero_map/trainer.py
@@ -30,11 +30,6 @@ class Trainer:
        eval_batch_size=256,
        **kwargs,
    ):
        """args:
        train_set_list: a list of training sets [(x_1,y_1),(x_2,y_2),...]
        patience: the max number of early stop patience
        eval_less_is_better: if the set eval_metric is the less the better. For val_loss, it should be set True.
        """
        self.model = model
        if isinstance(train_set_list, tuple):
            train_set_list = [train_set_list]
@@ -129,9 +124,7 @@ class Trainer:
        return trainloader

    def _get_parameter_names(self, model, forbidden_layer_types):
        """
        Returns the names of the model parameters that are not inside a forbidden layer.
        """
        """Returns the names of the model parameters that are not inside a forbidden layer."""
        result = []
        for name, child in model.named_children():
            result += [
@@ -174,9 +167,7 @@ class TransTabCollatorForCL:
        self.num_partition = num_partition

    def __call__(self, data):
        """
        Take a list of subsets (views) from the original tests.
        """
        """Take a list of subsets (views) from the original tests."""
        # 1. build positive pairs
        # 2. encode each pair using feature extractor
        df_x = pd.concat([row for row in data])
@@ -192,15 +183,19 @@ class TransTabCollatorForCL:
        return res

    def _build_positive_pairs(self, x, n):
        """
        Builds positive pairs of sub-dataframes from the input dataframe x.

        Args:
            x (pandas.DataFrame): Input dataframe.
            n (int): Number of sub-dataframes to split x into.

        Returns:
            list: List of sub-dataframes, each containing a positive pair of columns from x.
        """Builds positive pairs of sub-dataframes from the input dataframe x.

        Parameters
        ----------
        x : pandas.DataFrame
            Input dataframe.
        n : int
            Number of sub-dataframes to split x into.

        Returns
        -------
        List
            List of sub-dataframes, each containing a positive pair of columns from x.
        """
        x_cols = x.columns.tolist()
        sub_col_list = np.array_split(np.array(x_cols), n)
@@ -217,14 +212,17 @@ class TransTabCollatorForCL:
        return sub_x_list

    def _build_positive_pairs_single_view(self, x):
        """
        Builds positive pairs for a single view of data by corrupting half of the columns and shuffling the corrupted columns.
        """Builds positive pairs for a single view of data by corrupting half of the columns and shuffling the corrupted columns.

        Args:
            x (pandas.DataFrame): The input data.
        Parameters
        ----------
        x : pandas.DataFrame
            The input data.

        Returns:
            list: A list of two pandas DataFrames, where each DataFrame contains the original data with half of the columns corrupted and shuffled.
        Returns
        -------
        List
            A list of two pandas DataFrames, where each DataFrame contains the original data with half of the columns corrupted and shuffled.
        """
        x_cols = x.columns.tolist()
        sub_x_list = [x]