From 81afc16efc299f80d279fd4e4b26f56388a51b70 Mon Sep 17 00:00:00 2001 From: bxdd Date: Tue, 14 Nov 2023 14:56:46 +0800 Subject: [PATCH] [DOC] update docstring for unify them --- .../organizer/hetero_map/feature_extractor.py | 40 ++++++--------- .../organizer/hetero_map/trainer.py | 50 +++++++++---------- 2 files changed, 40 insertions(+), 50 deletions(-) diff --git a/learnware/market/heterogeneous/organizer/hetero_map/feature_extractor.py b/learnware/market/heterogeneous/organizer/hetero_map/feature_extractor.py index 89105c0..10d390a 100644 --- a/learnware/market/heterogeneous/organizer/hetero_map/feature_extractor.py +++ b/learnware/market/heterogeneous/organizer/hetero_map/feature_extractor.py @@ -10,9 +10,7 @@ from transformers import BertTokenizerFast class WordEmbedding(nn.Module): - """ - Encode tokens drawn from column names - """ + """Encode tokens drawn from column names""" def __init__( self, @@ -36,9 +34,7 @@ class WordEmbedding(nn.Module): class NumEmbedding(nn.Module): - """ - Encode tokens drawn from column names and the corresponding numerical features. - """ + """Encode tokens drawn from column names and the corresponding numerical features.""" def __init__(self, hidden_dim): super().__init__() @@ -47,9 +43,13 @@ class NumEmbedding(nn.Module): nn_init.uniform_(self.num_bias, a=-1 / math.sqrt(hidden_dim), b=1 / math.sqrt(hidden_dim)) def forward(self, col_emb, x_ts) -> Tensor: - """args: - col_emb: numerical column embedding, (# numerical columns, emb_dim) - x_ts: numerical features, (bs, emb_dim) + """ + Parameters + ---------- + col_emb : Any + numerical column embedding, (# numerical columns, emb_dim) + x_ts : Any + numerical features, (bs, emb_dim) """ col_emb = col_emb.unsqueeze(0).expand((x_ts.shape[0], -1, -1)) feat_emb = col_emb * x_ts.unsqueeze(-1).float() + self.num_bias @@ -57,10 +57,7 @@ class NumEmbedding(nn.Module): class FeatureTokenizer: - """ - Process input dataframe to input indices towards encoder, - usually used to build dataloader for paralleling loading. - """ + """Process input dataframe to input indices towards encoder, usually used to build dataloader for paralleling loading.""" def __init__( self, @@ -68,8 +65,11 @@ class FeatureTokenizer: cache_dir=None, **kwargs, ): - """args: - disable_tokenizer_parallel: true if use extractor for collator function in torch.DataLoader + """ + Parameters + ---------- + disable_tokenizer_parallel : bool, optional + true if use extractor for collator function in torch.DataLoader """ self.tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased", cache_dir=cache_dir) self.tokenizer.__dict__["model_max_length"] = 512 @@ -95,10 +95,7 @@ class FeatureTokenizer: 'num_col_input_ids': tensor contains numerical column tokenized ids, } """ - encoded_inputs = { - "x_num": None, - "num_col_input_ids": None - } + encoded_inputs = {"x_num": None, "num_col_input_ids": None} num_cols = x.columns.tolist() if not shuffle else np.random.shuffle(x.columns.tolist()) x_num = x[num_cols].fillna(0) @@ -194,11 +191,6 @@ class FeatureProcessor(nn.Module): num_att_mask=None, **kwargs, ) -> Tensor: - """args: - x: pd.DataFrame with column names and features. - shuffle: if shuffle column order during the training. - num_mask: indicate the NaN place of numerical features, 0: NaN 1: normal. - """ x_num = x_num.to(self.device) num_col_emb = self.word_embedding(num_col_input_ids.to(self.device)) diff --git a/learnware/market/heterogeneous/organizer/hetero_map/trainer.py b/learnware/market/heterogeneous/organizer/hetero_map/trainer.py index f192b78..c4a85e6 100644 --- a/learnware/market/heterogeneous/organizer/hetero_map/trainer.py +++ b/learnware/market/heterogeneous/organizer/hetero_map/trainer.py @@ -30,11 +30,6 @@ class Trainer: eval_batch_size=256, **kwargs, ): - """args: - train_set_list: a list of training sets [(x_1,y_1),(x_2,y_2),...] - patience: the max number of early stop patience - eval_less_is_better: if the set eval_metric is the less the better. For val_loss, it should be set True. - """ self.model = model if isinstance(train_set_list, tuple): train_set_list = [train_set_list] @@ -129,9 +124,7 @@ class Trainer: return trainloader def _get_parameter_names(self, model, forbidden_layer_types): - """ - Returns the names of the model parameters that are not inside a forbidden layer. - """ + """Returns the names of the model parameters that are not inside a forbidden layer.""" result = [] for name, child in model.named_children(): result += [ @@ -174,9 +167,7 @@ class TransTabCollatorForCL: self.num_partition = num_partition def __call__(self, data): - """ - Take a list of subsets (views) from the original tests. - """ + """Take a list of subsets (views) from the original tests.""" # 1. build positive pairs # 2. encode each pair using feature extractor df_x = pd.concat([row for row in data]) @@ -192,15 +183,19 @@ class TransTabCollatorForCL: return res def _build_positive_pairs(self, x, n): - """ - Builds positive pairs of sub-dataframes from the input dataframe x. - - Args: - x (pandas.DataFrame): Input dataframe. - n (int): Number of sub-dataframes to split x into. - - Returns: - list: List of sub-dataframes, each containing a positive pair of columns from x. + """Builds positive pairs of sub-dataframes from the input dataframe x. + + Parameters + ---------- + x : pandas.DataFrame + Input dataframe. + n : int + Number of sub-dataframes to split x into. + + Returns + ------- + List + List of sub-dataframes, each containing a positive pair of columns from x. """ x_cols = x.columns.tolist() sub_col_list = np.array_split(np.array(x_cols), n) @@ -217,14 +212,17 @@ class TransTabCollatorForCL: return sub_x_list def _build_positive_pairs_single_view(self, x): - """ - Builds positive pairs for a single view of data by corrupting half of the columns and shuffling the corrupted columns. + """Builds positive pairs for a single view of data by corrupting half of the columns and shuffling the corrupted columns. - Args: - x (pandas.DataFrame): The input data. + Parameters + ---------- + x : pandas.DataFrame + The input data. - Returns: - list: A list of two pandas DataFrames, where each DataFrame contains the original data with half of the columns corrupted and shuffled. + Returns + ------- + List + A list of two pandas DataFrames, where each DataFrame contains the original data with half of the columns corrupted and shuffled. """ x_cols = x.columns.tolist() sub_x_list = [x]