Browse Source

[DOC] Update docstrings to unify their style

tags/v0.3.2
bxdd 2 years ago
parent
commit
81afc16efc
2 changed files with 40 additions and 50 deletions
  1. +16
    -24
      learnware/market/heterogeneous/organizer/hetero_map/feature_extractor.py
  2. +24
    -26
      learnware/market/heterogeneous/organizer/hetero_map/trainer.py

+ 16
- 24
learnware/market/heterogeneous/organizer/hetero_map/feature_extractor.py View File

@@ -10,9 +10,7 @@ from transformers import BertTokenizerFast


class WordEmbedding(nn.Module):
"""
Encode tokens drawn from column names
"""
"""Encode tokens drawn from column names"""

def __init__(
self,
@@ -36,9 +34,7 @@ class WordEmbedding(nn.Module):


class NumEmbedding(nn.Module):
"""
Encode tokens drawn from column names and the corresponding numerical features.
"""
"""Encode tokens drawn from column names and the corresponding numerical features."""

def __init__(self, hidden_dim):
super().__init__()
@@ -47,9 +43,13 @@ class NumEmbedding(nn.Module):
nn_init.uniform_(self.num_bias, a=-1 / math.sqrt(hidden_dim), b=1 / math.sqrt(hidden_dim))

def forward(self, col_emb, x_ts) -> Tensor:
"""args:
col_emb: numerical column embedding, (# numerical columns, emb_dim)
x_ts: numerical features, (bs, emb_dim)
"""
Parameters
----------
col_emb : Any
numerical column embedding, (# numerical columns, emb_dim)
x_ts : Any
numerical features, (bs, # numerical columns)
"""
col_emb = col_emb.unsqueeze(0).expand((x_ts.shape[0], -1, -1))
feat_emb = col_emb * x_ts.unsqueeze(-1).float() + self.num_bias
@@ -57,10 +57,7 @@ class NumEmbedding(nn.Module):


class FeatureTokenizer:
"""
Process input dataframe to input indices towards encoder,
usually used to build dataloader for paralleling loading.
"""
"""Process input dataframe to input indices towards encoder, usually used to build dataloader for parallel loading."""

def __init__(
self,
@@ -68,8 +65,11 @@ class FeatureTokenizer:
cache_dir=None,
**kwargs,
):
"""args:
disable_tokenizer_parallel: true if use extractor for collator function in torch.DataLoader
"""
Parameters
----------
disable_tokenizer_parallel : bool, optional
True if the extractor is used as the collator function in torch.DataLoader
"""
self.tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased", cache_dir=cache_dir)
self.tokenizer.__dict__["model_max_length"] = 512
@@ -95,10 +95,7 @@ class FeatureTokenizer:
'num_col_input_ids': tensor contains numerical column tokenized ids,
}
"""
encoded_inputs = {
"x_num": None,
"num_col_input_ids": None
}
encoded_inputs = {"x_num": None, "num_col_input_ids": None}
num_cols = x.columns.tolist() if not shuffle else np.random.shuffle(x.columns.tolist())
x_num = x[num_cols].fillna(0)

@@ -194,11 +191,6 @@ class FeatureProcessor(nn.Module):
num_att_mask=None,
**kwargs,
) -> Tensor:
"""args:
x: pd.DataFrame with column names and features.
shuffle: if shuffle column order during the training.
num_mask: indicate the NaN place of numerical features, 0: NaN 1: normal.
"""
x_num = x_num.to(self.device)

num_col_emb = self.word_embedding(num_col_input_ids.to(self.device))


+ 24
- 26
learnware/market/heterogeneous/organizer/hetero_map/trainer.py View File

@@ -30,11 +30,6 @@ class Trainer:
eval_batch_size=256,
**kwargs,
):
"""args:
train_set_list: a list of training sets [(x_1,y_1),(x_2,y_2),...]
patience: the max number of early stop patience
eval_less_is_better: if the set eval_metric is the less the better. For val_loss, it should be set True.
"""
self.model = model
if isinstance(train_set_list, tuple):
train_set_list = [train_set_list]
@@ -129,9 +124,7 @@ class Trainer:
return trainloader

def _get_parameter_names(self, model, forbidden_layer_types):
"""
Returns the names of the model parameters that are not inside a forbidden layer.
"""
"""Returns the names of the model parameters that are not inside a forbidden layer."""
result = []
for name, child in model.named_children():
result += [
@@ -174,9 +167,7 @@ class TransTabCollatorForCL:
self.num_partition = num_partition

def __call__(self, data):
"""
Take a list of subsets (views) from the original tests.
"""
"""Take a list of subsets (views) from the original tests."""
# 1. build positive pairs
# 2. encode each pair using feature extractor
df_x = pd.concat([row for row in data])
@@ -192,15 +183,19 @@ class TransTabCollatorForCL:
return res

def _build_positive_pairs(self, x, n):
"""
Builds positive pairs of sub-dataframes from the input dataframe x.

Args:
x (pandas.DataFrame): Input dataframe.
n (int): Number of sub-dataframes to split x into.

Returns:
list: List of sub-dataframes, each containing a positive pair of columns from x.
"""Builds positive pairs of sub-dataframes from the input dataframe x.

Parameters
----------
x : pandas.DataFrame
Input dataframe.
n : int
Number of sub-dataframes to split x into.

Returns
-------
List
List of sub-dataframes, each containing a positive pair of columns from x.
"""
x_cols = x.columns.tolist()
sub_col_list = np.array_split(np.array(x_cols), n)
@@ -217,14 +212,17 @@ class TransTabCollatorForCL:
return sub_x_list

def _build_positive_pairs_single_view(self, x):
"""
Builds positive pairs for a single view of data by corrupting half of the columns and shuffling the corrupted columns.
"""Builds positive pairs for a single view of data by corrupting half of the columns and shuffling the corrupted columns.

Args:
x (pandas.DataFrame): The input data.
Parameters
----------
x : pandas.DataFrame
The input data.

Returns:
list: A list of two pandas DataFrames, where each DataFrame contains the original data with half of the columns corrupted and shuffled.
Returns
-------
List
A list of two pandas DataFrames, where each DataFrame contains the original data with half of the columns corrupted and shuffled.
"""
x_cols = x.columns.tolist()
sub_x_list = [x]


Loading…
Cancel
Save