| @@ -10,9 +10,7 @@ from transformers import BertTokenizerFast | |||
| class WordEmbedding(nn.Module): | |||
| """ | |||
| Encode tokens drawn from column names | |||
| """ | |||
| """Encode tokens drawn from column names""" | |||
| def __init__( | |||
| self, | |||
| @@ -36,9 +34,7 @@ class WordEmbedding(nn.Module): | |||
| class NumEmbedding(nn.Module): | |||
| """ | |||
| Encode tokens drawn from column names and the corresponding numerical features. | |||
| """ | |||
| """Encode tokens drawn from column names and the corresponding numerical features.""" | |||
| def __init__(self, hidden_dim): | |||
| super().__init__() | |||
| @@ -47,9 +43,13 @@ class NumEmbedding(nn.Module): | |||
| nn_init.uniform_(self.num_bias, a=-1 / math.sqrt(hidden_dim), b=1 / math.sqrt(hidden_dim)) | |||
| def forward(self, col_emb, x_ts) -> Tensor: | |||
| """args: | |||
| col_emb: numerical column embedding, (# numerical columns, emb_dim) | |||
| x_ts: numerical features, (bs, emb_dim) | |||
| """ | |||
| Parameters | |||
| ---------- | |||
| col_emb : Any | |||
| numerical column embedding, (# numerical columns, emb_dim) | |||
| x_ts : Any | |||
| numerical features, (bs, emb_dim) | |||
| """ | |||
| col_emb = col_emb.unsqueeze(0).expand((x_ts.shape[0], -1, -1)) | |||
| feat_emb = col_emb * x_ts.unsqueeze(-1).float() + self.num_bias | |||
| @@ -57,10 +57,7 @@ class NumEmbedding(nn.Module): | |||
| class FeatureTokenizer: | |||
| """ | |||
| Process input dataframe to input indices towards encoder, | |||
| usually used to build dataloader for paralleling loading. | |||
| """ | |||
| """Process input dataframe to input indices towards encoder, usually used to build dataloader for paralleling loading.""" | |||
| def __init__( | |||
| self, | |||
| @@ -68,8 +65,11 @@ class FeatureTokenizer: | |||
| cache_dir=None, | |||
| **kwargs, | |||
| ): | |||
| """args: | |||
| disable_tokenizer_parallel: true if use extractor for collator function in torch.DataLoader | |||
| """ | |||
| Parameters | |||
| ---------- | |||
| disable_tokenizer_parallel : bool, optional | |||
| true if use extractor for collator function in torch.DataLoader | |||
| """ | |||
| self.tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased", cache_dir=cache_dir) | |||
| self.tokenizer.__dict__["model_max_length"] = 512 | |||
| @@ -95,10 +95,7 @@ class FeatureTokenizer: | |||
| 'num_col_input_ids': tensor contains numerical column tokenized ids, | |||
| } | |||
| """ | |||
| encoded_inputs = { | |||
| "x_num": None, | |||
| "num_col_input_ids": None | |||
| } | |||
| encoded_inputs = {"x_num": None, "num_col_input_ids": None} | |||
| num_cols = x.columns.tolist() if not shuffle else np.random.shuffle(x.columns.tolist()) | |||
| x_num = x[num_cols].fillna(0) | |||
| @@ -194,11 +191,6 @@ class FeatureProcessor(nn.Module): | |||
| num_att_mask=None, | |||
| **kwargs, | |||
| ) -> Tensor: | |||
| """args: | |||
| x: pd.DataFrame with column names and features. | |||
| shuffle: if shuffle column order during the training. | |||
| num_mask: indicate the NaN place of numerical features, 0: NaN 1: normal. | |||
| """ | |||
| x_num = x_num.to(self.device) | |||
| num_col_emb = self.word_embedding(num_col_input_ids.to(self.device)) | |||
| @@ -30,11 +30,6 @@ class Trainer: | |||
| eval_batch_size=256, | |||
| **kwargs, | |||
| ): | |||
| """args: | |||
| train_set_list: a list of training sets [(x_1,y_1),(x_2,y_2),...] | |||
| patience: the max number of early stop patience | |||
| eval_less_is_better: if the set eval_metric is the less the better. For val_loss, it should be set True. | |||
| """ | |||
| self.model = model | |||
| if isinstance(train_set_list, tuple): | |||
| train_set_list = [train_set_list] | |||
| @@ -129,9 +124,7 @@ class Trainer: | |||
| return trainloader | |||
| def _get_parameter_names(self, model, forbidden_layer_types): | |||
| """ | |||
| Returns the names of the model parameters that are not inside a forbidden layer. | |||
| """ | |||
| """Returns the names of the model parameters that are not inside a forbidden layer.""" | |||
| result = [] | |||
| for name, child in model.named_children(): | |||
| result += [ | |||
| @@ -174,9 +167,7 @@ class TransTabCollatorForCL: | |||
| self.num_partition = num_partition | |||
| def __call__(self, data): | |||
| """ | |||
| Take a list of subsets (views) from the original tests. | |||
| """ | |||
| """Take a list of subsets (views) from the original tests.""" | |||
| # 1. build positive pairs | |||
| # 2. encode each pair using feature extractor | |||
| df_x = pd.concat([row for row in data]) | |||
| @@ -192,15 +183,19 @@ class TransTabCollatorForCL: | |||
| return res | |||
| def _build_positive_pairs(self, x, n): | |||
| """ | |||
| Builds positive pairs of sub-dataframes from the input dataframe x. | |||
| Args: | |||
| x (pandas.DataFrame): Input dataframe. | |||
| n (int): Number of sub-dataframes to split x into. | |||
| Returns: | |||
| list: List of sub-dataframes, each containing a positive pair of columns from x. | |||
| """Builds positive pairs of sub-dataframes from the input dataframe x. | |||
| Parameters | |||
| ---------- | |||
| x : pandas.DataFrame | |||
| Input dataframe. | |||
| n : int | |||
| Number of sub-dataframes to split x into. | |||
| Returns | |||
| ------- | |||
| List | |||
| List of sub-dataframes, each containing a positive pair of columns from x. | |||
| """ | |||
| x_cols = x.columns.tolist() | |||
| sub_col_list = np.array_split(np.array(x_cols), n) | |||
| @@ -217,14 +212,17 @@ class TransTabCollatorForCL: | |||
| return sub_x_list | |||
| def _build_positive_pairs_single_view(self, x): | |||
| """ | |||
| Builds positive pairs for a single view of data by corrupting half of the columns and shuffling the corrupted columns. | |||
| """Builds positive pairs for a single view of data by corrupting half of the columns and shuffling the corrupted columns. | |||
| Args: | |||
| x (pandas.DataFrame): The input data. | |||
| Parameters | |||
| ---------- | |||
| x : pandas.DataFrame | |||
| The input data. | |||
| Returns: | |||
| list: A list of two pandas DataFrames, where each DataFrame contains the original data with half of the columns corrupted and shuffled. | |||
| Returns | |||
| ------- | |||
| List | |||
| A list of two pandas DataFrames, where each DataFrame contains the original data with half of the columns corrupted and shuffled. | |||
| """ | |||
| x_cols = x.columns.tolist() | |||
| sub_x_list = [x] | |||