[FIX] fix bugs for feature_embedding

2 years ago · 5cc6c1bbcb
--- a/learnware/market/heterogeneous/organizer/init.py
+++ b/learnware/market/heterogeneous/organizer/init.py
@@ -1,4 +1,5 @@
 import os
 import traceback
 import pandas as pd
 from collections import defaultdict
 from typing import List, Tuple, Union
@@ -125,6 +126,7 @@ class HeteroMapTableOrganizer(EasyOrganizer):
                hetero_spec.save(save_path)

            except Exception as err:
                traceback.print_exc()
                logger.warning(f"Learnware {idx} generate HeteroMapTableSpecification failed! Due to {err}")

    def _get_hetero_learnware_ids(self, ids: Union[str, List[str]]) -> List[str]:
--- a/learnware/market/heterogeneous/organizer/hetero_map/init.py
+++ b/learnware/market/heterogeneous/organizer/hetero_map/init.py
@@ -39,7 +39,7 @@ class HeteroMap(nn.Module):
        temperature=10,
        base_temperature=10,
        activation="relu",
        device="cuda:0",
        device="cpu",
        **kwargs,
    ):
        """
@@ -174,7 +174,7 @@ class HeteroMap(nn.Module):
    def hetero_mapping(self, rkme_spec: RKMETableSpecification, features: dict) -> HeteroMapTableSpecification:
        hetero_spec = HeteroMapTableSpecification()
        data = rkme_spec.get_z()
        cols = [features.get(str(i), "") for i in range(data.shape[1])]
        cols = [features.get(str(i), "Unknown Feature") for i in range(data.shape[1])]
        hetero_input_df = pd.DataFrame(data=data, columns=cols)
        hetero_embedding = self._extract_batch_features(hetero_input_df)
        hetero_spec.generate_stat_spec_from_system(hetero_embedding, rkme_spec)
--- a/learnware/market/heterogeneous/organizer/hetero_map/feature_extractor.py
+++ b/learnware/market/heterogeneous/organizer/hetero_map/feature_extractor.py
@@ -53,6 +53,7 @@ class NumEmbedding(nn.Module):
        x_ts : Any
            numerical features, (bs, emb_dim)
        """
        print(np.array(col_emb).shape, np.array(x_ts).shape)
        col_emb = col_emb.unsqueeze(0).expand((x_ts.shape[0], -1, -1))
        feat_emb = col_emb * x_ts.unsqueeze(-1).float() + self.num_bias
        return feat_emb
@@ -99,13 +100,18 @@ class FeatureTokenizer:
            }
        """
        encoded_inputs = {"x_num": None, "num_col_input_ids": None}
        num_cols = x.columns.tolist() if not shuffle else np.random.shuffle(x.columns.tolist())
        x_num = x[num_cols].fillna(0)

        num_cols = x.columns.tolist() if not shuffle else np.random.shuffle(x.columns.tolist())
        index_cols = (
            [i for i in range(len(x.columns))] if not shuffle else np.random.shuffle([i for i in range(len(x.columns))])
        )
        num_cols = [x.columns[i] for i in index_cols]
        x_num = x.iloc(axis=1)[index_cols].fillna(0)
        if keep_input_grad:
            x_num_ts = torch.tensor(x_num.values, dtype=float, requires_grad=True)  # keep the grad
        else:
            x_num_ts = torch.tensor(x_num.values, dtype=float)

        num_col_ts = self.tokenizer(
            num_cols,
            padding=True,
@@ -195,9 +201,11 @@ class FeatureProcessor(nn.Module):
        **kwargs,
    ) -> Tensor:
        x_num = x_num.to(self.device)

        print("?1", np.array(x_num).shape, np.array(num_col_input_ids).shape)
        num_col_emb = self.word_embedding(num_col_input_ids.to(self.device))
        print("?2", np.array(x_num).shape, np.array(num_col_emb).shape)
        num_col_emb = self._avg_embedding_by_mask(num_col_emb, num_att_mask)
        print("?3", np.array(x_num).shape, np.array(num_col_emb).shape)

        num_feat_embedding = self.num_embedding(num_col_emb, x_num)
        num_feat_embedding = self.align_layer(num_feat_embedding).float()
--- a/learnware/market/heterogeneous/searcher.py
+++ b/learnware/market/heterogeneous/searcher.py
@@ -1,5 +1,5 @@
 from typing import Tuple, List

 import traceback
 from ...learnware import Learnware
 from ...logger import get_module_logger
 from ..base import BaseUserInfo
@@ -34,9 +34,8 @@ class HeteroSearcher(EasySearcher):
            return True

        except Exception as e:
            logger.warning(
                f"Invalid heterogeneous search information provided. Use homogeneous search instead. Error: {e}"
            )
            traceback.print_exc()
            logger.warning(f"Invalid heterogeneous search information provided. Use homogeneous search instead.")
            return False

    def __call__(
--- a/learnware/reuse/feature_augment.py
+++ b/learnware/reuse/feature_augment.py
@@ -12,8 +12,8 @@ class FeatureAugmentReuser(BaseReuser):
    FeatureAugmentReuser is a class for augmenting features using predictions of a given learnware model and applying regression or classification on the augmented dataset.

    This class supports two modes:
    - "regression": Uses RidgeCV for regression tasks.
    - "classification": Uses LogisticRegressionCV for classification tasks.
        - "regression": Uses RidgeCV for regression tasks.
        - "classification": Uses LogisticRegressionCV for classification tasks.
    """

    def __init__(self, learnware_list: List[Learnware] = None, mode: str = None):