From 112f853abfb16b12c7206c50929477c279caa632 Mon Sep 17 00:00:00 2001 From: Peng Tan Date: Sun, 5 Nov 2023 21:16:32 +0800 Subject: [PATCH 01/90] =?UTF-8?q?=E2=9C=A8=20feat(reuse):=20add=20two=20re?= =?UTF-8?q?user?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit add feature_augment_reuser & hetero_reuser --- learnware/reuse/feature_augment_reuser.py | 46 +++++ learnware/reuse/hetero_reuser/__init__.py | 26 +++ .../reuse/hetero_reuser/feature_alignment.py | 177 ++++++++++++++++++ 3 files changed, 249 insertions(+) create mode 100644 learnware/reuse/feature_augment_reuser.py create mode 100644 learnware/reuse/hetero_reuser/__init__.py create mode 100644 learnware/reuse/hetero_reuser/feature_alignment.py diff --git a/learnware/reuse/feature_augment_reuser.py b/learnware/reuse/feature_augment_reuser.py new file mode 100644 index 0000000..2f1c835 --- /dev/null +++ b/learnware/reuse/feature_augment_reuser.py @@ -0,0 +1,46 @@ +from typing import List +import numpy as np + +from sklearn.linear_model import RidgeCV + +from .base import BaseReuser +from learnware.learnware import Learnware + + +class FeatureAugmentReuser(BaseReuser): + def __init__(self, learnware: Learnware = None, task_type: str = None): + self.learnware=learnware + assert task_type in ["classification", "regression"] + self.task_type=task_type + + def predict(self, x_test: np.ndarray) -> np.ndarray: + x_test=self._fill_data(x_test) + y_pred=self.learnware.predict(x_test) + x_test_aug=np.concatenate((x_test, y_pred.reshape(-1, 1)), axis=1) + y_pred_aug=self.output_aligner.predict(x_test_aug) + return y_pred_aug + + def fit(self, x_train, y_train): + x_train=self._fill_data(x_train) + y_pred=self.learnware.predict(x_train) + x_train_aug=np.concatenate((x_train, y_pred.reshape(-1, 1)), axis=1) + if self.task_type=="regression": + alpha_list = [0.01, 0.1, 1.0, 10, 100] + ridge_cv = RidgeCV(alphas=alpha_list, store_cv_values=True) + ridge_cv.fit(x_train_aug, y_train) 
+ self.output_aligner=ridge_cv + elif self.task_type=="classification": + raise NotImplementedError("Not implemented yet!") + + def _fill_data(self, X: np.ndarray): + X[np.isinf(X) | np.isneginf(X) | np.isposinf(X) | np.isneginf(X)] = np.nan + if np.any(np.isnan(X)): + for col in range(X.shape[1]): + is_nan = np.isnan(X[:, col]) + if np.any(is_nan): + if np.all(is_nan): + raise ValueError(f"All values in column {col} are exceptional, e.g., NaN and Inf.") + # Fill np.nan with np.nanmean + col_mean = np.nanmean(X[:, col]) + X[:, col] = np.where(is_nan, col_mean, X[:, col]) + return X \ No newline at end of file diff --git a/learnware/reuse/hetero_reuser/__init__.py b/learnware/reuse/hetero_reuser/__init__.py new file mode 100644 index 0000000..43523df --- /dev/null +++ b/learnware/reuse/hetero_reuser/__init__.py @@ -0,0 +1,26 @@ +from learnware.learnware import Learnware +from learnware.reuse.base import BaseReuser +from .feature_alignment import FeatureAligner +from ..feature_augment_reuser import FeatureAugmentReuser + + +class HeteroMapTableReuser(BaseReuser): + + def __init__(self, learnware: Learnware = None, task_type: str = None, cuda_idx=0, **align_arguments): + self.learnware=learnware + assert task_type in ["classification", "regression"] + self.task_type=task_type + self.cuda_idx=cuda_idx + self.align_arguments=align_arguments + + def fit(self, user_rkme): + self.feature_aligner=FeatureAligner(learnware=self.learnware, task_type=self.task_type, cuda_idx=self.cuda_idx, **self.align_arguments) + self.feature_aligner.fit(user_rkme) + self.reuser=self.feature_aligner + + def finetune(self, x_train,y_train): + self.reuser=FeatureAugmentReuser(learnware=self.feature_aligner, task_type=self.task_type) + self.reuser.fit(x_train, y_train) + + def predict(self, user_data): + return self.reuser.predict(user_data) \ No newline at end of file diff --git a/learnware/reuse/hetero_reuser/feature_alignment.py b/learnware/reuse/hetero_reuser/feature_alignment.py new file 
mode 100644 index 0000000..4cbe71d --- /dev/null +++ b/learnware/reuse/hetero_reuser/feature_alignment.py @@ -0,0 +1,177 @@ +from typing import List, Any +import numpy as np +from numpy import ndarray +import torch.nn as nn +import torch.nn.functional as F +import torch +import time +from tqdm import trange +from loguru import logger + +from learnware.learnware import Learnware +from learnware.specification import RKMEStatSpecification +from learnware.specification.regular.table.rkme import choose_device + +from ..base import BaseReuser + + +class FeatureAligner(BaseReuser): + + def __init__(self, learnware: Learnware = None, task_type: str = None, cuda_idx=0, **align_arguments): + self.learnware=learnware + assert task_type in ["classification", "regression"] + self.task_type=task_type + self.align_arguments=align_arguments + self.cuda_idx=cuda_idx + self.device = choose_device(cuda_idx=cuda_idx) + + def fit(self, user_rkme): + target_rkme=self.learnware.specification.get_stat_spec()["RKMEStatSpecification"] + trainer=FeatureAlignmentTrainer(target_rkme=target_rkme, user_rkme=user_rkme, cuda_idx=self.cuda_idx, **self.align_arguments) + self.align_model=trainer.model + self.align_model.eval() + + def predict(self, user_data: ndarray) -> ndarray: + user_data=self._fill_data(user_data) + transformed_user_data=self.align_model(torch.tensor(user_data, device=self.device).float()).detach().cpu().numpy() + y_pred=self.learnware.predict(transformed_user_data) + return y_pred + + def _fill_data(self, X: np.ndarray): + X[np.isinf(X) | np.isneginf(X) | np.isposinf(X) | np.isneginf(X)] = np.nan + if np.any(np.isnan(X)): + for col in range(X.shape[1]): + is_nan = np.isnan(X[:, col]) + if np.any(is_nan): + if np.all(is_nan): + raise ValueError(f"All values in column {col} are exceptional, e.g., NaN and Inf.") + # Fill np.nan with np.nanmean + col_mean = np.nanmean(X[:, col]) + X[:, col] = np.where(is_nan, col_mean, X[:, col]) + return X + + +class 
FeatureAlignmentModel(nn.Module): + + def __init__(self, input_dim, output_dim, hidden_dims=[1024], activation="relu", dropout_ratio=0, use_bn=False): + super().__init__() + dims = [input_dim] + hidden_dims + [output_dim] + self.fc_list = nn.ModuleList() + self.drop_list = nn.ModuleList() + + if len(hidden_dims) > 0: + for i in range(len(dims) - 2): + self.drop_list.append(nn.Dropout(dropout_ratio)) + if use_bn: + self.fc_list.append(nn.Sequential(nn.Linear(dims[i], dims[i + 1]), nn.BatchNorm1d(dims[i + 1]))) + else: + self.fc_list.append(nn.Linear(dims[i], dims[i + 1])) + + self.final_fc = nn.Linear(dims[-2], dims[-1]) + + if activation == "gelu": + self.activation = F.gelu + elif activation == "selu": + self.activation = F.selu + elif activation == "leakyrelu": + self.activation = F.leaky_relu + else: + self.activation = F.relu + + def forward(self, x): + if len(self.fc_list) > 0: + for fc, drop in zip(self.fc_list, self.drop_list): + x = fc(x) + x = self.activation(x) + x = drop(x) + return self.final_fc(x) + + +class FeatureAlignmentTrainer(): + + def __init__( + self, + target_rkme: RKMEStatSpecification, # (X, weight) + user_rkme: RKMEStatSpecification, # (X, weight) + extra_labeled_data: Any = None, + target_learnware: Learnware = None, + num_epoch: int = 50, + lr: float = 1e-3, + gamma: float = 0.1, + network_type: str = "ArbitraryMapping", + optimizer_type: str = "Adam", + hidden_dims: List[int] = [1024], + activation: str = "relu", + dropout_ratio: float = 0, + use_bn: bool = False, + const: float = 1e1, + cuda_idx: int = 0 + ): + """Training the base mapping network + """ + self.target_rkme = target_rkme + self.user_rkme = user_rkme + self.args = { + "lr": lr, + "num_epoch": num_epoch, + "gamma": gamma, + "hidden_dims": hidden_dims, + "activation": activation, + "dropout_ratio": dropout_ratio, + "use_bn": use_bn, + } + self.network_type = network_type + self.optimizer_type = optimizer_type + self.const=const + self.device = 
choose_device(cuda_idx=cuda_idx) + if extra_labeled_data is not None and target_learnware is not None: + self.train_with_labeled_data(extra_labeled_data[0], extra_labeled_data[1], target_learnware) + else: + self.train() + + def gaussian_kernel(self, x1, x2): + x1 = x1.double() + x2 = x2.double() + X12norm = torch.sum(x1**2, 1, keepdim=True) - 2 * x1 @ x2.T + torch.sum(x2**2, 1, keepdim=True).T + return torch.exp(-X12norm * self.args["gamma"]) + + def compute_mmd(self, user_X, user_weight, target_X, target_weight): + term1 = torch.sum(self.gaussian_kernel(user_X, user_X) * (user_weight.T @ user_weight)) + term2 = torch.sum(self.gaussian_kernel(user_X, target_X) * (user_weight.T @ target_weight)) + term3 = torch.sum(self.gaussian_kernel(target_X, target_X) * (target_weight.T @ target_weight)) + return term1 - 2 * term2 + term3 + + def train(self): + args = self.args + input_dim = self.user_rkme.get_z().shape[1] + output_dim = self.target_rkme.get_z().shape[1] + + user_model=FeatureAlignmentModel(input_dim, output_dim, args["hidden_dims"], args["activation"], args["dropout_ratio"], args["use_bn"]) + + # device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + user_model.to(self.device) + user_data_x = torch.tensor(self.user_rkme.get_z(), device=self.device).float() + user_data_weight = torch.tensor(self.user_rkme.get_beta(), device=self.device).view(1, -1).double() + target_data_x = torch.tensor(self.target_rkme.get_z(), device=self.device) + target_data_weight = torch.tensor(self.target_rkme.get_beta(), device=self.device).view(1, -1).double() + if self.optimizer_type == "Adam": + optimizer = torch.optim.Adam(user_model.parameters(), lr=args["lr"]) + else: + optimizer = torch.optim.SGD(user_model.parameters(), lr=args["lr"]) + + start_time = time.time() + for epoch in trange(args["num_epoch"], desc="Epoch"): + transformed_user_data_x = user_model(user_data_x) + mmd_loss = self.compute_mmd(transformed_user_data_x, user_data_weight, target_data_x, 
target_data_weight) + + optimizer.zero_grad() + mmd_loss.backward() + optimizer.step() + logger.info( + "epoch: {}, train mmd_loss: {:.4f}, lr: {:.6f}, spent: {:.1f} secs".format( + epoch, mmd_loss.item(), optimizer.param_groups[0]["lr"], time.time() - start_time + ) + ) + + self.model = user_model + logger.info("training complete, cost {:.1f} secs.".format(time.time() - start_time)) \ No newline at end of file From 98f4f05efaa572468f59ce49ff2ff6f67109d37a Mon Sep 17 00:00:00 2001 From: Peng Tan Date: Sun, 5 Nov 2023 21:28:44 +0800 Subject: [PATCH 02/90] =?UTF-8?q?=F0=9F=A6=84=20refactor(reuse):=20modify?= =?UTF-8?q?=20init.py?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit add two reusers to __init__.py --- learnware/reuse/__init__.py | 2 ++ setup.py | 50 ++++++++++++++++++------------------- 2 files changed, 27 insertions(+), 25 deletions(-) diff --git a/learnware/reuse/__init__.py b/learnware/reuse/__init__.py index 0ea82c5..8e9621b 100644 --- a/learnware/reuse/__init__.py +++ b/learnware/reuse/__init__.py @@ -1,3 +1,5 @@ from .ensemble_pruning import EnsemblePruningReuser from .averaging import AveragingReuser from .job_selector import JobSelectorReuser +from .feature_augment_reuser import FeatureAugmentReuser +from .hetero_reuser import HeteroMapTableReuser \ No newline at end of file diff --git a/setup.py b/setup.py index 67f7254..b37dfab 100644 --- a/setup.py +++ b/setup.py @@ -51,31 +51,31 @@ def get_platform(): # What packages are required for this module to be executed? # `estimator` may depend on other packages. In order to reduce dependencies, it is not written here. 
REQUIRED = [ - "numpy>=1.20.0", - "pandas>=0.25.1", - "scipy>=1.0.0", - "matplotlib>=3.1.3", - "torch>=1.11.0", - "cvxopt>=1.3.0", - "tqdm>=4.65.0", - "scikit-learn>=0.22", - "joblib>=1.2.0", - "pyyaml>=6.0", - "fire>=0.3.1", - "lightgbm>=3.3.0", - "psutil>=5.9.4", - "torchvision>=0.15.1", - "sqlalchemy>=2.0.21", - "shortuuid>=1.0.11", - "geatpy>=2.7.0", - "docker>=6.1.3", - "rapidfuzz>=3.4.0", - "torchtext>=0.16.0", - "sentence_transformers>=2.2.2", - "torch-optimizer>=0.3.0", - "langdetect>=1.0.9", - "huggingface-hub<0.18", - "portalocker>=2.0.0", + # "numpy>=1.20.0", + # "pandas>=0.25.1", + # "scipy>=1.0.0", + # "matplotlib>=3.1.3", + # "torch>=1.11.0", + # "cvxopt>=1.3.0", + # "tqdm>=4.65.0", + # "scikit-learn>=0.22", + # "joblib>=1.2.0", + # "pyyaml>=6.0", + # "fire>=0.3.1", + # "lightgbm>=3.3.0", + # "psutil>=5.9.4", + # "torchvision>=0.15.1", + # "sqlalchemy>=2.0.21", + # "shortuuid>=1.0.11", + # "geatpy>=2.7.0", + # "docker>=6.1.3", + # "rapidfuzz>=3.4.0", + # "torchtext>=0.16.0", + # "sentence_transformers>=2.2.2", + # "torch-optimizer>=0.3.0", + # "langdetect>=1.0.9", + # "huggingface-hub<0.18", + # "portalocker>=2.0.0", ] if get_platform() != MACOS: From 4ac77b50a529257110647e9cae30349e600657d8 Mon Sep 17 00:00:00 2001 From: Peng Tan Date: Sun, 5 Nov 2023 22:04:00 +0800 Subject: [PATCH 03/90] [ENH] add two reusers (FeatureAugmentReuser, HeteroMapTableReuser) --- learnware/reuse/__init__.py | 2 + learnware/reuse/feature_augment_reuser.py | 46 +++++ learnware/reuse/hetero_reuser/__init__.py | 26 +++ .../reuse/hetero_reuser/feature_alignment.py | 177 ++++++++++++++++++ setup.py | 50 ++--- 5 files changed, 276 insertions(+), 25 deletions(-) create mode 100644 learnware/reuse/feature_augment_reuser.py create mode 100644 learnware/reuse/hetero_reuser/__init__.py create mode 100644 learnware/reuse/hetero_reuser/feature_alignment.py diff --git a/learnware/reuse/__init__.py b/learnware/reuse/__init__.py index 0ea82c5..8e9621b 100644 --- 
a/learnware/reuse/__init__.py +++ b/learnware/reuse/__init__.py @@ -1,3 +1,5 @@ from .ensemble_pruning import EnsemblePruningReuser from .averaging import AveragingReuser from .job_selector import JobSelectorReuser +from .feature_augment_reuser import FeatureAugmentReuser +from .hetero_reuser import HeteroMapTableReuser \ No newline at end of file diff --git a/learnware/reuse/feature_augment_reuser.py b/learnware/reuse/feature_augment_reuser.py new file mode 100644 index 0000000..2f1c835 --- /dev/null +++ b/learnware/reuse/feature_augment_reuser.py @@ -0,0 +1,46 @@ +from typing import List +import numpy as np + +from sklearn.linear_model import RidgeCV + +from .base import BaseReuser +from learnware.learnware import Learnware + + +class FeatureAugmentReuser(BaseReuser): + def __init__(self, learnware: Learnware = None, task_type: str = None): + self.learnware=learnware + assert task_type in ["classification", "regression"] + self.task_type=task_type + + def predict(self, x_test: np.ndarray) -> np.ndarray: + x_test=self._fill_data(x_test) + y_pred=self.learnware.predict(x_test) + x_test_aug=np.concatenate((x_test, y_pred.reshape(-1, 1)), axis=1) + y_pred_aug=self.output_aligner.predict(x_test_aug) + return y_pred_aug + + def fit(self, x_train, y_train): + x_train=self._fill_data(x_train) + y_pred=self.learnware.predict(x_train) + x_train_aug=np.concatenate((x_train, y_pred.reshape(-1, 1)), axis=1) + if self.task_type=="regression": + alpha_list = [0.01, 0.1, 1.0, 10, 100] + ridge_cv = RidgeCV(alphas=alpha_list, store_cv_values=True) + ridge_cv.fit(x_train_aug, y_train) + self.output_aligner=ridge_cv + elif self.task_type=="classification": + raise NotImplementedError("Not implemented yet!") + + def _fill_data(self, X: np.ndarray): + X[np.isinf(X) | np.isneginf(X) | np.isposinf(X) | np.isneginf(X)] = np.nan + if np.any(np.isnan(X)): + for col in range(X.shape[1]): + is_nan = np.isnan(X[:, col]) + if np.any(is_nan): + if np.all(is_nan): + raise ValueError(f"All values 
in column {col} are exceptional, e.g., NaN and Inf.") + # Fill np.nan with np.nanmean + col_mean = np.nanmean(X[:, col]) + X[:, col] = np.where(is_nan, col_mean, X[:, col]) + return X \ No newline at end of file diff --git a/learnware/reuse/hetero_reuser/__init__.py b/learnware/reuse/hetero_reuser/__init__.py new file mode 100644 index 0000000..43523df --- /dev/null +++ b/learnware/reuse/hetero_reuser/__init__.py @@ -0,0 +1,26 @@ +from learnware.learnware import Learnware +from learnware.reuse.base import BaseReuser +from .feature_alignment import FeatureAligner +from ..feature_augment_reuser import FeatureAugmentReuser + + +class HeteroMapTableReuser(BaseReuser): + + def __init__(self, learnware: Learnware = None, task_type: str = None, cuda_idx=0, **align_arguments): + self.learnware=learnware + assert task_type in ["classification", "regression"] + self.task_type=task_type + self.cuda_idx=cuda_idx + self.align_arguments=align_arguments + + def fit(self, user_rkme): + self.feature_aligner=FeatureAligner(learnware=self.learnware, task_type=self.task_type, cuda_idx=self.cuda_idx, **self.align_arguments) + self.feature_aligner.fit(user_rkme) + self.reuser=self.feature_aligner + + def finetune(self, x_train,y_train): + self.reuser=FeatureAugmentReuser(learnware=self.feature_aligner, task_type=self.task_type) + self.reuser.fit(x_train, y_train) + + def predict(self, user_data): + return self.reuser.predict(user_data) \ No newline at end of file diff --git a/learnware/reuse/hetero_reuser/feature_alignment.py b/learnware/reuse/hetero_reuser/feature_alignment.py new file mode 100644 index 0000000..4cbe71d --- /dev/null +++ b/learnware/reuse/hetero_reuser/feature_alignment.py @@ -0,0 +1,177 @@ +from typing import List, Any +import numpy as np +from numpy import ndarray +import torch.nn as nn +import torch.nn.functional as F +import torch +import time +from tqdm import trange +from loguru import logger + +from learnware.learnware import Learnware +from 
learnware.specification import RKMEStatSpecification +from learnware.specification.regular.table.rkme import choose_device + +from ..base import BaseReuser + + +class FeatureAligner(BaseReuser): + + def __init__(self, learnware: Learnware = None, task_type: str = None, cuda_idx=0, **align_arguments): + self.learnware=learnware + assert task_type in ["classification", "regression"] + self.task_type=task_type + self.align_arguments=align_arguments + self.cuda_idx=cuda_idx + self.device = choose_device(cuda_idx=cuda_idx) + + def fit(self, user_rkme): + target_rkme=self.learnware.specification.get_stat_spec()["RKMEStatSpecification"] + trainer=FeatureAlignmentTrainer(target_rkme=target_rkme, user_rkme=user_rkme, cuda_idx=self.cuda_idx, **self.align_arguments) + self.align_model=trainer.model + self.align_model.eval() + + def predict(self, user_data: ndarray) -> ndarray: + user_data=self._fill_data(user_data) + transformed_user_data=self.align_model(torch.tensor(user_data, device=self.device).float()).detach().cpu().numpy() + y_pred=self.learnware.predict(transformed_user_data) + return y_pred + + def _fill_data(self, X: np.ndarray): + X[np.isinf(X) | np.isneginf(X) | np.isposinf(X) | np.isneginf(X)] = np.nan + if np.any(np.isnan(X)): + for col in range(X.shape[1]): + is_nan = np.isnan(X[:, col]) + if np.any(is_nan): + if np.all(is_nan): + raise ValueError(f"All values in column {col} are exceptional, e.g., NaN and Inf.") + # Fill np.nan with np.nanmean + col_mean = np.nanmean(X[:, col]) + X[:, col] = np.where(is_nan, col_mean, X[:, col]) + return X + + +class FeatureAlignmentModel(nn.Module): + + def __init__(self, input_dim, output_dim, hidden_dims=[1024], activation="relu", dropout_ratio=0, use_bn=False): + super().__init__() + dims = [input_dim] + hidden_dims + [output_dim] + self.fc_list = nn.ModuleList() + self.drop_list = nn.ModuleList() + + if len(hidden_dims) > 0: + for i in range(len(dims) - 2): + self.drop_list.append(nn.Dropout(dropout_ratio)) + if use_bn: + 
self.fc_list.append(nn.Sequential(nn.Linear(dims[i], dims[i + 1]), nn.BatchNorm1d(dims[i + 1]))) + else: + self.fc_list.append(nn.Linear(dims[i], dims[i + 1])) + + self.final_fc = nn.Linear(dims[-2], dims[-1]) + + if activation == "gelu": + self.activation = F.gelu + elif activation == "selu": + self.activation = F.selu + elif activation == "leakyrelu": + self.activation = F.leaky_relu + else: + self.activation = F.relu + + def forward(self, x): + if len(self.fc_list) > 0: + for fc, drop in zip(self.fc_list, self.drop_list): + x = fc(x) + x = self.activation(x) + x = drop(x) + return self.final_fc(x) + + +class FeatureAlignmentTrainer(): + + def __init__( + self, + target_rkme: RKMEStatSpecification, # (X, weight) + user_rkme: RKMEStatSpecification, # (X, weight) + extra_labeled_data: Any = None, + target_learnware: Learnware = None, + num_epoch: int = 50, + lr: float = 1e-3, + gamma: float = 0.1, + network_type: str = "ArbitraryMapping", + optimizer_type: str = "Adam", + hidden_dims: List[int] = [1024], + activation: str = "relu", + dropout_ratio: float = 0, + use_bn: bool = False, + const: float = 1e1, + cuda_idx: int = 0 + ): + """Training the base mapping network + """ + self.target_rkme = target_rkme + self.user_rkme = user_rkme + self.args = { + "lr": lr, + "num_epoch": num_epoch, + "gamma": gamma, + "hidden_dims": hidden_dims, + "activation": activation, + "dropout_ratio": dropout_ratio, + "use_bn": use_bn, + } + self.network_type = network_type + self.optimizer_type = optimizer_type + self.const=const + self.device = choose_device(cuda_idx=cuda_idx) + if extra_labeled_data is not None and target_learnware is not None: + self.train_with_labeled_data(extra_labeled_data[0], extra_labeled_data[1], target_learnware) + else: + self.train() + + def gaussian_kernel(self, x1, x2): + x1 = x1.double() + x2 = x2.double() + X12norm = torch.sum(x1**2, 1, keepdim=True) - 2 * x1 @ x2.T + torch.sum(x2**2, 1, keepdim=True).T + return torch.exp(-X12norm * self.args["gamma"]) 
+ + def compute_mmd(self, user_X, user_weight, target_X, target_weight): + term1 = torch.sum(self.gaussian_kernel(user_X, user_X) * (user_weight.T @ user_weight)) + term2 = torch.sum(self.gaussian_kernel(user_X, target_X) * (user_weight.T @ target_weight)) + term3 = torch.sum(self.gaussian_kernel(target_X, target_X) * (target_weight.T @ target_weight)) + return term1 - 2 * term2 + term3 + + def train(self): + args = self.args + input_dim = self.user_rkme.get_z().shape[1] + output_dim = self.target_rkme.get_z().shape[1] + + user_model=FeatureAlignmentModel(input_dim, output_dim, args["hidden_dims"], args["activation"], args["dropout_ratio"], args["use_bn"]) + + # device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + user_model.to(self.device) + user_data_x = torch.tensor(self.user_rkme.get_z(), device=self.device).float() + user_data_weight = torch.tensor(self.user_rkme.get_beta(), device=self.device).view(1, -1).double() + target_data_x = torch.tensor(self.target_rkme.get_z(), device=self.device) + target_data_weight = torch.tensor(self.target_rkme.get_beta(), device=self.device).view(1, -1).double() + if self.optimizer_type == "Adam": + optimizer = torch.optim.Adam(user_model.parameters(), lr=args["lr"]) + else: + optimizer = torch.optim.SGD(user_model.parameters(), lr=args["lr"]) + + start_time = time.time() + for epoch in trange(args["num_epoch"], desc="Epoch"): + transformed_user_data_x = user_model(user_data_x) + mmd_loss = self.compute_mmd(transformed_user_data_x, user_data_weight, target_data_x, target_data_weight) + + optimizer.zero_grad() + mmd_loss.backward() + optimizer.step() + logger.info( + "epoch: {}, train mmd_loss: {:.4f}, lr: {:.6f}, spent: {:.1f} secs".format( + epoch, mmd_loss.item(), optimizer.param_groups[0]["lr"], time.time() - start_time + ) + ) + + self.model = user_model + logger.info("training complete, cost {:.1f} secs.".format(time.time() - start_time)) \ No newline at end of file diff --git a/setup.py b/setup.py index 
67f7254..b37dfab 100644 --- a/setup.py +++ b/setup.py @@ -51,31 +51,31 @@ def get_platform(): # What packages are required for this module to be executed? # `estimator` may depend on other packages. In order to reduce dependencies, it is not written here. REQUIRED = [ - "numpy>=1.20.0", - "pandas>=0.25.1", - "scipy>=1.0.0", - "matplotlib>=3.1.3", - "torch>=1.11.0", - "cvxopt>=1.3.0", - "tqdm>=4.65.0", - "scikit-learn>=0.22", - "joblib>=1.2.0", - "pyyaml>=6.0", - "fire>=0.3.1", - "lightgbm>=3.3.0", - "psutil>=5.9.4", - "torchvision>=0.15.1", - "sqlalchemy>=2.0.21", - "shortuuid>=1.0.11", - "geatpy>=2.7.0", - "docker>=6.1.3", - "rapidfuzz>=3.4.0", - "torchtext>=0.16.0", - "sentence_transformers>=2.2.2", - "torch-optimizer>=0.3.0", - "langdetect>=1.0.9", - "huggingface-hub<0.18", - "portalocker>=2.0.0", + # "numpy>=1.20.0", + # "pandas>=0.25.1", + # "scipy>=1.0.0", + # "matplotlib>=3.1.3", + # "torch>=1.11.0", + # "cvxopt>=1.3.0", + # "tqdm>=4.65.0", + # "scikit-learn>=0.22", + # "joblib>=1.2.0", + # "pyyaml>=6.0", + # "fire>=0.3.1", + # "lightgbm>=3.3.0", + # "psutil>=5.9.4", + # "torchvision>=0.15.1", + # "sqlalchemy>=2.0.21", + # "shortuuid>=1.0.11", + # "geatpy>=2.7.0", + # "docker>=6.1.3", + # "rapidfuzz>=3.4.0", + # "torchtext>=0.16.0", + # "sentence_transformers>=2.2.2", + # "torch-optimizer>=0.3.0", + # "langdetect>=1.0.9", + # "huggingface-hub<0.18", + # "portalocker>=2.0.0", ] if get_platform() != MACOS: From 8e9cf0432ca10590542bf9af96e7dd2c800b52f4 Mon Sep 17 00:00:00 2001 From: liuht Date: Mon, 6 Nov 2023 17:00:02 +0800 Subject: [PATCH 04/90] [MNT] add HeteroSpecification --- learnware/specification/__init__.py | 1 + learnware/specification/system/__init__.py | 1 + learnware/specification/system/base.py | 16 ++++ learnware/specification/system/heter_table.py | 93 +++++++++++++++++-- 4 files changed, 101 insertions(+), 10 deletions(-) create mode 100644 learnware/specification/system/base.py diff --git a/learnware/specification/__init__.py 
b/learnware/specification/__init__.py index b27ef5b..c999210 100644 --- a/learnware/specification/__init__.py +++ b/learnware/specification/__init__.py @@ -7,3 +7,4 @@ from .regular import ( RKMEImageSpecification, RKMETextSpecification, ) +from .system import HeteroSpecification diff --git a/learnware/specification/system/__init__.py b/learnware/specification/system/__init__.py index e69de29..1a8b6ca 100644 --- a/learnware/specification/system/__init__.py +++ b/learnware/specification/system/__init__.py @@ -0,0 +1 @@ +from .heter_table import HeteroSpecification diff --git a/learnware/specification/system/base.py b/learnware/specification/system/base.py new file mode 100644 index 0000000..8369b28 --- /dev/null +++ b/learnware/specification/system/base.py @@ -0,0 +1,16 @@ +from __future__ import annotations + +from ..base import BaseStatSpecification +from loguru import logger + + +class SystemStatsSpecification(BaseStatSpecification): + def generate_stat_spec(self, **kwargs): + self.generate_stat_spec_from_system(**kwargs) + + def generate_stat_spec_from_system(self, **kwargs): + """Construct statistical specification from raw dataset + - kwargs may include the feature, label and model + - kwargs also can include hyperparameters of specific method for specifaction generation + """ + raise NotImplementedError("generate_stat_spec_from_data is not implemented") \ No newline at end of file diff --git a/learnware/specification/system/heter_table.py b/learnware/specification/system/heter_table.py index ae24a1c..e878ca6 100644 --- a/learnware/specification/system/heter_table.py +++ b/learnware/specification/system/heter_table.py @@ -1,15 +1,88 @@ -from ..base import BaseStatSpecification +from __future__ import annotations +import codecs +import copy +import json +import os -class HeterMapTableSpecification(BaseStatSpecification): - def generate_stat_spec(self, **kwargs): - pass +import numpy as np +import torch +from ..regular.table import RKMEStatSpecification +from 
..regular.table.rkme import choose_device, setup_seed, torch_rbf_kernel - def save(self, filepath: str): - pass +from .base import SystemStatsSpecification - def load(self, filepath: str): - pass +class HeteroSpecification(SystemStatsSpecification): + """Heterogeneous Embedding Specification""" - def dist(self, other_spec): - pass + def __init__(self, gamma: float = 0.1, cuda_idx: int = -1): + self.z = None + self.beta = None + self.embedding = None + self.weight = None + self.gamma = gamma + self.cuda_idx = cuda_idx + torch.cuda.empty_cache() + self.device = choose_device(cuda_idx=cuda_idx) + setup_seed(0) + super(HeteroSpecification, self).__init__(type=self.__class__.__name__) + + def get_z(self) -> np.ndarray: + return self.z.detach().cpu().numpy + + def get_beta(self) -> np.ndarray: + return self.beta.detach().cpu().numpy + + def generate_stat_spec_from_system(self, heter_embedding: np.ndarray, rkme_spec: RKMEStatSpecification): + self.beta = rkme_spec.beta.to(self.device) + self.z = torch.from_numpy(heter_embedding).double().to(self.device) + + def inner_prod(self, Embed2: HeteroSpecification) -> float: + beta_1 = self.beta.reshape(1, -1).double().to(self.device) + beta_2 = Embed2.beta.reshape(1, -1).double().to(self.device) + Z1 = self.z.double().reshape(self.z.shape[0], -1).to(self.device) + Z2 = Embed2.z.double().reshape(Embed2.z.shape[0], -1).to(self.device) + v = torch.sum(torch_rbf_kernel(Z1, Z2, self.gamma) * (beta_1.T @ beta_2)) + + return float(v) + + def dist(self, Embed2: HeteroSpecification, omit_term1: bool = False) -> float: + term1 = 0 if omit_term1 else self.inner_prod(self) + term2 = self.inner_prod(Embed2) + term3 = Embed2.inner_prod(Embed2) + + return float(term1 - 2 * term2 + term3) + + def load(self, filepath: str) -> bool: + load_path = filepath + if os.path.exists(load_path): + with codecs.open(load_path, "r", encoding="utf-8") as fin: + obj_text = fin.read() + embedding_load = json.loads(obj_text) + embedding_load["device"] = 
choose_device(embedding_load["cuda_idx"]) + embedding_load["z"] = torch.from_numpy(np.array(embedding_load["z"])) + embedding_load["beta"] = torch.from_numpy(np.array(embedding_load["beta"])) + + for d in self.__dir__(): + if d in embedding_load.keys(): + setattr(self, d, embedding_load[d]) + return True + else: + return False + + def save(self, filepath: str) -> bool: + save_path = filepath + embedding_to_save = copy.deepcopy(self.__dict__) + if torch.is_tensor(embedding_to_save["z"]): + embedding_to_save["z"] = embedding_to_save["z"].detach().cpu().numpy() + embedding_to_save["z"] = embedding_to_save["z"].tolist() + if torch.is_tensor(embedding_to_save["beta"]): + embedding_to_save["beta"] = embedding_to_save["beta"].detach().cpu().numpy() + embedding_to_save["beta"] = embedding_to_save["beta"].tolist() + embedding_to_save["device"] = "gpu" if embedding_to_save["cuda_idx"] != -1 else "cpu" + # embedding_to_save["type"] = self.type + json.dump( + embedding_to_save, + codecs.open(save_path, "w", encoding="utf-8"), + separators=(",", ":"), + ) From 0b766d59edfc8170a909e568b1759745620e410b Mon Sep 17 00:00:00 2001 From: liuht Date: Mon, 6 Nov 2023 17:00:52 +0800 Subject: [PATCH 05/90] [MNT] add HeteroMapTableOrganizer --- learnware/market/hetergeneous/database_ops.py | 176 ++++++++ .../market/hetergeneous/organizer/__init__.py | 149 +++++++ .../market/hetergeneous/organizer/config.py | 53 +++ .../organizer/hetero_mapping/__init__.py | 420 ++++++++++++++++++ .../hetero_mapping/feature_extractor.py | 263 +++++++++++ .../organizer/hetero_mapping/trainer.py | 253 +++++++++++ 6 files changed, 1314 insertions(+) create mode 100644 learnware/market/hetergeneous/database_ops.py create mode 100644 learnware/market/hetergeneous/organizer/__init__.py create mode 100644 learnware/market/hetergeneous/organizer/config.py create mode 100644 learnware/market/hetergeneous/organizer/hetero_mapping/__init__.py create mode 100644 
def create_database_if_not_exists(self, url):
    """Ensure the database behind ``url`` exists and bind ``self.engine`` to it.

    For sqlite the parent directory of the database file is created when the
    file is missing. For postgresql the server's ``postgres`` maintenance
    database is queried and the target database is created when absent. When
    the database did not exist beforehand, all tables declared on
    ``DeclarativeBase`` are created.

    Parameters
    ----------
    url: str
        SQLAlchemy database url; only ``sqlite`` and ``postgresql`` schemes
        are handled.

    Raises
    ------
    ValueError
        If ``url`` uses any other scheme.
    """
    if url.startswith("sqlite"):
        # Everything after ":///" is the database file path. An empty path or
        # ":memory:" has no file on disk and no directory to prepare.
        _, _, path = url.partition(":///")
        database_exists = bool(path) and os.path.exists(path)
        if not database_exists:
            parent_dir = os.path.dirname(path)
            if parent_dir:  # os.makedirs("") raises even with exist_ok=True
                os.makedirs(parent_dir, exist_ok=True)
    elif url.startswith("postgresql"):
        dbname_start = url.rfind("/")
        dbname = url[dbname_start + 1 :]
        # Connect to the maintenance database to inspect / create the target.
        admin_engine = create_engine(url[:dbname_start] + "/postgres")
        try:
            with admin_engine.connect() as conn:
                result = conn.execute(text("SELECT datname FROM pg_database;"))
                known_databases = {row[0].lower() for row in result.fetchall()}
                database_exists = dbname.lower() in known_databases
                if not database_exists:
                    # NOTE(review): dbname is interpolated into the statement
                    # (identifiers cannot be bound parameters); it must come
                    # from trusted configuration only.
                    conn.execution_options(isolation_level="AUTOCOMMIT").execute(
                        text("CREATE DATABASE {0};".format(dbname))
                    )
        finally:
            # Release the maintenance connection pool even if the query fails.
            admin_engine.dispose()
    else:
        raise ValueError(f"Unsupported database url: {url}")

    self.engine = create_engine(url, future=True)

    if not database_exists:
        DeclarativeBase.metadata.create_all(self.engine)
class HeteroMapTableOrganizer(BaseOrganizer):
    """Organizer that maintains a learned heterogeneous mapping for table learnwares.

    Every learnware added to the market gets a ``HeteroSpecification`` computed
    by the current ``HeteroMapping`` model; the model can be retrained on the
    RKME reduced sets of all stored learnwares.
    """

    def reload_market(self, rebuild=False, auto_update_limit=50):
        """(Re)initialize paths, database handle, learnware tables and market mapping.

        Parameters
        ----------
        rebuild: bool
            When True the learnware table and the learnware pool directory are
            cleared and a fresh ``HeteroMapping`` is created instead of loading
            the stored checkpoint.
        auto_update_limit: int
            Number of learnwares from which ``add_learnware`` starts retraining
            the mapping (only when ``auto_update`` was enabled via ``reset``).
        """
        self.market_store_path = os.path.join(conf.market_root_path, self.market_id)
        self.market_mapping_path = os.path.join(self.market_store_path, conf.market_model_path)
        self.learnware_pool_path = os.path.join(self.market_store_path, "learnware_pool")
        self.learnware_zip_pool_path = os.path.join(self.market_store_path, "zips")
        self.learnware_folder_pool_path = os.path.join(self.market_store_path, "unzipped_learnwares")
        self.learnware_list = {}  # id:learnware
        self.learnware_zip_list = {}
        self.learnware_folder_list = {}
        self.count = 0
        # default root path: ../../.learnware
        self.root_path = conf.market_root_path
        self.dbops = DatabaseOperations(conf.database_url, "market_" + self.market_id)
        self.auto_update_limit = auto_update_limit

        if rebuild:
            logger.warning("Warning! You are trying to clear current database!")
            self._clear_market_storage()

        for pool_path in (
            self.learnware_pool_path,
            self.learnware_zip_pool_path,
            self.learnware_folder_pool_path,
        ):
            os.makedirs(pool_path, exist_ok=True)

        # Loaded after a possible rebuild so the in-memory tables always
        # mirror what is actually left in the database.
        (
            self.learnware_list,
            self.learnware_zip_list,
            self.learnware_folder_list,
            self.use_flags,
            self.count,
        ) = self.dbops.load_market()

        if not rebuild and os.path.exists(self.market_mapping_path):
            logger.info(f"Loading Market Mapping from Default Checkpoint {self.market_mapping_path}")
            self.market_mapping = HeteroMapping.load(checkpoint=self.market_store_path)
        else:
            if not rebuild:
                logger.warning("No Existing Market Mapping!!")
            # A mapping must always exist: add_learnware relies on it.
            self.market_mapping = HeteroMapping()

    def _clear_market_storage(self):
        """Best-effort removal of all stored learnwares (database rows and pool folder)."""
        import shutil  # local import: the module header does not import it

        try:
            self.dbops.clear_learnware_table()
        except Exception as err:
            # Rebuilding stays best-effort, but the failure is no longer silent.
            logger.warning(f"Failed to clear learnware table: {err}")
        shutil.rmtree(self.learnware_pool_path, ignore_errors=True)

    def reset(self, market_id=None, auto_update=False, **kwargs):
        """Store the market id, the auto-update switch and the model/training arguments.

        All extra keyword arguments are kept in ``self.training_args`` and are
        later forwarded to both ``HeteroMapping`` and ``Trainer``.
        """
        # model training arguments(model architecture + optimization) set via self.reset
        self.auto_update = auto_update
        self.market_id = market_id
        self.training_args = kwargs

    def add_learnware(
        self, zip_path: str, semantic_spec: dict, check_status: int, learnware: Learnware
    ) -> tuple[str, int]:
        """Register ``learnware`` and compute its heterogeneous specification.

        Returns
        -------
        tuple[str, int]
            The id under which the learnware was stored and ``check_status``.
        """
        self._update_learnware_list([learnware])
        self.learnware_list[learnware.id] = learnware
        self.count += 1

        if self.auto_update and self.count >= self.auto_update_limit:
            # self.learnware_list maps id -> learnware; train() needs the learnwares.
            # NOTE: training runs in a child process, so only the checkpoint
            # written by train() is visible to this process afterwards.
            train_process = multiprocessing.Process(
                target=self.train, args=(list(self.learnware_list.values()),)
            )
            train_process.start()
            # train_process.join()
        return learnware.id, check_status

    def delete_learnware(self, id: str) -> bool:
        """Not supported yet."""
        raise NotImplementedError

    def update_learnware(self, learnware: Learnware):
        """Not supported yet."""
        raise NotImplementedError

    def get_learnwares(self):
        """Return the id -> learnware mapping of this market."""
        return self.learnware_list

    def train(self, learnware_list: List[Learnware]):
        """Retrain the market mapping on ``learnware_list`` and refresh all hetero specs.

        Parameters
        ----------
        learnware_list: List[Learnware]
            Learnwares whose RKME reduced sets form the training tables. An
            id -> learnware dict is accepted as well.
        """
        if isinstance(learnware_list, dict):
            learnware_list = list(learnware_list.values())

        allset = self._learnwares_to_dataframes(learnware_list)
        self.market_mapping = HeteroMapping(**self.training_args)
        market_mapping_trainer = Trainer(
            model=self.market_mapping,
            train_set_list=allset,
            collate_fn=self.market_mapping.collate_fn,
            **self.training_args,
        )
        market_mapping_trainer.train()

        # auto save whenever market model retrained
        market_mapping_trainer.save_model(output_dir=self.market_store_path)

        # essential hetero-mapping update for each market learnware when market model retrained
        self._update_learnware_list(learnware_list)

    def _update_learnware_list(self, learnware_list: List[Learnware]):
        """Recompute and store the hetero specification of every given learnware."""
        hetero_mappings_save_path = os.path.join(self.market_store_path, "hetero_mappings")
        os.makedirs(hetero_mappings_save_path, exist_ok=True)
        for learnware in learnware_list:
            # commas are replaced before the id is used as a file name
            learnware.id = learnware.id.replace(",", "_")
            hetero_spec_path = os.path.join(hetero_mappings_save_path, f"{learnware.id}.npy")
            self._update_learnware_specification(learnware, save_path=hetero_spec_path)

    def _update_learnware_specification(self, learnware: Learnware, save_path: str) -> Learnware:
        """Attach a fresh ``HeteroSpecification`` to ``learnware`` and save it to ``save_path``."""
        specification = learnware.specification
        learnware_rkme = specification.get_stat_spec()["RKMEStatSpecification"]
        learnware_features = list(specification.get_semantic_spec()["Input"]["Description"].values())
        learnware_hetero_spec = self.market_mapping.hetero_mapping(learnware_rkme, learnware_features)
        learnware.update_stat_spec("HeteroSpecification", learnware_hetero_spec)

        # custom hetero spec save path?
        learnware_hetero_spec.save(save_path)
        return learnware

    def generate_hetero_map_spec(self, user_info: BaseUserInfo) -> HeteroSpecification:
        """Map the user's RKME specification with the current market mapping."""
        user_rkme = user_info.stat_info["RKMEStatSpecification"]
        user_features = list(user_info.semantic_spec["Input"]["Description"].values())
        return self.market_mapping.hetero_mapping(user_rkme, user_features)

    def _learnwares_to_dataframes(self, learnware_list: List[Learnware]) -> List[pd.DataFrame]:
        """Turn RKME reduced sets into DataFrames, merged per description-key set.

        Learnwares whose ``Input.Description`` dicts share the same sorted keys
        are concatenated into a single DataFrame.
        """
        learnware_df_dict = defaultdict(list)
        for learnware in learnware_list:
            specification = learnware.get_specification()
            learnware_rkme = specification.get_stat_spec()["RKMEStatSpecification"]
            learnware_features = specification.get_semantic_spec()["Input"]["Description"]
            learnware_df = pd.DataFrame(data=learnware_rkme.get_z(), columns=list(learnware_features.values()))
            learnware_df_dict[tuple(sorted(learnware_features))].append(learnware_df)

        return [pd.concat(dfs) for dfs in learnware_df_dict.values()]

    def save(self, save_path):
        """Not supported yet."""
        # The exception class used to be *returned*, which looked like success.
        raise NotImplementedError

    def __len__(self):
        return len(self.learnware_list)
def __init__(
    self,
    feature_tokenizer=None,
    hidden_dim=128,
    num_layer=2,
    num_attention_head=8,
    hidden_dropout_prob=0,
    ffn_dim=256,
    projection_dim=128,
    overlap_ratio=0.5,
    num_partition=3,
    temperature=10,
    base_temperature=10,
    activation="relu",
    device="cuda:0",
    checkpoint=None,
    **kwargs,
) -> None:
    """Build the tokenizer, embedding, encoder and projection modules.

    Parameters
    ----------
    feature_tokenizer:
        Tokenizer for column names; a ``FeatureTokenizer(**kwargs)`` is created
        when omitted.
    hidden_dim, num_layer, num_attention_head, hidden_dropout_prob, ffn_dim, activation:
        Architecture settings forwarded to the embedding and encoder modules.
    projection_dim: int
        Output size of the bias-free projection head.
    overlap_ratio, num_partition:
        Column partitioning settings handed to the contrastive collator.
    temperature, base_temperature:
        Scaling factors of the contrastive loss.
    device: str
        Device the whole module is moved to.
    checkpoint:
        Accepted for interface compatibility; not used by this constructor.
    **kwargs:
        Extra arguments, recorded in ``model_args`` and passed to the default
        ``FeatureTokenizer``.
    """
    super(HeteroMapping, self).__init__()

    # Everything required to rebuild an identical model in ``load``. The loss
    # temperatures are recorded too, otherwise a reloaded model silently falls
    # back to the defaults.
    self.model_args = {
        "num_partition": num_partition,
        "overlap_ratio": overlap_ratio,
        "hidden_dim": hidden_dim,
        "num_layer": num_layer,
        "num_attention_head": num_attention_head,
        "hidden_dropout_prob": hidden_dropout_prob,
        "ffn_dim": ffn_dim,
        "projection_dim": projection_dim,
        "activation": activation,
        "temperature": temperature,
        "base_temperature": base_temperature,
    }
    self.model_args.update(kwargs)

    if feature_tokenizer is None:
        feature_tokenizer = FeatureTokenizer(**kwargs)

    self.feature_tokenizer = feature_tokenizer

    self.feature_processor = FeatureProcessor(
        vocab_size=feature_tokenizer.vocab_size,
        pad_token_id=feature_tokenizer.pad_token_id,
        hidden_dim=hidden_dim,
        hidden_dropout_prob=hidden_dropout_prob,
        device=device,
    )

    self.encoder = TransformerMultiLayer(
        hidden_dim=hidden_dim,
        num_layer=num_layer,
        num_attention_head=num_attention_head,
        hidden_dropout_prob=hidden_dropout_prob,
        ffn_dim=ffn_dim,
        activation=activation,
    )
    self.cls_token = CLSToken(hidden_dim=hidden_dim)

    self.collate_fn = TransTabCollatorForCL(
        feature_tokenizer=feature_tokenizer, overlap_ratio=overlap_ratio, num_partition=num_partition
    )

    self.projection_head = nn.Linear(hidden_dim, projection_dim, bias=False)
    self.cross_entropy_loss = nn.CrossEntropyLoss()
    self.temperature = temperature
    self.base_temperature = base_temperature
    self.num_partition = num_partition
    self.overlap_ratio = overlap_ratio
    self.device = device
    # Single move once every sub-module (projection head included) exists.
    self.to(device)
def forward(self, x, y=None):
    """Compute the multi-view contrastive loss for one pre-tokenized batch.

    Parameters
    ----------
    x: dict
        Collator output; ``x["input_sub_x"]`` holds one encoded input dict per
        column partition (view).
    y:
        Unused.

    Returns
    -------
    The value returned by ``_self_supervised_contrastive_loss`` for the
    stacked per-view projections.

    Raises
    ------
    ValueError
        If ``x`` is not a dict.
    """
    if not isinstance(x, dict):
        raise ValueError(f"expect input x to be dict(pretokenized), get {type(x)} instead")

    # one projected [CLS] embedding per view (positive sample)
    view_projections = []
    for view_inputs in x["input_sub_x"]:
        hidden = self.feature_processor(**view_inputs)
        hidden = self.cls_token(**hidden)
        hidden = self.encoder(**hidden)
        cls_embedding = hidden[:, 0, :]
        view_projections.append(self.projection_head(cls_embedding))

    # bs, n_view, emb_dim -> multi-view InfoNCE loss
    stacked_views = torch.stack(view_projections, dim=1)
    return self._self_supervised_contrastive_loss(stacked_views)
def _extract_features(self, x, cols=None):
    """Encode ``x`` and return the [CLS] embedding of every sample.

    Parameters
    ----------
    x: dict, pd.DataFrame or torch.Tensor
        dict: already tokenized inputs, used as is; pd.DataFrame: tokenized by
        calling ``self.feature_tokenizer``; torch.Tensor: tokenized through
        ``self.feature_tokenizer.forward(cols, x)``.
    cols:
        Column names, only used for tensor input.

    Returns
    -------
    output_features:
        The first position (the [CLS] slot) of the encoder output.

    Raises
    ------
    ValueError
        If ``x`` is of any other type.
    """
    if isinstance(x, dict):
        encoded = x
    elif isinstance(x, pd.DataFrame):
        encoded = self.feature_tokenizer(x)
    elif isinstance(x, torch.Tensor):
        encoded = self.feature_tokenizer.forward(cols, x)
    else:
        raise ValueError(f"TransTabOutputFeatureExtractor takes inputs with dict or pd.DataFrame, find {type(x)}.")

    # embed -> prepend the cls embedding -> transformer encoder
    embedded = self.feature_processor(**encoded)
    with_cls = self.cls_token(**embedded)
    encoded_sequence = self.encoder(**with_cls)  # bs, seqlen+1, hidden_dim
    return encoded_sequence[:, 0, :]
def _self_supervised_contrastive_loss(self, features):
    """Compute the self-supervised VPCL loss.

    Parameters
    ----------
    features: torch.Tensor
        the encoded features of multiple partitions of input tables, with shape ``(bs, n_partition, proj_dim)``.

    Returns
    -------
    loss: torch.Tensor
        the computed self-supervised VPCL loss.
    """
    n_samples, n_views = features.shape[0], features.shape[1]

    # entry (i, j) is 1 when rows i and j stem from the same sample
    sample_ids = torch.arange(n_samples, dtype=torch.long, device=self.device).view(-1, 1)
    same_sample = torch.eq(sample_ids, sample_ids.T).float().to(sample_ids.device)

    # view-major flattening: [[0,1],[2,3]] -> [0,2,1,3]
    flat_views = torch.cat(torch.unbind(features, dim=1), dim=0)
    similarity = torch.div(torch.matmul(flat_views, flat_views.T), self.temperature)
    # subtract the row maximum before exponentiating
    row_max, _ = torch.max(similarity, dim=1, keepdim=True)
    logits = similarity - row_max.detach()

    same_sample = same_sample.repeat(n_views, n_views)
    # zero on the diagonal: an anchor is never contrasted with itself
    not_self = torch.scatter(
        torch.ones_like(same_sample),
        1,
        torch.arange(n_samples * n_views).view(-1, 1).to(features.device),
        0,
    )
    positives = same_sample * not_self

    exp_logits = torch.exp(logits) * not_self
    log_prob = logits - torch.log(exp_logits.sum(1, keepdim=True))
    # mean log-likelihood over the positives of each anchor
    mean_log_prob_pos = (positives * log_prob).sum(1) / positives.sum(1)
    per_anchor_loss = -(self.temperature / self.base_temperature) * mean_log_prob_pos
    return per_anchor_loss.view(n_views, n_samples).mean()
class TransformerLayer(nn.Module):
    """Transformer encoder layer whose feed-forward block is scaled by a sigmoid gate.

    Parameters
    ----------
    d_model: int
        Size of the token embeddings.
    nhead: int
        Number of attention heads.
    dim_feedforward: int
        Hidden size of the feed-forward block.
    dropout: float
        Dropout probability used after attention and inside the feed-forward block.
    activation: str or callable
        Feed-forward activation; strings are resolved by ``_get_activation_fn``.
    layer_norm_eps: float
        Epsilon of both layer norms.
    batch_first: bool
        Forwarded to ``nn.MultiheadAttention``.
    norm_first: bool
        Apply the layer norms before (True) or after (False) each residual branch.
    device, dtype:
        Factory arguments applied to every parameterized sub-module.
    use_layer_norm: bool
        When False, both layer norms are skipped entirely.
    """

    __config__ = ["batch_first", "norm_first"]

    def __init__(
        self,
        d_model,
        nhead,
        dim_feedforward=2048,
        dropout=0.1,
        activation=F.relu,
        layer_norm_eps=1e-5,
        batch_first=True,
        norm_first=False,
        device=None,
        dtype=None,
        use_layer_norm=True,
    ) -> None:
        factory_kwargs = {"device": device, "dtype": dtype}
        super().__init__()
        self.self_attn = nn.MultiheadAttention(d_model, nhead, batch_first=batch_first, **factory_kwargs)
        # Implementation of Feedforward model
        self.linear1 = nn.Linear(d_model, dim_feedforward, **factory_kwargs)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model, **factory_kwargs)

        # Implementation of gates; created with the same device/dtype as the
        # other layers so a non-default dtype does not break ``_ff_block``.
        self.gate_linear = nn.Linear(d_model, 1, bias=False, **factory_kwargs)
        self.gate_act = nn.Sigmoid()

        self.norm_first = norm_first
        self.use_layer_norm = use_layer_norm

        if self.use_layer_norm:
            self.norm1 = nn.LayerNorm(d_model, eps=layer_norm_eps, **factory_kwargs)
            self.norm2 = nn.LayerNorm(d_model, eps=layer_norm_eps, **factory_kwargs)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

        # Legacy string support for activation function.
        if isinstance(activation, str):
            self.activation = _get_activation_fn(activation)
        else:
            self.activation = activation

    # self-attention block
    def _sa_block(self, x: Tensor, attn_mask: Optional[Tensor], key_padding_mask: Optional[Tensor]) -> Tensor:
        """Self-attention followed by dropout.

        ``key_padding_mask`` follows the attention-mask convention of this file
        (non-zero = real token), so it is inverted into the "True = ignore"
        form expected by ``nn.MultiheadAttention``. ``None`` attends to all
        positions.
        """
        if key_padding_mask is not None:
            key_padding_mask = ~key_padding_mask.bool()
        x = self.self_attn(
            x,
            x,
            x,
            attn_mask=attn_mask,
            key_padding_mask=key_padding_mask,
        )[0]
        return self.dropout1(x)

    # feed forward block
    def _ff_block(self, x: Tensor) -> Tensor:
        """Feed-forward block whose hidden units are scaled by a per-token gate."""
        g = self.gate_act(self.gate_linear(x))
        h = self.linear1(x)
        h = h * g  # add gate
        h = self.linear2(self.dropout(self.activation(h)))
        return self.dropout2(h)

    def __setstate__(self, state):
        # states without an explicit activation fall back to relu
        if "activation" not in state:
            state["activation"] = F.relu
        super().__setstate__(state)

    def forward(self, src, src_mask=None, src_key_padding_mask=None, is_causal=None, **kwargs) -> Tensor:
        r"""Pass the input through the encoder layer.

        Args:
            src: the sequence to the encoder layer (required).
            src_mask: the mask for the src sequence (optional).
            src_key_padding_mask: the mask for the src keys per batch (optional),
                non-zero entries mark positions that may be attended to.
            is_causal: accepted for compatibility with ``nn.TransformerEncoder``; unused.

        Shape:
            see the docs in Transformer class.
        """
        # see Fig. 1 of https://arxiv.org/pdf/2002.04745v1.pdf
        x = src
        if self.use_layer_norm:
            if self.norm_first:
                x = x + self._sa_block(self.norm1(x), src_mask, src_key_padding_mask)
                x = x + self._ff_block(self.norm2(x))
            else:
                x = self.norm1(x + self._sa_block(x, src_mask, src_key_padding_mask))
                x = self.norm2(x + self._ff_block(x))
        else:  # do not use layer norm
            x = x + self._sa_block(x, src_mask, src_key_padding_mask)
            x = x + self._ff_block(x)
        return x
class FeatureTokenizer:
    r"""
    Process input dataframe to input indices towards encoder,
    usually used to build dataloader for paralleling loading.
    """

    def __init__(
        self,
        disable_tokenizer_parallel=False,
        **kwargs,
    ) -> None:
        """args:
        disable_tokenizer_parallel: true if use extractor for collator function in torch.DataLoader
        """
        self.tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
        self.tokenizer.__dict__["model_max_length"] = 512
        if disable_tokenizer_parallel:  # disable tokenizer parallel
            os.environ["TOKENIZERS_PARALLELISM"] = "false"
        self.vocab_size = self.tokenizer.vocab_size
        self.pad_token_id = self.tokenizer.pad_token_id

    def _tokenize_columns(self, cols):
        """Tokenize column names into padded id / attention-mask tensors."""
        return self.tokenizer(
            cols,
            padding=True,
            truncation=True,
            add_special_tokens=False,
            return_tensors="pt",
        )

    def __call__(self, x, shuffle=False, keep_input_grad=False) -> Dict:
        """
        Parameters
        ----------
        x: pd.DataFrame
            with column names and features.

        shuffle: bool
            if shuffle column order during the training.

        keep_input_grad: bool
            if the numerical feature tensor should be created with ``requires_grad=True``.

        Returns
        -------
        encoded_inputs: a dict with {
            'x_num': tensor contains numerical features (NaN replaced by 0),
            'num_col_input_ids': tensor contains numerical column tokenized ids,
            'num_att_mask': attention mask of the tokenized column names,
            'x_cat_input_ids', 'x_bin_input_ids': always None,
        }
        """
        encoded_inputs = {
            "x_num": None,
            "num_col_input_ids": None,
            "x_cat_input_ids": None,
            "x_bin_input_ids": None,
        }
        num_cols = x.columns.tolist()
        if shuffle:
            # np.random.shuffle works in place and returns None, so its result
            # must not be used as the column list.
            np.random.shuffle(num_cols)
        x_num = x[num_cols].fillna(0)

        # keep the grad only when requested
        x_num_ts = torch.tensor(x_num.values, dtype=float, requires_grad=bool(keep_input_grad))
        num_col_ts = self._tokenize_columns(num_cols)

        encoded_inputs["x_num"] = x_num_ts
        encoded_inputs["num_col_input_ids"] = num_col_ts["input_ids"]
        encoded_inputs["num_att_mask"] = num_col_ts["attention_mask"]  # mask out attention

        return encoded_inputs

    # ------------------------ New function ------------------------
    def forward(self, cols, x) -> Dict:
        """
        Parameters
        ----------
        cols: List[str]
            Contain all column names in order.

        x: torch.Tensor
            Numerical features, returned unchanged as ``'x_num'``.

        Returns
        -------
        encoded_inputs: a dict with {
            'x_num': tensor contains numerical features,
            'num_col_input_ids': tensor contains numerical column tokenized ids,
            'num_att_mask': attention mask of the tokenized column names,
        }
        """
        # any iterable of names (e.g. dict values) is accepted
        num_col_ts = self._tokenize_columns(list(cols))
        return {
            "x_num": x,
            "num_col_input_ids": num_col_ts["input_ids"],
            "num_att_mask": num_col_ts["attention_mask"],  # mask out attention
        }

    # NOTE: the tokenizer configuration is not persisted separately; the
    # former save/load helpers for it are disabled.
def _avg_embedding_by_mask(self, embs, att_mask=None):
    """Average token embeddings along dim 1, ignoring masked-out tokens.

    Parameters
    ----------
    embs: torch.Tensor
        Token embeddings; averaged over dim 1.
    att_mask: torch.Tensor, optional
        Mask over the first two dims of ``embs``; entries equal to 0 are
        excluded from the average. Without a mask a plain mean is returned.

    Returns
    -------
    torch.Tensor
        ``embs`` reduced over dim 1. A row whose mask is all zero yields zeros
        instead of NaN.
    """
    if att_mask is None:
        return embs.mean(1)

    att_mask = att_mask.to(embs.device)
    # masked_fill builds a new tensor, so the caller's embeddings are not
    # overwritten in place.
    masked = embs.masked_fill((att_mask == 0).unsqueeze(-1), 0)
    # clamp keeps an all-masked row from dividing by zero
    token_counts = att_mask.sum(1, keepdim=True).clamp(min=1)
    return masked.sum(1) / token_counts
def forward(self, embedding, attention_mask=None, **kwargs) -> Tensor:
    """Put the shared cls embedding in front of every sequence.

    ``embedding`` of shape (bs, d, hidden) becomes (bs, d+1, hidden); when an
    ``attention_mask`` is given it receives a matching leading column of ones.
    Returns a dict with the keys ``"embedding"`` and ``"attention_mask"``.
    """
    batch_size = len(embedding)
    # the same learnable vector (self.weight) is shared by all samples
    cls_column = self.expand(batch_size, 1)
    result = {"embedding": torch.cat([cls_column, embedding], dim=1)}

    if attention_mask is not None:
        # the cls position is always attended to
        ones_column = torch.ones(attention_mask.shape[0], 1).to(attention_mask.device)
        attention_mask = torch.cat([ones_column, attention_mask], 1)
    result["attention_mask"] = attention_mask
    return result
+ """ + self.model = model + if isinstance(train_set_list, tuple): + train_set_list = [train_set_list] + + self.train_set_list = train_set_list + self.collate_fn = collate_fn + self.trainloader_list = [ + self._build_dataloader(trainset, batch_size, collator=self.collate_fn) for trainset in train_set_list + ] + self.output_dir = output_dir + # os.makedirs(self.output_dir, exist_ok=True) + self.args = { + "lr": lr, + "weight_decay": weight_decay, + "batch_size": batch_size, + "num_epoch": num_epoch, + "eval_batch_size": eval_batch_size, + "num_training_steps": self._get_num_train_steps(train_set_list, num_epoch, batch_size), + } + self.args["steps_per_epoch"] = int(self.args["num_training_steps"] / (num_epoch * len(self.train_set_list))) + self.optimizer = None + + def train(self): + self._create_optimizer() + start_time = time.time() + final_train_loss = 0 + for epoch in trange(self.args["num_epoch"], desc="Epoch"): + ite = 0 + train_loss_all = 0 + for dataindex in range(len(self.trainloader_list)): + for data in self.trainloader_list[dataindex]: + self.optimizer.zero_grad() + loss = self.model(data) + loss.backward() + self.optimizer.step() + train_loss_all += loss.item() + ite += 1 + + logger.info( + "epoch: {}, train loss: {:.4f}, lr: {:.6f}, spent: {:.1f} secs".format( + epoch, + train_loss_all, + self.optimizer.param_groups[0]["lr"], + time.time() - start_time, + ) + ) + final_train_loss = train_loss_all + + # self.save_model(self.output_dir) + + logger.info("training complete, cost {:.1f} secs.".format(time.time() - start_time)) + return final_train_loss + + def save_model(self, output_dir=None): + if output_dir is None: + logger.info("no path assigned for save mode, default saved to ./ckpt/model.pt !") + output_dir = self.output_dir + + logger.info(f"saving model checkpoint to {output_dir}") + self.model.save(output_dir) + # self.collate_fn.save(output_dir) + + if self.args is not None: + train_args = {} + for k, v in self.args.items(): + if isinstance(v, 
int) or isinstance(v, str) or isinstance(v, float): + train_args[k] = v + with open( + os.path.join(output_dir, conf.market_training_args_path), + "w", + encoding="utf-8", + ) as f: + f.write(json.dumps(train_args)) + + def _create_optimizer(self): + if self.optimizer is None: + decay_parameters = self._get_parameter_names(self.model, [nn.LayerNorm]) + decay_parameters = [name for name in decay_parameters if "bias" not in name] + + decay_params_dict = {n: p for n, p in self.model.named_parameters() if n in decay_parameters} + no_decay_params_dict = {n: p for n, p in self.model.named_parameters() if n not in decay_parameters} + + optimizer_grouped_parameters = [ + { + "params": list(decay_params_dict.values()), + "weight_decay": self.args["weight_decay"], + }, + {"params": list(no_decay_params_dict.values()), "weight_decay": 0.0}, + ] + + self.optimizer = torch.optim.Adam(optimizer_grouped_parameters, lr=self.args["lr"]) + + def _get_num_train_steps(self, train_set_list, num_epoch, batch_size): + total_step = 0 + for trainset in train_set_list: + x_train = trainset + total_step += np.ceil(len(x_train) / batch_size) + total_step *= num_epoch + return total_step + + def _build_dataloader(self, trainset, batch_size, collator, shuffle=True): + trainloader = DataLoader( + TrainDataset(trainset), + collate_fn=collator, + batch_size=batch_size, + shuffle=shuffle, + pin_memory=True, + drop_last=False, + ) + return trainloader + + def _get_parameter_names(self, model, forbidden_layer_types): + """ + Returns the names of the model parameters that are not inside a forbidden layer. + """ + result = [] + for name, child in model.named_children(): + result += [ + f"{name}.{n}" + for n in self._get_parameter_names(child, forbidden_layer_types) + if not isinstance(child, tuple(forbidden_layer_types)) + ] + # Add model specific parameters (defined with nn.Parameter) since they are not in any child. 
+ result += list(model._parameters.keys()) + return result + + +class TrainDataset(Dataset): + def __init__(self, trainset): + self.x = trainset + + def __len__(self): + return len(self.x) + + def __getitem__(self, index): + x = self.x.iloc[index - 1 : index] + return x + + +class TransTabCollatorForCL: + """support positive pair sampling for contrastive learning of transtab model.""" + + def __init__( + self, + feature_tokenizer=None, + overlap_ratio=0.5, + num_partition=3, + **kwargs, + ) -> None: + self.feature_tokenizer = feature_tokenizer or FeatureTokenizer(disable_tokenizer_parallel=True) + assert num_partition > 0, f"number of contrastive subsets must be greater than 0, got {num_partition}" + assert isinstance(num_partition, int), f"number of constrative subsets must be int, got {type(num_partition)}" + assert overlap_ratio >= 0 and overlap_ratio < 1, f"overlap_ratio must be in [0, 1), got {overlap_ratio}" + self.overlap_ratio = overlap_ratio + self.num_partition = num_partition + + def __call__(self, data): + """ + Take a list of subsets (views) from the original tests. + """ + # 1. build positive pairs + # 2. encode each pair using feature extractor + df_x = pd.concat([row for row in data]) + if self.num_partition > 1: + sub_x_list = self._build_positive_pairs(df_x, self.num_partition) + else: + sub_x_list = self._build_positive_pairs_single_view(df_x) + input_x_list = [] + for sub_x in sub_x_list: + inputs = self.feature_tokenizer(sub_x) + input_x_list.append(inputs) + res = {"input_sub_x": input_x_list} + return res + + def _build_positive_pairs(self, x, n): + """ + Builds positive pairs of sub-dataframes from the input dataframe x. + + Args: + x (pandas.DataFrame): Input dataframe. + n (int): Number of sub-dataframes to split x into. + + Returns: + list: List of sub-dataframes, each containing a positive pair of columns from x. 
+ """ + x_cols = x.columns.tolist() + sub_col_list = np.array_split(np.array(x_cols), n) + len_cols = len(sub_col_list[0]) + overlap = int(math.ceil(len_cols * (self.overlap_ratio))) + sub_x_list = [] + for i, sub_col in enumerate(sub_col_list): + if overlap > 0 and i < n - 1: + sub_col = np.concatenate([sub_col, sub_col_list[i + 1][:overlap]]) + elif overlap > 0 and i == n - 1: + sub_col = np.concatenate([sub_col, sub_col_list[i - 1][-overlap:]]) + sub_x = x.copy()[sub_col] + sub_x_list.append(sub_x) + return sub_x_list + + def _build_positive_pairs_single_view(self, x): + """ + Builds positive pairs for a single view of data by corrupting half of the columns and shuffling the corrupted columns. + + Args: + x (pandas.DataFrame): The input data. + + Returns: + list: A list of two pandas DataFrames, where each DataFrame contains the original data with half of the columns corrupted and shuffled. + """ + x_cols = x.columns.tolist() + sub_x_list = [x] + n_corrupt = int(len(x_cols) * 0.5) + corrupt_cols = x_cols[:n_corrupt] + x_corrupt = x.copy()[corrupt_cols] + np.random.shuffle(x_corrupt.values) + sub_x_list.append(pd.concat([x.copy().drop(corrupt_cols, axis=1), x_corrupt], axis=1)) + return sub_x_list \ No newline at end of file From 63167faeb64d7a2b2b3ba87812dcf7c0abc733cb Mon Sep 17 00:00:00 2001 From: liuht Date: Mon, 6 Nov 2023 17:01:41 +0800 Subject: [PATCH 06/90] [MNT] add HeteroMapTableSearcher --- learnware/market/hetergeneous/searcher.py | 28 +++++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 learnware/market/hetergeneous/searcher.py diff --git a/learnware/market/hetergeneous/searcher.py b/learnware/market/hetergeneous/searcher.py new file mode 100644 index 0000000..2bf12f6 --- /dev/null +++ b/learnware/market/hetergeneous/searcher.py @@ -0,0 +1,28 @@ +from typing import List + +from .organizer import HeteroMapTableOrganizer +from ...learnware import Learnware +from ..base import BaseSearcher, BaseUserInfo +from ...logger import 
get_module_logger + +logger = get_module_logger("hetero_searcher") + +class HeteroMapTableSearcher(BaseSearcher): + def __init__(self, organizer: HeteroMapTableOrganizer = None): + super(HeteroMapTableSearcher, self).__init__(organizer) + + def __call__(self, user_info: BaseUserInfo, check_status: int = None) -> Learnware: + # todo: use specially assigned search_gamma for calculating mmd dist + learnware_list = self.learnware_oganizer.get_learnwares() + target_learnware, min_dist = None, None + user_hetero_spec = self.learnware_oganizer.generate_hetero_map_spec(user_info) + for learnware in learnware_list.values(): + learnware_hetero_spec = learnware.specification.get_stat_spec_by_name("HeteroSpecification") + mmd_dist = learnware_hetero_spec.dist(user_hetero_spec) + if target_learnware is None or mmd_dist < min_dist: + min_dist = mmd_dist + target_learnware = learnware + return target_learnware + + def reset(self, organizer): + self.learnware_oganizer = organizer \ No newline at end of file From 23dd1669852a8bbe6a8b436ad5cb7320b6baaa94 Mon Sep 17 00:00:00 2001 From: liuht Date: Mon, 6 Nov 2023 17:02:00 +0800 Subject: [PATCH 07/90] [MNT] add heterogeneous market --- learnware/market/__init__.py | 2 +- learnware/market/hetergeneous/__init__.py | 3 ++- learnware/market/module.py | 6 ++++++ 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/learnware/market/__init__.py b/learnware/market/__init__.py index 8a00fde..f30f0dc 100644 --- a/learnware/market/__init__.py +++ b/learnware/market/__init__.py @@ -3,7 +3,7 @@ from .base import BaseUserInfo, LearnwareMarket, BaseChecker, BaseOrganizer, Bas from .evolve_anchor import EvolvedAnchoredOrganizer from .evolve import EvolvedOrganizer from .easy2 import EasyOrganizer, EasySearcher, EasySemanticChecker, EasyStatChecker -from .hetergeneous import HeterogeneousOrganizer, MappingFunction +from .hetergeneous import HeteroMapTableOrganizer, HeteroMapTableSearcher from .easy import EasyMarket from .classes import 
CondaChecker diff --git a/learnware/market/hetergeneous/__init__.py b/learnware/market/hetergeneous/__init__.py index caef8fa..2d09a37 100644 --- a/learnware/market/hetergeneous/__init__.py +++ b/learnware/market/hetergeneous/__init__.py @@ -1 +1,2 @@ -from .organizer import MappingFunction, HeterogeneousOrganizer +from .organizer import HeteroMapTableOrganizer +from .searcher import HeteroMapTableSearcher \ No newline at end of file diff --git a/learnware/market/module.py b/learnware/market/module.py index 0b4eb00..43499ec 100644 --- a/learnware/market/module.py +++ b/learnware/market/module.py @@ -1,11 +1,17 @@ from .base import LearnwareMarket from .easy2 import EasyOrganizer, EasySearcher, EasySemanticChecker, EasyStatChecker +from .hetergeneous import HeteroMapTableOrganizer, HeteroMapTableSearcher MARKET_CONFIG = { "easy": { "organizer": EasyOrganizer(), "searcher": EasySearcher(), "checker_list": [EasySemanticChecker(), EasyStatChecker()], + }, + "hetero": { + "organizer": HeteroMapTableOrganizer(), + "searcher": HeteroMapTableSearcher(), + "checker_list": [] } } From 6c7d40d493c3698953d81be3a843a4a0536716cc Mon Sep 17 00:00:00 2001 From: liuht Date: Mon, 6 Nov 2023 17:47:31 +0800 Subject: [PATCH 08/90] [MNT] black format --- learnware/market/hetergeneous/__init__.py | 2 +- learnware/market/hetergeneous/database_ops.py | 11 ++--- learnware/market/hetergeneous/organizer.py | 5 ++- .../market/hetergeneous/organizer/__init__.py | 44 +++++++++---------- .../market/hetergeneous/organizer/config.py | 2 +- .../organizer/hetero_mapping/__init__.py | 28 ++++++------ .../organizer/hetero_mapping/trainer.py | 2 +- learnware/market/hetergeneous/searcher.py | 9 ++-- learnware/specification/system/base.py | 5 ++- learnware/specification/system/heter_table.py | 3 +- 10 files changed, 58 insertions(+), 53 deletions(-) diff --git a/learnware/market/hetergeneous/__init__.py b/learnware/market/hetergeneous/__init__.py index 2d09a37..dc6608d 100644 --- 
a/learnware/market/hetergeneous/__init__.py +++ b/learnware/market/hetergeneous/__init__.py @@ -1,2 +1,2 @@ from .organizer import HeteroMapTableOrganizer -from .searcher import HeteroMapTableSearcher \ No newline at end of file +from .searcher import HeteroMapTableSearcher diff --git a/learnware/market/hetergeneous/database_ops.py b/learnware/market/hetergeneous/database_ops.py index d2920bd..5d8461a 100644 --- a/learnware/market/hetergeneous/database_ops.py +++ b/learnware/market/hetergeneous/database_ops.py @@ -1,10 +1,11 @@ -from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy import create_engine, text -from sqlalchemy import Column, Integer, Text, DateTime, String -import os import json +import os + from learnware.learnware import get_learnware_from_dirpath from learnware.logger import get_module_logger +from sqlalchemy import (Column, DateTime, Integer, String, Text, create_engine, + text) +from sqlalchemy.ext.declarative import declarative_base logger = get_module_logger("database") DeclarativeBase = declarative_base() @@ -173,4 +174,4 @@ class DatabaseOperations(object): return learnware_list, zip_list, folder_list, use_flags, max_count + 1 pass - pass \ No newline at end of file + pass diff --git a/learnware/market/hetergeneous/organizer.py b/learnware/market/hetergeneous/organizer.py index 90e2410..15e9ea4 100644 --- a/learnware/market/hetergeneous/organizer.py +++ b/learnware/market/hetergeneous/organizer.py @@ -1,8 +1,9 @@ -import numpy as np from typing import List -from ..evolve.organizer import EvolvedOrganizer +import numpy as np + from ...learnware import Learnware +from ..evolve.organizer import EvolvedOrganizer class MappingFunction: diff --git a/learnware/market/hetergeneous/organizer/__init__.py b/learnware/market/hetergeneous/organizer/__init__.py index 53355e8..c3c5d48 100644 --- a/learnware/market/hetergeneous/organizer/__init__.py +++ b/learnware/market/hetergeneous/organizer/__init__.py @@ -1,40 +1,38 @@ from 
__future__ import annotations +import multiprocessing import os from collections import defaultdict from typing import List import pandas as pd -import multiprocessing -from .config import C as conf -from .hetero_mapping import Trainer, HeteroMapping -from ..database_ops import DatabaseOperations -from ...base import BaseUserInfo, BaseOrganizer from ....learnware import Learnware from ....logger import get_module_logger from ....specification.system import HeteroSpecification - +from ...base import BaseOrganizer, BaseUserInfo +from ..database_ops import DatabaseOperations +from .config import C as conf +from .hetero_mapping import HeteroMapping, Trainer logger = get_module_logger("hetero_market") + class HeteroMapTableOrganizer(BaseOrganizer): - def reload_market( - self, rebuild=False, auto_update_limit=50 - ): + def reload_market(self, rebuild=False, auto_update_limit=50): self.market_store_path = os.path.join(conf.market_root_path, self.market_id) self.market_mapping_path = os.path.join(self.market_store_path, conf.market_model_path) self.learnware_pool_path = os.path.join(self.market_store_path, "learnware_pool") self.learnware_zip_pool_path = os.path.join(self.market_store_path, "zips") self.learnware_folder_pool_path = os.path.join(self.market_store_path, "unzipped_learnwares") - self.learnware_list = {} # id:learnware + self.learnware_list = {} # id:learnware self.learnware_zip_list = {} self.learnware_folder_list = {} self.count = 0 # default root path: ../../.learnware self.root_path = conf.market_root_path self.dbops = DatabaseOperations(conf.database_url, "market_" + self.market_id) - self.auto_update_limit = auto_update_limit + self.auto_update_limit = auto_update_limit os.makedirs(self.learnware_pool_path, exist_ok=True) os.makedirs(self.learnware_zip_pool_path, exist_ok=True) @@ -62,15 +60,17 @@ class HeteroMapTableOrganizer(BaseOrganizer): else: logger.warning(f"No Existing Market Mapping!!") self.market_mapping = HeteroMapping() - + def reset(self, 
market_id=None, auto_update=False, **kwargs): # model training arguments(model architecture + optimization) set via self.reset self.auto_update = auto_update self.market_id = market_id self.training_args = kwargs - def add_learnware(self, zip_path: str, semantic_spec: dict, check_status: int, learnware: Learnware) -> Tuple[str, int]: - self._update_learnware_list([learnware]) + def add_learnware( + self, zip_path: str, semantic_spec: dict, check_status: int, learnware: Learnware + ) -> Tuple[str, int]: + self._update_learnware_list([learnware]) self.learnware_list[learnware.id] = learnware self.count += 1 @@ -78,13 +78,13 @@ class HeteroMapTableOrganizer(BaseOrganizer): train_process = multiprocessing.Process(target=self.train, args=(self.learnware_list,)) train_process.start() # train_process.join() - + def delete_learnware(self, id: str) -> bool: raise NotImplementedError - + def update_learnware(self, learnware: Learnware): raise NotImplementedError - + def get_learnwares(self): return self.learnware_list @@ -95,7 +95,7 @@ class HeteroMapTableOrganizer(BaseOrganizer): model=self.market_mapping, train_set_list=allset, collate_fn=self.market_mapping.collate_fn, - **self.training_args + **self.training_args, ) market_mapping_trainer.train() @@ -128,7 +128,7 @@ class HeteroMapTableOrganizer(BaseOrganizer): user_features = user_info.semantic_spec["Input"]["Description"].values() user_hetero_spec = self.market_mapping.hetero_mapping(user_rkme, user_features) return user_hetero_spec - + def _learnwares_to_dataframes(self, learnware_list: List[Learnware]) -> List[pd.DataFrame]: learnware_df_dict = defaultdict(list) for learnware in learnware_list: @@ -138,12 +138,12 @@ class HeteroMapTableOrganizer(BaseOrganizer): learnware_df = pd.DataFrame(data=learnware_rkme.get_z(), columns=learnware_features.values()) learnware_df_dict[tuple(sorted(learnware_features))].append(learnware_df) - + merged_dfs = [pd.concat(dfs) for dfs in learnware_df_dict.values()] return merged_dfs 
def save(self, save_path): return NotImplementedError - + def __len__(self): - return len(self.learnware_list) \ No newline at end of file + return len(self.learnware_list) diff --git a/learnware/market/hetergeneous/organizer/config.py b/learnware/market/hetergeneous/organizer/config.py index a878ace..2510e3a 100644 --- a/learnware/market/hetergeneous/organizer/config.py +++ b/learnware/market/hetergeneous/organizer/config.py @@ -47,7 +47,7 @@ _DEFAULT_CONFIG = { "yaml_file": "learnware.yaml", "module_file": "__init__.py", }, - "database_url": f"sqlite:///{DATABASE_PATH}" + "database_url": f"sqlite:///{DATABASE_PATH}", } C = Config(_DEFAULT_CONFIG) diff --git a/learnware/market/hetergeneous/organizer/hetero_mapping/__init__.py b/learnware/market/hetergeneous/organizer/hetero_mapping/__init__.py index 90014e7..8a992d5 100644 --- a/learnware/market/hetergeneous/organizer/hetero_mapping/__init__.py +++ b/learnware/market/hetergeneous/organizer/hetero_mapping/__init__.py @@ -33,17 +33,17 @@ class HeteroMapping(nn.Module): **kwargs, ) -> None: super(HeteroMapping, self).__init__() - + self.model_args = { - 'num_partition': num_partition, - 'overlap_ratio': overlap_ratio, - 'hidden_dim': hidden_dim, - 'num_layer': num_layer, - 'num_attention_head': num_attention_head, - 'hidden_dropout_prob': hidden_dropout_prob, - 'ffn_dim': ffn_dim, - 'projection_dim': projection_dim, - 'activation': activation + "num_partition": num_partition, + "overlap_ratio": overlap_ratio, + "hidden_dim": hidden_dim, + "num_layer": num_layer, + "num_attention_head": num_attention_head, + "hidden_dropout_prob": hidden_dropout_prob, + "ffn_dim": ffn_dim, + "projection_dim": projection_dim, + "activation": activation, } self.model_args.update(kwargs) @@ -126,13 +126,13 @@ class HeteroMapping(nn.Module): model_info = { "model_state_dict": self.state_dict(), "model_args": self.model_args, - "feature_tokenizer": self.feature_tokenizer + "feature_tokenizer": self.feature_tokenizer, } 
torch.save(model_info, os.path.join(ckpt_dir, conf.market_model_path)) def forward(self, x, y=None): # do positive sampling - feat_x_list = [] + feat_x_list = [] if isinstance(x, dict): # pretokenized inputs for input_x in x["input_sub_x"]: @@ -149,7 +149,7 @@ class HeteroMapping(nn.Module): feat_x_multiview = torch.stack(feat_x_list, axis=1) # bs, n_view, emb_dim loss = self._self_supervised_contrastive_loss(feat_x_multiview) return loss - + def hetero_mapping(self, rkme_spec: RKMETableSpecification, cols: List[str]) -> HeteroSpecification: hetero_spec = HeteroSpecification() hetero_input_df = pd.DataFrame(data=rkme_spec.get_z(), columns=cols) @@ -417,4 +417,4 @@ class TransformerMultiLayer(nn.Module): outputs = embedding for i, mod in enumerate(self.transformer_encoder): outputs = mod(outputs, src_key_padding_mask=attention_mask) - return outputs \ No newline at end of file + return outputs diff --git a/learnware/market/hetergeneous/organizer/hetero_mapping/trainer.py b/learnware/market/hetergeneous/organizer/hetero_mapping/trainer.py index 321bcdd..5845c36 100644 --- a/learnware/market/hetergeneous/organizer/hetero_mapping/trainer.py +++ b/learnware/market/hetergeneous/organizer/hetero_mapping/trainer.py @@ -250,4 +250,4 @@ class TransTabCollatorForCL: x_corrupt = x.copy()[corrupt_cols] np.random.shuffle(x_corrupt.values) sub_x_list.append(pd.concat([x.copy().drop(corrupt_cols, axis=1), x_corrupt], axis=1)) - return sub_x_list \ No newline at end of file + return sub_x_list diff --git a/learnware/market/hetergeneous/searcher.py b/learnware/market/hetergeneous/searcher.py index 2bf12f6..d70b06e 100644 --- a/learnware/market/hetergeneous/searcher.py +++ b/learnware/market/hetergeneous/searcher.py @@ -1,12 +1,13 @@ from typing import List -from .organizer import HeteroMapTableOrganizer from ...learnware import Learnware -from ..base import BaseSearcher, BaseUserInfo from ...logger import get_module_logger +from ..base import BaseSearcher, BaseUserInfo +from 
.organizer import HeteroMapTableOrganizer logger = get_module_logger("hetero_searcher") + class HeteroMapTableSearcher(BaseSearcher): def __init__(self, organizer: HeteroMapTableOrganizer = None): super(HeteroMapTableSearcher, self).__init__(organizer) @@ -23,6 +24,6 @@ class HeteroMapTableSearcher(BaseSearcher): min_dist = mmd_dist target_learnware = learnware return target_learnware - + def reset(self, organizer): - self.learnware_oganizer = organizer \ No newline at end of file + self.learnware_oganizer = organizer diff --git a/learnware/specification/system/base.py b/learnware/specification/system/base.py index 8369b28..12a7226 100644 --- a/learnware/specification/system/base.py +++ b/learnware/specification/system/base.py @@ -1,8 +1,9 @@ from __future__ import annotations -from ..base import BaseStatSpecification from loguru import logger +from ..base import BaseStatSpecification + class SystemStatsSpecification(BaseStatSpecification): def generate_stat_spec(self, **kwargs): @@ -13,4 +14,4 @@ class SystemStatsSpecification(BaseStatSpecification): - kwargs may include the feature, label and model - kwargs also can include hyperparameters of specific method for specifaction generation """ - raise NotImplementedError("generate_stat_spec_from_data is not implemented") \ No newline at end of file + raise NotImplementedError("generate_stat_spec_from_data is not implemented") diff --git a/learnware/specification/system/heter_table.py b/learnware/specification/system/heter_table.py index e878ca6..e611ca1 100644 --- a/learnware/specification/system/heter_table.py +++ b/learnware/specification/system/heter_table.py @@ -7,11 +7,12 @@ import os import numpy as np import torch + from ..regular.table import RKMEStatSpecification from ..regular.table.rkme import choose_device, setup_seed, torch_rbf_kernel - from .base import SystemStatsSpecification + class HeteroSpecification(SystemStatsSpecification): """Heterogeneous Embedding Specification""" From 
677a3caed15e079f37fa8e5339cb9a6a2ae8faf9 Mon Sep 17 00:00:00 2001 From: Gene Date: Mon, 6 Nov 2023 19:09:35 +0800 Subject: [PATCH 09/90] [FIX] fix bugs in create_semantic_specification --- learnware/client/learnware_client.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/learnware/client/learnware_client.py b/learnware/client/learnware_client.py index a5d20b3..df9121d 100644 --- a/learnware/client/learnware_client.py +++ b/learnware/client/learnware_client.py @@ -265,11 +265,7 @@ class LearnwareClient: semantic_specification["Input"] = input_description semantic_specification["Output"] = output_description - if self._check_semantic_specification(semantic_specification): - return semantic_specification - else: - logger.error("The parameters passed in create_semantic_specification() are illegal!") - return None + return semantic_specification def list_semantic_specification_values(self, key: SemanticSpecificationKey): url = f"{self.host}/engine/semantic_specification" From ed38d8afb68c19ef48120902f6f8f0befaf15f59 Mon Sep 17 00:00:00 2001 From: liuht Date: Mon, 6 Nov 2023 21:24:24 +0800 Subject: [PATCH 10/90] [FIX] BaseOrganizer -> EasyOrganizer --- learnware/market/hetergeneous/organizer/__init__.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/learnware/market/hetergeneous/organizer/__init__.py b/learnware/market/hetergeneous/organizer/__init__.py index c3c5d48..db0c05e 100644 --- a/learnware/market/hetergeneous/organizer/__init__.py +++ b/learnware/market/hetergeneous/organizer/__init__.py @@ -10,7 +10,8 @@ import pandas as pd from ....learnware import Learnware from ....logger import get_module_logger from ....specification.system import HeteroSpecification -from ...base import BaseOrganizer, BaseUserInfo +from ...base import BaseUserInfo +from ...easy2 import EasyOrganizer from ..database_ops import DatabaseOperations from .config import C as conf from .hetero_mapping import HeteroMapping, Trainer @@ -18,7 
+19,7 @@ from .hetero_mapping import HeteroMapping, Trainer logger = get_module_logger("hetero_market") -class HeteroMapTableOrganizer(BaseOrganizer): +class HeteroMapTableOrganizer(EasyOrganizer): def reload_market(self, rebuild=False, auto_update_limit=50): self.market_store_path = os.path.join(conf.market_root_path, self.market_id) self.market_mapping_path = os.path.join(self.market_store_path, conf.market_model_path) @@ -29,9 +30,8 @@ class HeteroMapTableOrganizer(BaseOrganizer): self.learnware_zip_list = {} self.learnware_folder_list = {} self.count = 0 - # default root path: ../../.learnware - self.root_path = conf.market_root_path self.dbops = DatabaseOperations(conf.database_url, "market_" + self.market_id) + self.auto_update = False self.auto_update_limit = auto_update_limit os.makedirs(self.learnware_pool_path, exist_ok=True) From 5588d79076d917494e21da4ba7994babdecf9a53 Mon Sep 17 00:00:00 2001 From: Peng Tan Date: Tue, 7 Nov 2023 13:45:04 +0800 Subject: [PATCH 11/90] [ENH] add test_prepare_learnware_randomly & test_generated_learnwares (multi-processing) --- learnware/market/module.py | 2 +- .../example_learnwares/config.py | 1 + .../example_learnware_0/__init__.py | 22 ++ .../example_learnware_0/learnware.yaml | 8 + .../example_learnware_0/requirements.txt | 1 + .../example_learnware_1/__init__.py | 22 ++ .../example_learnware_1/learnware.yaml | 8 + .../example_learnware_1/requirements.txt | 1 + .../test_hetero_market/test_hetero.py | 236 ++++++++++++++++++ 9 files changed, 300 insertions(+), 1 deletion(-) create mode 100644 tests/test_market/test_hetero_market/example_learnwares/config.py create mode 100644 tests/test_market/test_hetero_market/example_learnwares/example_learnware_0/__init__.py create mode 100644 tests/test_market/test_hetero_market/example_learnwares/example_learnware_0/learnware.yaml create mode 100644 tests/test_market/test_hetero_market/example_learnwares/example_learnware_0/requirements.txt create mode 100644 
tests/test_market/test_hetero_market/example_learnwares/example_learnware_1/__init__.py create mode 100644 tests/test_market/test_hetero_market/example_learnwares/example_learnware_1/learnware.yaml create mode 100644 tests/test_market/test_hetero_market/example_learnwares/example_learnware_1/requirements.txt create mode 100644 tests/test_market/test_hetero_market/test_hetero.py diff --git a/learnware/market/module.py b/learnware/market/module.py index 43499ec..f60e06a 100644 --- a/learnware/market/module.py +++ b/learnware/market/module.py @@ -11,7 +11,7 @@ MARKET_CONFIG = { "hetero": { "organizer": HeteroMapTableOrganizer(), "searcher": HeteroMapTableSearcher(), - "checker_list": [] + "checker_list": [EasySemanticChecker(), EasyStatChecker()] } } diff --git a/tests/test_market/test_hetero_market/example_learnwares/config.py b/tests/test_market/test_hetero_market/example_learnwares/config.py new file mode 100644 index 0000000..6c8459a --- /dev/null +++ b/tests/test_market/test_hetero_market/example_learnwares/config.py @@ -0,0 +1 @@ +input_shape_list=[20, 30] # 20-input shape of example learnware 0, 30-input shape of example learnware 1 \ No newline at end of file diff --git a/tests/test_market/test_hetero_market/example_learnwares/example_learnware_0/__init__.py b/tests/test_market/test_hetero_market/example_learnwares/example_learnware_0/__init__.py new file mode 100644 index 0000000..e9c6cf0 --- /dev/null +++ b/tests/test_market/test_hetero_market/example_learnwares/example_learnware_0/__init__.py @@ -0,0 +1,22 @@ +from learnware.model import BaseModel +import numpy as np +import joblib +import os + + +class MyModel(BaseModel): + def __init__(self): + super(MyModel, self).__init__(input_shape=(20,), output_shape=(1,)) + dir_path = os.path.dirname(os.path.abspath(__file__)) + model_path=os.path.join(dir_path, "ridge.pkl") + model = joblib.load(model_path) + self.model=model + + def fit(self, X: np.ndarray, y: np.ndarray): + pass + + def predict(self, X: 
np.ndarray) -> np.ndarray: + return self.model.predict(X) + + def finetune(self, X: np.ndarray, y: np.ndarray): + pass \ No newline at end of file diff --git a/tests/test_market/test_hetero_market/example_learnwares/example_learnware_0/learnware.yaml b/tests/test_market/test_hetero_market/example_learnwares/example_learnware_0/learnware.yaml new file mode 100644 index 0000000..4a37a37 --- /dev/null +++ b/tests/test_market/test_hetero_market/example_learnwares/example_learnware_0/learnware.yaml @@ -0,0 +1,8 @@ +model: + class_name: MyModel + kwargs: {} +stat_specifications: + - module_path: learnware.specification + class_name: RKMETableSpecification + file_name: stat.json + kwargs: {} \ No newline at end of file diff --git a/tests/test_market/test_hetero_market/example_learnwares/example_learnware_0/requirements.txt b/tests/test_market/test_hetero_market/example_learnwares/example_learnware_0/requirements.txt new file mode 100644 index 0000000..1da1c5f --- /dev/null +++ b/tests/test_market/test_hetero_market/example_learnwares/example_learnware_0/requirements.txt @@ -0,0 +1 @@ +learnware == 0.1.0.999 \ No newline at end of file diff --git a/tests/test_market/test_hetero_market/example_learnwares/example_learnware_1/__init__.py b/tests/test_market/test_hetero_market/example_learnwares/example_learnware_1/__init__.py new file mode 100644 index 0000000..934e352 --- /dev/null +++ b/tests/test_market/test_hetero_market/example_learnwares/example_learnware_1/__init__.py @@ -0,0 +1,22 @@ +from learnware.model import BaseModel +import numpy as np +import joblib +import os + + +class MyModel(BaseModel): + def __init__(self): + super(MyModel, self).__init__(input_shape=(30,), output_shape=(1,)) + dir_path = os.path.dirname(os.path.abspath(__file__)) + model_path=os.path.join(dir_path, "ridge.pkl") + model = joblib.load(model_path) + self.model=model + + def fit(self, X: np.ndarray, y: np.ndarray): + pass + + def predict(self, X: np.ndarray) -> np.ndarray: + return 
self.model.predict(X) + + def finetune(self, X: np.ndarray, y: np.ndarray): + pass \ No newline at end of file diff --git a/tests/test_market/test_hetero_market/example_learnwares/example_learnware_1/learnware.yaml b/tests/test_market/test_hetero_market/example_learnwares/example_learnware_1/learnware.yaml new file mode 100644 index 0000000..4a37a37 --- /dev/null +++ b/tests/test_market/test_hetero_market/example_learnwares/example_learnware_1/learnware.yaml @@ -0,0 +1,8 @@ +model: + class_name: MyModel + kwargs: {} +stat_specifications: + - module_path: learnware.specification + class_name: RKMETableSpecification + file_name: stat.json + kwargs: {} \ No newline at end of file diff --git a/tests/test_market/test_hetero_market/example_learnwares/example_learnware_1/requirements.txt b/tests/test_market/test_hetero_market/example_learnwares/example_learnware_1/requirements.txt new file mode 100644 index 0000000..1da1c5f --- /dev/null +++ b/tests/test_market/test_hetero_market/example_learnwares/example_learnware_1/requirements.txt @@ -0,0 +1 @@ +learnware == 0.1.0.999 \ No newline at end of file diff --git a/tests/test_market/test_hetero_market/test_hetero.py b/tests/test_market/test_hetero_market/test_hetero.py new file mode 100644 index 0000000..32b4759 --- /dev/null +++ b/tests/test_market/test_hetero_market/test_hetero.py @@ -0,0 +1,236 @@ +import sys +import unittest +import os +import copy +import joblib +import zipfile +import numpy as np +from sklearn.linear_model import Ridge +from sklearn.datasets import make_regression +from sklearn.datasets import load_digits +from shutil import copyfile, rmtree +from multiprocessing import Pool +from learnware.client import LearnwareClient + +import learnware +from learnware.market import instantiate_learnware_market, BaseUserInfo +import learnware.specification as specification +from example_learnwares.config import input_shape_list + +curr_root = os.path.dirname(os.path.abspath(__file__)) + +user_semantic = { + "Data": 
{"Values": ["Image"], "Type": "Class"}, + "Task": { + "Values": ["Classification"], + "Type": "Class", + }, + "Library": {"Values": ["Scikit-learn"], "Type": "Class"}, + "Scenario": {"Values": ["Education"], "Type": "Tag"}, + "Description": {"Values": "", "Type": "String"}, + "Name": {"Values": "", "Type": "String"}, + "Output": { + "Dimension": 10, + "Description": { + "0": "the probability of the label is zero", + }, + }, +} + + +def check_learnware(learnware_name, dir_path=os.path.join(curr_root, "learnware_pool")): + print(f"Checking Learnware: {learnware_name}") + zip_file_path = os.path.join(dir_path, learnware_name) + client = LearnwareClient() + # if check_learnware doesn't raise an exception, return True, otherwise, return false + try: + client.check_learnware(zip_file_path) + return True + except Exception as e: + print(f"Learnware {learnware_name} failed the check: {e}") + return False + + +class TestMarket(unittest.TestCase): + @classmethod + def setUpClass(cls) -> None: + np.random.seed(2023) + learnware.init() + + def _init_learnware_market(self): + """initialize learnware market""" + easy_market = instantiate_learnware_market(market_id="hetero_toy", name="hetero", rebuild=True) + return easy_market + + def test_prepare_learnware_randomly(self, learnware_num=5): + self.zip_path_list = [] + X, y = load_digits(return_X_y=True) + + for i in range(learnware_num): + dir_path = os.path.join(curr_root, "learnware_pool", "ridge_%d" % (i)) + os.makedirs(dir_path, exist_ok=True) + + print("Preparing Learnware: %d" % (i)) + + example_learnware_idx=i%2 + input_dim=input_shape_list[example_learnware_idx] + example_learnware_name="example_learnwares/example_learnware_%d" % (example_learnware_idx) + + X, y = make_regression(n_samples=5000, n_features=input_dim, noise=0.1, random_state=42) + + clf=Ridge(alpha=1.0) + clf.fit(X, y) + + joblib.dump(clf, os.path.join(dir_path, "ridge.pkl")) + + spec = specification.utils.generate_rkme_spec(X=X, gamma=0.1, cuda_idx=0) + 
spec.save(os.path.join(dir_path, "stat.json")) + + init_file = os.path.join(dir_path, "__init__.py") + copyfile( + os.path.join(curr_root, example_learnware_name, "__init__.py"), init_file + ) # cp example_init.py init_file + + yaml_file = os.path.join(dir_path, "learnware.yaml") + copyfile(os.path.join(curr_root, example_learnware_name, "learnware.yaml"), yaml_file) # cp example.yaml yaml_file + + env_file = os.path.join(dir_path, "requirements.txt") + copyfile(os.path.join(curr_root, example_learnware_name, "requirements.txt"), env_file) + + zip_file = dir_path + ".zip" + # zip -q -r -j zip_file dir_path + with zipfile.ZipFile(zip_file, "w") as zip_obj: + for foldername, subfolders, filenames in os.walk(dir_path): + for filename in filenames: + file_path = os.path.join(foldername, filename) + zip_info = zipfile.ZipInfo(filename) + zip_info.compress_type = zipfile.ZIP_STORED + with open(file_path, "rb") as file: + zip_obj.writestr(zip_info, file.read()) + + rmtree(dir_path) # rm -r dir_path + + def test_generated_learnwares(self): + curr_root = os.path.dirname(os.path.abspath(__file__)) + dir_path = os.path.join(curr_root, "learnware_pool") + + # Execute multi-process checking using Pool + with Pool() as pool: + results = pool.starmap(check_learnware, [(name, dir_path) for name in os.listdir(dir_path)]) + + # Use an assert statement to ensure that all checks return True + self.assertTrue(all(results), "Not all learnwares passed the check") + + # def test_upload_delete_learnware(self, learnware_num=5, delete=True): + # easy_market = self._init_learnware_market() + # self.test_prepare_learnware_randomly(learnware_num) + # self.learnware_num = learnware_num + + # print("Total Item:", len(easy_market)) + # assert len(easy_market) == 0, f"The market should be empty!" 
+ + # for idx, zip_path in enumerate(self.zip_path_list): + # semantic_spec = copy.deepcopy(user_semantic) + # semantic_spec["Name"]["Values"] = "learnware_%d" % (idx) + # semantic_spec["Description"]["Values"] = "test_learnware_number_%d" % (idx) + # easy_market.add_learnware(zip_path, semantic_spec) + + # print("Total Item:", len(easy_market)) + # assert len(easy_market) == self.learnware_num, f"The number of learnwares must be {self.learnware_num}!" + + # curr_inds = easy_market.get_learnware_ids() + # print("Available ids After Uploading Learnwares:", curr_inds) + # assert len(curr_inds) == self.learnware_num, f"The number of learnwares must be {self.learnware_num}!" + + # if delete: + # for learnware_id in curr_inds: + # easy_market.delete_learnware(learnware_id) + # self.learnware_num -= 1 + # assert len(easy_market) == self.learnware_num, f"The number of learnwares must be {self.learnware_num}!" + + # curr_inds = easy_market.get_learnware_ids() + # print("Available ids After Deleting Learnwares:", curr_inds) + # assert len(curr_inds) == 0, f"The market should be empty!" + + # return easy_market + + # def test_search_semantics(self, learnware_num=5): + # easy_market = self.test_upload_delete_learnware(learnware_num, delete=False) + # print("Total Item:", len(easy_market)) + # assert len(easy_market) == self.learnware_num, f"The number of learnwares must be {self.learnware_num}!" + + # semantic_spec = copy.deepcopy(user_semantic) + # semantic_spec["Name"]["Values"] = f"learnware_{learnware_num - 1}" + + # user_info = BaseUserInfo(semantic_spec=semantic_spec) + # _, single_learnware_list, _, _ = easy_market.search_learnware(user_info) + + # print("User info:", user_info.get_semantic_spec()) + # print(f"Search result:") + # assert len(single_learnware_list) == 1, f"Exact semantic search failed!" 
+ # for learnware in single_learnware_list: + # semantic_spec1 = learnware.get_specification().get_semantic_spec() + # print("Choose learnware:", learnware.id, semantic_spec1) + # assert semantic_spec1["Name"]["Values"] == semantic_spec["Name"]["Values"], f"Exact semantic search failed!" + + # semantic_spec["Name"]["Values"] = "laernwaer" + # user_info = BaseUserInfo(semantic_spec=semantic_spec) + # _, single_learnware_list, _, _ = easy_market.search_learnware(user_info) + + # print("User info:", user_info.get_semantic_spec()) + # print(f"Search result:") + # assert len(single_learnware_list) == self.learnware_num, f"Fuzzy semantic search failed!" + # for learnware in single_learnware_list: + # semantic_spec1 = learnware.get_specification().get_semantic_spec() + # print("Choose learnware:", learnware.id, semantic_spec1) + + # def test_stat_search(self, learnware_num=5): + # easy_market = self.test_upload_delete_learnware(learnware_num, delete=False) + # print("Total Item:", len(easy_market)) + + # test_folder = os.path.join(curr_root, "test_stat") + + # for idx, zip_path in enumerate(self.zip_path_list): + # unzip_dir = os.path.join(test_folder, f"{idx}") + + # # unzip -o -q zip_path -d unzip_dir + # if os.path.exists(unzip_dir): + # rmtree(unzip_dir) + # os.makedirs(unzip_dir, exist_ok=True) + # with zipfile.ZipFile(zip_path, "r") as zip_obj: + # zip_obj.extractall(path=unzip_dir) + + # user_spec = specification.rkme.RKMETableSpecification() + # user_spec.load(os.path.join(unzip_dir, "svm.json")) + # user_info = BaseUserInfo(semantic_spec=user_semantic, stat_info={"RKMETableSpecification": user_spec}) + # ( + # sorted_score_list, + # single_learnware_list, + # mixture_score, + # mixture_learnware_list, + # ) = easy_market.search_learnware(user_info) + + # assert len(single_learnware_list) == self.learnware_num, f"Statistical search failed!" 
+ # print(f"search result of user{idx}:") + # for score, learnware in zip(sorted_score_list, single_learnware_list): + # print(f"score: {score}, learnware_id: {learnware.id}") + # print(f"mixture_score: {mixture_score}\n") + # mixture_id = " ".join([learnware.id for learnware in mixture_learnware_list]) + # print(f"mixture_learnware: {mixture_id}\n") + + # rmtree(test_folder) # rm -r test_folder + + +def suite(): + _suite = unittest.TestSuite() + _suite.addTest(TestMarket("test_prepare_learnware_randomly")) + _suite.addTest(TestMarket("test_generated_learnwares")) + # _suite.addTest(TestMarket("test_upload_delete_learnware")) + # _suite.addTest(TestMarket("test_search_semantics")) + # _suite.addTest(TestMarket("test_stat_search")) + return _suite + + +if __name__ == "__main__": + runner = unittest.TextTestRunner() + runner.run(suite()) From ce66805f41970ecaf3a053fff0a513a8eaef5554 Mon Sep 17 00:00:00 2001 From: Peng Tan Date: Tue, 7 Nov 2023 14:00:56 +0800 Subject: [PATCH 12/90] [MNT] modify variable names --- .../test_hetero_market/test_hetero.py | 54 +++++++++---------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/tests/test_market/test_hetero_market/test_hetero.py b/tests/test_market/test_hetero_market/test_hetero.py index 32b4759..5d08ea7 100644 --- a/tests/test_market/test_hetero_market/test_hetero.py +++ b/tests/test_market/test_hetero_market/test_hetero.py @@ -59,8 +59,8 @@ class TestMarket(unittest.TestCase): def _init_learnware_market(self): """initialize learnware market""" - easy_market = instantiate_learnware_market(market_id="hetero_toy", name="hetero", rebuild=True) - return easy_market + hetero_market = instantiate_learnware_market(market_id="hetero_toy", name="hetero", rebuild=True) + return hetero_market def test_prepare_learnware_randomly(self, learnware_num=5): self.zip_path_list = [] @@ -121,38 +121,38 @@ class TestMarket(unittest.TestCase): # Use an assert statement to ensure that all checks return True 
self.assertTrue(all(results), "Not all learnwares passed the check") - # def test_upload_delete_learnware(self, learnware_num=5, delete=True): - # easy_market = self._init_learnware_market() - # self.test_prepare_learnware_randomly(learnware_num) - # self.learnware_num = learnware_num + def test_upload_delete_learnware(self, learnware_num=5, delete=True): + hetero_market = self._init_learnware_market() + self.test_prepare_learnware_randomly(learnware_num) + self.learnware_num = learnware_num - # print("Total Item:", len(easy_market)) - # assert len(easy_market) == 0, f"The market should be empty!" + print("Total Item:", len(hetero_market)) + assert len(hetero_market) == 0, f"The market should be empty!" - # for idx, zip_path in enumerate(self.zip_path_list): - # semantic_spec = copy.deepcopy(user_semantic) - # semantic_spec["Name"]["Values"] = "learnware_%d" % (idx) - # semantic_spec["Description"]["Values"] = "test_learnware_number_%d" % (idx) - # easy_market.add_learnware(zip_path, semantic_spec) + for idx, zip_path in enumerate(self.zip_path_list): + semantic_spec = copy.deepcopy(user_semantic) + semantic_spec["Name"]["Values"] = "learnware_%d" % (idx) + semantic_spec["Description"]["Values"] = "test_learnware_number_%d" % (idx) + hetero_market.add_learnware(zip_path, semantic_spec) - # print("Total Item:", len(easy_market)) - # assert len(easy_market) == self.learnware_num, f"The number of learnwares must be {self.learnware_num}!" + print("Total Item:", len(hetero_market)) + assert len(hetero_market) == self.learnware_num, f"The number of learnwares must be {self.learnware_num}!" - # curr_inds = easy_market.get_learnware_ids() - # print("Available ids After Uploading Learnwares:", curr_inds) - # assert len(curr_inds) == self.learnware_num, f"The number of learnwares must be {self.learnware_num}!" 
+ curr_ids = hetero_market.get_learnware_ids() + print("Available ids After Uploading Learnwares:", curr_ids) + assert len(curr_ids) == self.learnware_num, f"The number of learnwares must be {self.learnware_num}!" - # if delete: - # for learnware_id in curr_inds: - # easy_market.delete_learnware(learnware_id) - # self.learnware_num -= 1 - # assert len(easy_market) == self.learnware_num, f"The number of learnwares must be {self.learnware_num}!" + if delete: + for learnware_id in curr_ids: + hetero_market.delete_learnware(learnware_id) + self.learnware_num -= 1 + assert len(hetero_market) == self.learnware_num, f"The number of learnwares must be {self.learnware_num}!" - # curr_inds = easy_market.get_learnware_ids() - # print("Available ids After Deleting Learnwares:", curr_inds) - # assert len(curr_inds) == 0, f"The market should be empty!" + curr_ids = hetero_market.get_learnware_ids() + print("Available ids After Deleting Learnwares:", curr_ids) + assert len(curr_ids) == 0, f"The market should be empty!" 
- # return easy_market + return hetero_market # def test_search_semantics(self, learnware_num=5): # easy_market = self.test_upload_delete_learnware(learnware_num, delete=False) From 664463ca57d3addbd12a9fff2d032b32f4226fa2 Mon Sep 17 00:00:00 2001 From: liuht Date: Tue, 7 Nov 2023 16:09:50 +0800 Subject: [PATCH 13/90] [MNT] fix details in add_learnware and searcher --- .gitignore | 2 + .../market/hetergeneous/organizer/__init__.py | 73 +++++++++++++------ learnware/market/hetergeneous/searcher.py | 2 +- learnware/market/module.py | 2 +- learnware/specification/system/heter_table.py | 4 +- 5 files changed, 56 insertions(+), 27 deletions(-) diff --git a/.gitignore b/.gitignore index 92d2a4d..d22ea69 100644 --- a/.gitignore +++ b/.gitignore @@ -27,6 +27,7 @@ dist/ *.db *.json *.zip +*.bin # special software .pytest_cache/ @@ -43,3 +44,4 @@ tmp/ learnware_pool/ PFS/ data/ +learnware/market/hetergeneous/.learnware/* \ No newline at end of file diff --git a/learnware/market/hetergeneous/organizer/__init__.py b/learnware/market/hetergeneous/organizer/__init__.py index db0c05e..85c3e7f 100644 --- a/learnware/market/hetergeneous/organizer/__init__.py +++ b/learnware/market/hetergeneous/organizer/__init__.py @@ -1,16 +1,20 @@ from __future__ import annotations +import copy import multiprocessing import os +import tempfile +import zipfile from collections import defaultdict +from shutil import copyfile, rmtree from typing import List import pandas as pd -from ....learnware import Learnware +from ....learnware import Learnware, get_learnware_from_dirpath from ....logger import get_module_logger from ....specification.system import HeteroSpecification -from ...base import BaseUserInfo +from ...base import BaseChecker, BaseUserInfo from ...easy2 import EasyOrganizer from ..database_ops import DatabaseOperations from .config import C as conf @@ -68,27 +72,53 @@ class HeteroMapTableOrganizer(EasyOrganizer): self.training_args = kwargs def add_learnware( - self, zip_path: str, 
semantic_spec: dict, check_status: int, learnware: Learnware + self, zip_path: str, semantic_spec: dict, check_status: int, learnware_id: str = None ) -> Tuple[str, int]: - self._update_learnware_list([learnware]) - self.learnware_list[learnware.id] = learnware + logger.info("Get new learnware from %s" % (zip_path)) + + learnware_id = "%08d" % (self.count) if learnware_id is None else learnware_id + target_zip_dir = os.path.join(self.learnware_zip_pool_path, "%s.zip" % (learnware_id)) + target_folder_dir = os.path.join(self.learnware_folder_pool_path, learnware_id) + copyfile(zip_path, target_zip_dir) + + with zipfile.ZipFile(target_zip_dir, "r") as z_file: + z_file.extractall(target_folder_dir) + logger.info("Learnware move to %s, and unzip to %s" % (target_zip_dir, target_folder_dir)) + + try: + new_learnware = get_learnware_from_dirpath( + id=learnware_id, semantic_spec=semantic_spec, learnware_dirpath=target_folder_dir + ) + except: + logger.info("New Learnware Not Properly Added!!!") + try: + os.remove(target_zip_dir) + rmtree(target_folder_dir) + except: + pass + return None, BaseChecker.INVALID_LEARNWARE + + if new_learnware is None: + return None, BaseChecker.INVALID_LEARNWARE + + learnwere_status = check_status if check_status is not None else BaseChecker.NONUSABLE_LEARNWARE + + self._update_learnware_list([new_learnware]) + self.learnware_list[learnware_id] = new_learnware + self.learnware_zip_list[learnware_id] = target_zip_dir + self.learnware_folder_list[learnware_id] = target_folder_dir + self.use_flags[learnware_id] = learnwere_status self.count += 1 if self.auto_update and self.count >= self.auto_update_limit: - train_process = multiprocessing.Process(target=self.train, args=(self.learnware_list,)) + train_process = multiprocessing.Process(target=self.train, args=(self.learnware_list.values(),)) train_process.start() # train_process.join() + + return learnware_id, learnwere_status - def delete_learnware(self, id: str) -> bool: - raise 
NotImplementedError - - def update_learnware(self, learnware: Learnware): - raise NotImplementedError - - def get_learnwares(self): - return self.learnware_list - - def train(self, learnware_list: List[Learnware]): + def train(self, learnware_list: List[Learnware] = None): + learnware_list = learnware_list or self.learnware_list.values() allset = self._learnwares_to_dataframes(learnware_list) self.market_mapping = HeteroMapping(**self.training_args) market_mapping_trainer = Trainer( @@ -115,7 +145,7 @@ class HeteroMapTableOrganizer(EasyOrganizer): def _update_learnware_specification(self, learnware: Learnware, save_path: str) -> Learnware: specification = learnware.specification - learnware_rkme = specification.get_stat_spec()["RKMEStatSpecification"] + learnware_rkme = specification.get_stat_spec()["RKMETableSpecification"] learnware_features = specification.get_semantic_spec()["Input"]["Description"].values() learnware_hetero_spec = self.market_mapping.hetero_mapping(learnware_rkme, learnware_features) learnware.update_stat_spec("HeteroSpecification", learnware_hetero_spec) @@ -124,7 +154,7 @@ class HeteroMapTableOrganizer(EasyOrganizer): learnware_hetero_spec.save(save_path) def generate_hetero_map_spec(self, user_info: BaseUserInfo) -> HeteroSpecification: - user_rkme = user_info.stat_info["RKMEStatSpecification"] + user_rkme = user_info.stat_info["RKMETableSpecification"] user_features = user_info.semantic_spec["Input"]["Description"].values() user_hetero_spec = self.market_mapping.hetero_mapping(user_rkme, user_features) return user_hetero_spec @@ -133,7 +163,7 @@ class HeteroMapTableOrganizer(EasyOrganizer): learnware_df_dict = defaultdict(list) for learnware in learnware_list: specification = learnware.get_specification() - learnware_rkme = specification.get_stat_spec()["RKMEStatSpecification"] + learnware_rkme = specification.get_stat_spec()["RKMETableSpecification"] learnware_features = specification.get_semantic_spec()["Input"]["Description"] 
learnware_df = pd.DataFrame(data=learnware_rkme.get_z(), columns=learnware_features.values()) @@ -143,7 +173,4 @@ class HeteroMapTableOrganizer(EasyOrganizer): return merged_dfs def save(self, save_path): - return NotImplementedError - - def __len__(self): - return len(self.learnware_list) + return NotImplementedError \ No newline at end of file diff --git a/learnware/market/hetergeneous/searcher.py b/learnware/market/hetergeneous/searcher.py index d70b06e..90bbefd 100644 --- a/learnware/market/hetergeneous/searcher.py +++ b/learnware/market/hetergeneous/searcher.py @@ -17,7 +17,7 @@ class HeteroMapTableSearcher(BaseSearcher): learnware_list = self.learnware_oganizer.get_learnwares() target_learnware, min_dist = None, None user_hetero_spec = self.learnware_oganizer.generate_hetero_map_spec(user_info) - for learnware in learnware_list.values(): + for learnware in learnware_list: learnware_hetero_spec = learnware.specification.get_stat_spec_by_name("HeteroSpecification") mmd_dist = learnware_hetero_spec.dist(user_hetero_spec) if target_learnware is None or mmd_dist < min_dist: diff --git a/learnware/market/module.py b/learnware/market/module.py index 43499ec..f60e06a 100644 --- a/learnware/market/module.py +++ b/learnware/market/module.py @@ -11,7 +11,7 @@ MARKET_CONFIG = { "hetero": { "organizer": HeteroMapTableOrganizer(), "searcher": HeteroMapTableSearcher(), - "checker_list": [] + "checker_list": [EasySemanticChecker(), EasyStatChecker()] } } diff --git a/learnware/specification/system/heter_table.py b/learnware/specification/system/heter_table.py index e611ca1..f721d8f 100644 --- a/learnware/specification/system/heter_table.py +++ b/learnware/specification/system/heter_table.py @@ -8,7 +8,7 @@ import os import numpy as np import torch -from ..regular.table import RKMEStatSpecification +from ..regular import RKMETableSpecification from ..regular.table.rkme import choose_device, setup_seed, torch_rbf_kernel from .base import SystemStatsSpecification @@ -34,7 +34,7 
@@ class HeteroSpecification(SystemStatsSpecification): def get_beta(self) -> np.ndarray: return self.beta.detach().cpu().numpy - def generate_stat_spec_from_system(self, heter_embedding: np.ndarray, rkme_spec: RKMEStatSpecification): + def generate_stat_spec_from_system(self, heter_embedding: np.ndarray, rkme_spec: RKMETableSpecification): self.beta = rkme_spec.beta.to(self.device) self.z = torch.from_numpy(heter_embedding).double().to(self.device) From 75c86fca170d23da1d9582cc30187ab431ed3d75 Mon Sep 17 00:00:00 2001 From: liuht Date: Tue, 7 Nov 2023 21:49:01 +0800 Subject: [PATCH 14/90] [MNT] delete_learnware, buggy auto_update --- .../market/hetergeneous/organizer/__init__.py | 86 ++++++++++++------- .../organizer/hetero_mapping/__init__.py | 9 +- .../hetero_mapping/feature_extractor.py | 2 +- .../organizer/hetero_mapping/trainer.py | 19 ++-- 4 files changed, 71 insertions(+), 45 deletions(-) diff --git a/learnware/market/hetergeneous/organizer/__init__.py b/learnware/market/hetergeneous/organizer/__init__.py index 85c3e7f..8356f0d 100644 --- a/learnware/market/hetergeneous/organizer/__init__.py +++ b/learnware/market/hetergeneous/organizer/__init__.py @@ -10,6 +10,8 @@ from shutil import copyfile, rmtree from typing import List import pandas as pd +from torch import nn +import torch.multiprocessing as mp from ....learnware import Learnware, get_learnware_from_dirpath from ....logger import get_module_logger @@ -24,7 +26,7 @@ logger = get_module_logger("hetero_market") class HeteroMapTableOrganizer(EasyOrganizer): - def reload_market(self, rebuild=False, auto_update_limit=50): + def reload_market(self, rebuild=False, auto_update_limit=100): self.market_store_path = os.path.join(conf.market_root_path, self.market_id) self.market_mapping_path = os.path.join(self.market_store_path, conf.market_model_path) self.learnware_pool_path = os.path.join(self.market_store_path, "learnware_pool") @@ -34,9 +36,21 @@ class HeteroMapTableOrganizer(EasyOrganizer): 
self.learnware_zip_list = {} self.learnware_folder_list = {} self.count = 0 + self.last_trained_learnware_num = 0 self.dbops = DatabaseOperations(conf.database_url, "market_" + self.market_id) self.auto_update = False self.auto_update_limit = auto_update_limit + self.auto_update_lock = mp.Lock() + self.is_training_in_progress = mp.Value('i', 0) + + if rebuild: + logger.warning("Warning! You are trying to clear current database!") + try: + self.dbops.clear_learnware_table() + rmtree(self.learnware_pool_path) + except Exception as err: + logger.warning(f"Clear current database failed due to {err}!!") + pass os.makedirs(self.learnware_pool_path, exist_ok=True) os.makedirs(self.learnware_zip_pool_path, exist_ok=True) @@ -49,27 +63,20 @@ class HeteroMapTableOrganizer(EasyOrganizer): self.count, ) = self.dbops.load_market() - if rebuild: - logger.warning("Warning! You are trying to clear current database!") - try: - self.dbops.clear_learnware_table() - rmtree(self.learnware_pool_path) - except: - pass + if os.path.exists(self.market_mapping_path): + logger.info(f"Loading Market Mapping from Default Checkpoint {self.market_mapping_path}") + self.market_mapping = HeteroMapping.load(checkpoint=self.market_store_path) + # self._update_learnware_list(self.learnware_list) else: - if os.path.exists(self.market_mapping_path): - logger.info(f"Loading Market Mapping from Default Checkpoint {self.market_mapping_path}") - self.market_mapping = HeteroMapping.load(checkpoint=self.market_store_path) - # self._update_learnware_list(self.learnware_list) - else: - logger.warning(f"No Existing Market Mapping!!") - self.market_mapping = HeteroMapping() - - def reset(self, market_id=None, auto_update=False, **kwargs): + logger.warning(f"No Existing Market Mapping!!") + self.market_mapping = HeteroMapping() + + def reset(self, market_id=None, auto_update=False, auto_update_limit=None, **kwargs): # model training arguments(model architecture + optimization) set via self.reset self.auto_update 
= auto_update self.market_id = market_id self.training_args = kwargs + if auto_update_limit is not None: self.auto_update_limit = auto_update_limit def add_learnware( self, zip_path: str, semantic_spec: dict, check_status: int, learnware_id: str = None @@ -103,22 +110,34 @@ class HeteroMapTableOrganizer(EasyOrganizer): learnwere_status = check_status if check_status is not None else BaseChecker.NONUSABLE_LEARNWARE + self.dbops.add_learnware( + id=learnware_id, + semantic_spec=semantic_spec, + zip_path=target_zip_dir, + folder_path=target_folder_dir, + use_flag=learnwere_status, + ) + self._update_learnware_list([new_learnware]) self.learnware_list[learnware_id] = new_learnware self.learnware_zip_list[learnware_id] = target_zip_dir self.learnware_folder_list[learnware_id] = target_folder_dir self.use_flags[learnware_id] = learnwere_status - self.count += 1 - - if self.auto_update and self.count >= self.auto_update_limit: - train_process = multiprocessing.Process(target=self.train, args=(self.learnware_list.values(),)) - train_process.start() - # train_process.join() + self.count += 1 + + with self.auto_update_lock: + if self.auto_update and not self.is_training_in_progress.value and self.count - self.last_trained_learnware_num >= self.auto_update_limit: + self.is_training_in_progress.value = 1 + curr_learnware_list = copy.deepcopy(self.learnware_list) + train_process = mp.Process(target=self.train, args=(curr_learnware_list.values(),)) + train_process.start() + # train_process.join() return learnware_id, learnwere_status def train(self, learnware_list: List[Learnware] = None): learnware_list = learnware_list or self.learnware_list.values() + logger.warning(f"Leanwares for training: {[learnware.id for learnware in learnware_list]}") allset = self._learnwares_to_dataframes(learnware_list) self.market_mapping = HeteroMapping(**self.training_args) market_mapping_trainer = Trainer( @@ -134,14 +153,23 @@ class HeteroMapTableOrganizer(EasyOrganizer): # essential 
hetero-mapping update for each market learnware when market model retrained self._update_learnware_list(learnware_list) + self.last_trained_learnware_num = self.count + + logger.warning(f"Updataed Specification For: {[learnware.id for learnware in learnware_list]}") + + with self.auto_update_lock: + self.is_training_in_progress.value = 0 def _update_learnware_list(self, learnware_list: List[Learnware]): - hetero_mappings_save_path = os.path.join(self.market_store_path, "hetero_mappings") - os.makedirs(hetero_mappings_save_path, exist_ok=True) - for learnware in learnware_list: - learnware.id = learnware.id.replace(",", "_") - hetero_spec_path = os.path.join(hetero_mappings_save_path, f"{learnware.id}.npy") - self._update_learnware_specification(learnware, save_path=hetero_spec_path) + try: + hetero_mappings_save_path = os.path.join(self.market_store_path, "hetero_mappings") + os.makedirs(hetero_mappings_save_path, exist_ok=True) + for learnware in learnware_list: + hetero_spec_path = os.path.join(hetero_mappings_save_path, f"{learnware.id}.npy") + self._update_learnware_specification(learnware, save_path=hetero_spec_path) + logger.info(f"Learnware {learnware.id} HeteroSpecification Successfully Saved") + except Exception as err: + logger.warning(f"Update learnware HeteroSpecification failed! Due to {err}") def _update_learnware_specification(self, learnware: Learnware, save_path: str) -> Learnware: specification = learnware.specification diff --git a/learnware/market/hetergeneous/organizer/hetero_mapping/__init__.py b/learnware/market/hetergeneous/organizer/hetero_mapping/__init__.py index 8a992d5..f0e9de5 100644 --- a/learnware/market/hetergeneous/organizer/hetero_mapping/__init__.py +++ b/learnware/market/hetergeneous/organizer/hetero_mapping/__init__.py @@ -60,6 +60,7 @@ class HeteroMapping(nn.Module): device=device, ) + ##todo: BUG!!!!!! 
self.encoder = TransformerMultiLayer( hidden_dim=hidden_dim, num_layer=num_layer, @@ -69,9 +70,6 @@ class HeteroMapping(nn.Module): activation=activation, ) self.cls_token = CLSToken(hidden_dim=hidden_dim) - self.device = device - self.to(device) - self.collate_fn = TransTabCollatorForCL( feature_tokenizer=feature_tokenizer, overlap_ratio=overlap_ratio, num_partition=num_partition ) @@ -103,7 +101,7 @@ class HeteroMapping(nn.Module): # load model weight state dict market_model_path = os.path.join(checkpoint, conf.market_model_path) model_info = torch.load(market_model_path, map_location="cpu") - model = HeteroMapping(feature_tokenizer=model_info["feature_tokenizer"], **model_info["model_args"]) + model = HeteroMapping(**model_info["model_args"]) model.load_state_dict(model_info["model_state_dict"], strict=False) return model # self.feature_tokenizer.load(checkpoint) @@ -126,7 +124,7 @@ class HeteroMapping(nn.Module): model_info = { "model_state_dict": self.state_dict(), "model_args": self.model_args, - "feature_tokenizer": self.feature_tokenizer, + # "feature_tokenizer": self.feature_tokenizer, } torch.save(model_info, os.path.join(ckpt_dir, conf.market_model_path)) @@ -407,6 +405,7 @@ class TransformerMultiLayer(nn.Module): use_layer_norm=True, activation=activation, ) + ##todo: BUG!!!!!! 
stacked_transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layer - 1) self.transformer_encoder.append(stacked_transformer) diff --git a/learnware/market/hetergeneous/organizer/hetero_mapping/feature_extractor.py b/learnware/market/hetergeneous/organizer/hetero_mapping/feature_extractor.py index f2bbe49..55d5805 100644 --- a/learnware/market/hetergeneous/organizer/hetero_mapping/feature_extractor.py +++ b/learnware/market/hetergeneous/organizer/hetero_mapping/feature_extractor.py @@ -68,7 +68,7 @@ class FeatureTokenizer: def __init__( self, - disable_tokenizer_parallel=False, + disable_tokenizer_parallel=True, **kwargs, ) -> None: """args: diff --git a/learnware/market/hetergeneous/organizer/hetero_mapping/trainer.py b/learnware/market/hetergeneous/organizer/hetero_mapping/trainer.py index 5845c36..695d96e 100644 --- a/learnware/market/hetergeneous/organizer/hetero_mapping/trainer.py +++ b/learnware/market/hetergeneous/organizer/hetero_mapping/trainer.py @@ -56,7 +56,7 @@ class Trainer: self.args["steps_per_epoch"] = int(self.args["num_training_steps"] / (num_epoch * len(self.train_set_list))) self.optimizer = None - def train(self): + def train(self, verbose: bool = True): self._create_optimizer() start_time = time.time() final_train_loss = 0 @@ -72,18 +72,17 @@ class Trainer: train_loss_all += loss.item() ite += 1 - logger.info( - "epoch: {}, train loss: {:.4f}, lr: {:.6f}, spent: {:.1f} secs".format( - epoch, - train_loss_all, - self.optimizer.param_groups[0]["lr"], - time.time() - start_time, + if verbose: + logger.info( + "epoch: {}, train loss: {:.4f}, lr: {:.6f}, spent: {:.1f} secs".format( + epoch, + train_loss_all, + self.optimizer.param_groups[0]["lr"], + time.time() - start_time, + ) ) - ) final_train_loss = train_loss_all - # self.save_model(self.output_dir) - logger.info("training complete, cost {:.1f} secs.".format(time.time() - start_time)) return final_train_loss From fd5b7a203c1d60e2100de050ac8ba158c935879e Mon Sep 17 00:00:00 2001 
From: Peng Tan Date: Tue, 7 Nov 2023 22:07:28 +0800 Subject: [PATCH 15/90] [MNT] adjustment due to learnware package update --- .../test_hetero_market/test_hetero.py | 41 ++++++++++--------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/tests/test_market/test_hetero_market/test_hetero.py b/tests/test_market/test_hetero_market/test_hetero.py index 5d08ea7..fe7080f 100644 --- a/tests/test_market/test_hetero_market/test_hetero.py +++ b/tests/test_market/test_hetero_market/test_hetero.py @@ -7,7 +7,6 @@ import zipfile import numpy as np from sklearn.linear_model import Ridge from sklearn.datasets import make_regression -from sklearn.datasets import load_digits from shutil import copyfile, rmtree from multiprocessing import Pool from learnware.client import LearnwareClient @@ -20,7 +19,7 @@ from example_learnwares.config import input_shape_list curr_root = os.path.dirname(os.path.abspath(__file__)) user_semantic = { - "Data": {"Values": ["Image"], "Type": "Class"}, + "Data": {"Values": ["Table"], "Type": "Class"}, "Task": { "Values": ["Classification"], "Type": "Class", @@ -29,15 +28,8 @@ user_semantic = { "Scenario": {"Values": ["Education"], "Type": "Tag"}, "Description": {"Values": "", "Type": "String"}, "Name": {"Values": "", "Type": "String"}, - "Output": { - "Dimension": 10, - "Description": { - "0": "the probability of the label is zero", - }, - }, } - def check_learnware(learnware_name, dir_path=os.path.join(curr_root, "learnware_pool")): print(f"Checking Learnware: {learnware_name}") zip_file_path = os.path.join(dir_path, learnware_name) @@ -64,7 +56,6 @@ class TestMarket(unittest.TestCase): def test_prepare_learnware_randomly(self, learnware_num=5): self.zip_path_list = [] - X, y = load_digits(return_X_y=True) for i in range(learnware_num): dir_path = os.path.join(curr_root, "learnware_pool", "ridge_%d" % (i)) @@ -83,7 +74,7 @@ class TestMarket(unittest.TestCase): joblib.dump(clf, os.path.join(dir_path, "ridge.pkl")) - spec = 
specification.utils.generate_rkme_spec(X=X, gamma=0.1, cuda_idx=0) + spec = specification.generate_rkme_spec(X=X, gamma=0.1, cuda_idx=0) spec.save(os.path.join(dir_path, "stat.json")) init_file = os.path.join(dir_path, "__init__.py") @@ -110,6 +101,8 @@ class TestMarket(unittest.TestCase): rmtree(dir_path) # rm -r dir_path + self.zip_path_list.append(zip_file) + def test_generated_learnwares(self): curr_root = os.path.dirname(os.path.abspath(__file__)) dir_path = os.path.join(curr_root, "learnware_pool") @@ -133,24 +126,34 @@ class TestMarket(unittest.TestCase): semantic_spec = copy.deepcopy(user_semantic) semantic_spec["Name"]["Values"] = "learnware_%d" % (idx) semantic_spec["Description"]["Values"] = "test_learnware_number_%d" % (idx) + semantic_spec["Input"] = { + "Dimension": 64, + "Description": { + f"{i}": f"The value in the grid {i // 8}{i % 8} of the image of hand-written digit." + for i in range(64) + }, + } + semantic_spec["Output"] = { + "Dimension": 10, + "Description": {f"{i}": "The probability for each digit for 0 to 9." for i in range(10)}, + } hetero_market.add_learnware(zip_path, semantic_spec) print("Total Item:", len(hetero_market)) assert len(hetero_market) == self.learnware_num, f"The number of learnwares must be {self.learnware_num}!" - - curr_ids = hetero_market.get_learnware_ids() - print("Available ids After Uploading Learnwares:", curr_ids) - assert len(curr_ids) == self.learnware_num, f"The number of learnwares must be {self.learnware_num}!" + curr_inds = hetero_market.get_learnware_ids() + print("Available ids After Uploading Learnwares:", curr_inds) + assert len(curr_inds) == self.learnware_num, f"The number of learnwares must be {self.learnware_num}!" if delete: - for learnware_id in curr_ids: + for learnware_id in curr_inds: hetero_market.delete_learnware(learnware_id) self.learnware_num -= 1 assert len(hetero_market) == self.learnware_num, f"The number of learnwares must be {self.learnware_num}!" 
- curr_ids = hetero_market.get_learnware_ids() - print("Available ids After Deleting Learnwares:", curr_ids) - assert len(curr_ids) == 0, f"The market should be empty!" + curr_inds = hetero_market.get_learnware_ids() + print("Available ids After Deleting Learnwares:", curr_inds) + assert len(curr_inds) == 0, f"The market should be empty!" return hetero_market From 19fc3fa8a0c37034e214ab1227ce2f2009afb4d9 Mon Sep 17 00:00:00 2001 From: Peng Tan Date: Wed, 8 Nov 2023 14:42:46 +0800 Subject: [PATCH 16/90] [ENH] add upload_delete_learnware test --- .../example_learnwares/config.py | 82 ++++++++++++++++++- .../test_hetero_market/test_hetero.py | 21 ++--- 2 files changed, 87 insertions(+), 16 deletions(-) diff --git a/tests/test_market/test_hetero_market/example_learnwares/config.py b/tests/test_market/test_hetero_market/example_learnwares/config.py index 6c8459a..941109a 100644 --- a/tests/test_market/test_hetero_market/example_learnwares/config.py +++ b/tests/test_market/test_hetero_market/example_learnwares/config.py @@ -1 +1,81 @@ -input_shape_list=[20, 30] # 20-input shape of example learnware 0, 30-input shape of example learnware 1 \ No newline at end of file +input_shape_list=[20, 30] # 20-input shape of example learnware 0, 30-input shape of example learnware 1 + +input_description_list=[ + { + "Dimension": 20, + "Description": { # medical description + "0": "baseline value: Baseline Fetal Heart Rate (FHR)", + "1": "accelerations: Number of accelerations per second", + "2": "fetal_movement: Number of fetal movements per second", + "3": "uterine_contractions: Number of uterine contractions per second", + "4": "light_decelerations: Number of LDs per second", + "5": "severe_decelerations: Number of SDs per second", + "6": "prolongued_decelerations: Number of PDs per second", + "7": "abnormal_short_term_variability: Percentage of time with abnormal short term variability", + "8": "mean_value_of_short_term_variability: Mean value of short term variability", + "9": 
"percentage_of_time_with_abnormal_long_term_variability: Percentage of time with abnormal long term variability", + "10": "mean_value_of_long_term_variability: Mean value of long term variability", + "11": "histogram_width: Width of the histogram made using all values from a record", + "12": "histogram_min: Histogram minimum value", + "13": "histogram_max: Histogram maximum value", + "14": "histogram_number_of_peaks: Number of peaks in the exam histogram", + "15": "histogram_number_of_zeroes: Number of zeroes in the exam histogram", + "16": "histogram_mode: Hist mode", + "17": "histogram_mean: Hist mean", + "18": "histogram_median: Hist Median", + "19": "histogram_variance: Hist variance" + }, + }, + { + "Dimension": 30, + "Description": { # business description + "0": "This is a consecutive month number, used for convenience. For example, January 2013 is 0, February 2013 is 1,..., October 2015 is 33.", + "1": "This is the unique identifier for each shop.", + "2": "This is the unique identifier for each item.", + "3": "This is the code representing the city where the shop is located.", + "4": "This is the unique identifier for the category of the item.", + "5": "This is the code representing the type of the item.", + "6": "This is the code representing the subtype of the item.", + "7": "This is the number of this type of item sold in the shop one month ago.", + "8": "This is the number of this type of item sold in the shop two months ago.", + "9": "This is the number of this type of item sold in the shop three months ago.", + "10": "This is the number of this type of item sold in the shop six months ago.", + "11": "This is the number of this type of item sold in the shop twelve months ago.", + "12": "This is the average count of items sold one month ago.", + "13": "This is the average count of this type of item sold one month ago.", + "14": "This is the average count of this type of item sold two months ago.", + "15": "This is the average count of this type of item 
sold three months ago.", + "16": "This is the average count of this type of item sold six months ago.", + "17": "This is the average count of this type of item sold twelve months ago.", + "18": "This is the average count of items sold in the shop one month ago.", + "19": "This is the average count of items sold in the shop two months ago.", + "20": "This is the average count of items sold in the shop three months ago.", + "21": "This is the average count of items sold in the shop six months ago.", + "22": "This is the average count of items sold in the shop twelve months ago.", + "23": "This is the average count of items in the same category sold one month ago.", + "24": "This is the average count of items in the same category sold in the shop one month ago.", + "25": "This is the average count of items of the same type sold in the shop one month ago.", + "26": "This is the average count of items of the same subtype sold in the shop one month ago.", + "27": "This is the average count of items sold in the same city one month ago.", + "28": "This is the average count of this type of item sold in the same city one month ago.", + "29": "This is the average count of items of the same type sold one month ago." 
+ }, + }, + +] + +output_description_list=[ + { + "Dimension": 1, + "Description": { # medical description + "0": "length of stay: Length of hospital stay (days)" + }, + }, + { + "Dimension": 1, + "Description": { # business description + "0": "sales of the item in the next day: Number of items sold in the next day" + }, + }, + +] \ No newline at end of file diff --git a/tests/test_market/test_hetero_market/test_hetero.py b/tests/test_market/test_hetero_market/test_hetero.py index fe7080f..981bb7d 100644 --- a/tests/test_market/test_hetero_market/test_hetero.py +++ b/tests/test_market/test_hetero_market/test_hetero.py @@ -14,7 +14,7 @@ from learnware.client import LearnwareClient import learnware from learnware.market import instantiate_learnware_market, BaseUserInfo import learnware.specification as specification -from example_learnwares.config import input_shape_list +from example_learnwares.config import input_shape_list, input_description_list, output_description_list curr_root = os.path.dirname(os.path.abspath(__file__)) @@ -126,17 +126,8 @@ class TestMarket(unittest.TestCase): semantic_spec = copy.deepcopy(user_semantic) semantic_spec["Name"]["Values"] = "learnware_%d" % (idx) semantic_spec["Description"]["Values"] = "test_learnware_number_%d" % (idx) - semantic_spec["Input"] = { - "Dimension": 64, - "Description": { - f"{i}": f"The value in the grid {i // 8}{i % 8} of the image of hand-written digit." - for i in range(64) - }, - } - semantic_spec["Output"] = { - "Dimension": 10, - "Description": {f"{i}": "The probability for each digit for 0 to 9." 
for i in range(10)}, - } + semantic_spec["Input"] = input_description_list[idx % 2] + semantic_spec["Output"] = output_description_list[idx % 2] hetero_market.add_learnware(zip_path, semantic_spec) print("Total Item:", len(hetero_market)) @@ -226,9 +217,9 @@ class TestMarket(unittest.TestCase): def suite(): _suite = unittest.TestSuite() - _suite.addTest(TestMarket("test_prepare_learnware_randomly")) - _suite.addTest(TestMarket("test_generated_learnwares")) - # _suite.addTest(TestMarket("test_upload_delete_learnware")) + # _suite.addTest(TestMarket("test_prepare_learnware_randomly")) + # _suite.addTest(TestMarket("test_generated_learnwares")) + _suite.addTest(TestMarket("test_upload_delete_learnware")) # _suite.addTest(TestMarket("test_search_semantics")) # _suite.addTest(TestMarket("test_stat_search")) return _suite From 65043ebe8db9cc9ef23d644c2d3625f6763b0c38 Mon Sep 17 00:00:00 2001 From: liuht Date: Wed, 8 Nov 2023 15:03:27 +0800 Subject: [PATCH 17/90] [MNT] enhance HeteroSearcher --- learnware/market/__init__.py | 2 +- learnware/market/hetergeneous/__init__.py | 2 +- .../market/hetergeneous/organizer/__init__.py | 1 - learnware/market/hetergeneous/searcher.py | 121 ++++++++++++++++-- learnware/market/module.py | 4 +- 5 files changed, 111 insertions(+), 19 deletions(-) diff --git a/learnware/market/__init__.py b/learnware/market/__init__.py index f30f0dc..42d7eb0 100644 --- a/learnware/market/__init__.py +++ b/learnware/market/__init__.py @@ -3,7 +3,7 @@ from .base import BaseUserInfo, LearnwareMarket, BaseChecker, BaseOrganizer, Bas from .evolve_anchor import EvolvedAnchoredOrganizer from .evolve import EvolvedOrganizer from .easy2 import EasyOrganizer, EasySearcher, EasySemanticChecker, EasyStatChecker -from .hetergeneous import HeteroMapTableOrganizer, HeteroMapTableSearcher +from .hetergeneous import HeteroMapTableOrganizer, HeteroSearcher from .easy import EasyMarket from .classes import CondaChecker diff --git a/learnware/market/hetergeneous/__init__.py 
b/learnware/market/hetergeneous/__init__.py index dc6608d..e7ab916 100644 --- a/learnware/market/hetergeneous/__init__.py +++ b/learnware/market/hetergeneous/__init__.py @@ -1,2 +1,2 @@ from .organizer import HeteroMapTableOrganizer -from .searcher import HeteroMapTableSearcher +from .searcher import HeteroSearcher diff --git a/learnware/market/hetergeneous/organizer/__init__.py b/learnware/market/hetergeneous/organizer/__init__.py index 8356f0d..671aa50 100644 --- a/learnware/market/hetergeneous/organizer/__init__.py +++ b/learnware/market/hetergeneous/organizer/__init__.py @@ -167,7 +167,6 @@ class HeteroMapTableOrganizer(EasyOrganizer): for learnware in learnware_list: hetero_spec_path = os.path.join(hetero_mappings_save_path, f"{learnware.id}.npy") self._update_learnware_specification(learnware, save_path=hetero_spec_path) - logger.info(f"Learnware {learnware.id} HeteroSpecification Successfully Saved") except Exception as err: logger.warning(f"Update learnware HeteroSpecification failed! 
Due to {err}") diff --git a/learnware/market/hetergeneous/searcher.py b/learnware/market/hetergeneous/searcher.py index 90bbefd..6b1aa16 100644 --- a/learnware/market/hetergeneous/searcher.py +++ b/learnware/market/hetergeneous/searcher.py @@ -1,29 +1,122 @@ -from typing import List +from typing import Tuple, List, Union from ...learnware import Learnware from ...logger import get_module_logger +from ...specification import HeteroSpecification from ..base import BaseSearcher, BaseUserInfo +from ..easy2 import EasySearcher +from ..utils import parse_specification_type from .organizer import HeteroMapTableOrganizer logger = get_module_logger("hetero_searcher") -class HeteroMapTableSearcher(BaseSearcher): - def __init__(self, organizer: HeteroMapTableOrganizer = None): - super(HeteroMapTableSearcher, self).__init__(organizer) +class HeteroMapTableSearcher(EasySearcher): + def _convert_dist_to_score( + self, dist_list: List[float], dist_epsilon: float = 0.01, min_score: float = 0.92 + ) -> List[float]: + if len(dist_list) == 0: + return [] + + min_dist, max_dist = min(dist_list), max(dist_list) + if min_dist == max_dist: + return [1 for dist in dist_list] + else: + max_score = (max_dist - min_dist) / (max_dist - dist_epsilon) + + if min_dist < dist_epsilon: + dist_epsilon = min_dist + elif max_score < min_score: + dist_epsilon = max_dist - (max_dist - min_dist) / min_score + + return [(max_dist - dist) / (max_dist - dist_epsilon) for dist in dist_list] + + def _search_by_hetero_spec_single( + self, + learnware_list: List[Learnware], + user_hetero_spec: HeteroSpecification + ) -> Tuple[List[float], List[Learnware]]: + hetero_spec_list = [learnware.specification.get_stat_spec_by_name("HeteroSpecification") for learnware in learnware_list] + mmd_dist_list = [] + for hetero_spec in hetero_spec_list: + mmd_dist = hetero_spec.dist(user_hetero_spec) + mmd_dist_list.append(mmd_dist) + + sorted_idx_list = sorted(range(len(learnware_list)), key=lambda k: mmd_dist_list[k]) + 
sorted_dist_list = [mmd_dist_list[idx] for idx in sorted_idx_list] + sorted_learnware_list = [learnware_list[idx] for idx in sorted_idx_list] - def __call__(self, user_info: BaseUserInfo, check_status: int = None) -> Learnware: + return sorted_dist_list, sorted_learnware_list + + def _filter_by_hetero_spec_single( + self, + sorted_score_list: List[float], + learnware_list: List[Learnware], + filter_score: float = 0.5, + min_num: int = 5 + ) -> Tuple[List[float], List[Learnware]]: + idx = min(min_num, len(learnware_list)) + while idx < len(learnware_list): + if sorted_score_list[idx] < filter_score: + break + idx += 1 + return sorted_score_list[:idx], learnware_list[:idx] + + + def __call__( + self, + learnware_list: List[Learnware], + user_info: BaseUserInfo, + ) -> Tuple[List[float], List[Learnware], float, List[Learnware]]: # todo: use specially assigned search_gamma for calculating mmd dist - learnware_list = self.learnware_oganizer.get_learnwares() - target_learnware, min_dist = None, None user_hetero_spec = self.learnware_oganizer.generate_hetero_map_spec(user_info) - for learnware in learnware_list: - learnware_hetero_spec = learnware.specification.get_stat_spec_by_name("HeteroSpecification") - mmd_dist = learnware_hetero_spec.dist(user_hetero_spec) - if target_learnware is None or mmd_dist < min_dist: - min_dist = mmd_dist - target_learnware = learnware - return target_learnware + logger.info(f"After semantic search, learnware_list length is {len(learnware_list)}") + + sorted_dist_list, single_learnware_list = self._search_by_hetero_spec_single(learnware_list, user_hetero_spec) + sorted_score_list = self._convert_dist_to_score(sorted_dist_list) + + logger.info(f"After search by hetero spec, learnware_list length is {len(single_learnware_list)}") + sorted_score_list, single_learnware_list = self._filter_by_hetero_spec_single( + sorted_score_list, single_learnware_list + ) + + logger.info(f"After filter by hetero spec, learnware_list length is 
{len(single_learnware_list)}") + return sorted_score_list, single_learnware_list, None, None + + + # for learnware in learnware_list: + # learnware_hetero_spec = learnware.specification.get_stat_spec_by_name("HeteroSpecification") + # mmd_dist = learnware_hetero_spec.dist(user_hetero_spec) + # if target_learnware is None or mmd_dist < min_dist: + # min_dist = mmd_dist + # target_learnware = learnware + # return target_learnware def reset(self, organizer): self.learnware_oganizer = organizer + +class HeteroSearcher(EasySearcher): + def __init__(self, organizer: HeteroMapTableOrganizer = None): + super(HeteroSearcher, self).__init__(organizer) + self.hetero_stat_searcher = HeteroMapTableSearcher(organizer) + + def reset(self, organizer): + super().reset(organizer) + self.hetero_stat_searcher.reset(organizer) + + def __call__( + self, user_info: BaseUserInfo, check_status: int = None, max_search_num: int = 5, search_method: str = "greedy" + ) -> Tuple[List[float], List[Learnware], float, List[Learnware]]: + learnware_list = self.learnware_organizer.get_learnwares(check_status=check_status) + learnware_list = self.semantic_searcher(learnware_list, user_info) + + if len(learnware_list) == 0: + return [], [], 0.0, [] + + if parse_specification_type(stat_specs=user_info.stat_info) is not None: + if user_info.semantic_spec["Input"]["Description"] is not None: + return self.hetero_stat_searcher(learnware_list, user_info) + else: + return self.stat_searcher(learnware_list, user_info, max_search_num, search_method) + else: + return None, learnware_list, 0.0, None \ No newline at end of file diff --git a/learnware/market/module.py b/learnware/market/module.py index f60e06a..c933acb 100644 --- a/learnware/market/module.py +++ b/learnware/market/module.py @@ -1,6 +1,6 @@ from .base import LearnwareMarket from .easy2 import EasyOrganizer, EasySearcher, EasySemanticChecker, EasyStatChecker -from .hetergeneous import HeteroMapTableOrganizer, HeteroMapTableSearcher +from 
.hetergeneous import HeteroMapTableOrganizer, HeteroSearcher MARKET_CONFIG = { "easy": { @@ -10,7 +10,7 @@ MARKET_CONFIG = { }, "hetero": { "organizer": HeteroMapTableOrganizer(), - "searcher": HeteroMapTableSearcher(), + "searcher": HeteroSearcher(), "checker_list": [EasySemanticChecker(), EasyStatChecker()] } } From 59e807b4937812338e89fd0607943d5220921cdd Mon Sep 17 00:00:00 2001 From: Peng Tan Date: Wed, 8 Nov 2023 15:12:28 +0800 Subject: [PATCH 18/90] [ENH] add test_train_market_model --- .../test_hetero_market/test_hetero.py | 28 ++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/tests/test_market/test_hetero_market/test_hetero.py b/tests/test_market/test_hetero_market/test_hetero.py index 981bb7d..1ad854f 100644 --- a/tests/test_market/test_hetero_market/test_hetero.py +++ b/tests/test_market/test_hetero_market/test_hetero.py @@ -147,6 +147,31 @@ class TestMarket(unittest.TestCase): assert len(curr_inds) == 0, f"The market should be empty!" return hetero_market + + def test_train_market_model(self, learnware_num=5): + hetero_market = self._init_learnware_market() + self.test_prepare_learnware_randomly(learnware_num) + self.learnware_num = learnware_num + + print("Total Item:", len(hetero_market)) + assert len(hetero_market) == 0, f"The market should be empty!" + + for idx, zip_path in enumerate(self.zip_path_list): + semantic_spec = copy.deepcopy(user_semantic) + semantic_spec["Name"]["Values"] = "learnware_%d" % (idx) + semantic_spec["Description"]["Values"] = "test_learnware_number_%d" % (idx) + semantic_spec["Input"] = input_description_list[idx % 2] + semantic_spec["Output"] = output_description_list[idx % 2] + hetero_market.add_learnware(zip_path, semantic_spec) + + print("Total Item:", len(hetero_market)) + assert len(hetero_market) == self.learnware_num, f"The number of learnwares must be {self.learnware_num}!" 
+ curr_inds = hetero_market.get_learnware_ids() + print("Available ids After Uploading Learnwares:", curr_inds) + assert len(curr_inds) == self.learnware_num, f"The number of learnwares must be {self.learnware_num}!" + + organizer=hetero_market.learnware_organizer + organizer.train() # def test_search_semantics(self, learnware_num=5): # easy_market = self.test_upload_delete_learnware(learnware_num, delete=False) @@ -219,7 +244,8 @@ def suite(): _suite = unittest.TestSuite() # _suite.addTest(TestMarket("test_prepare_learnware_randomly")) # _suite.addTest(TestMarket("test_generated_learnwares")) - _suite.addTest(TestMarket("test_upload_delete_learnware")) + # _suite.addTest(TestMarket("test_upload_delete_learnware")) + _suite.addTest(TestMarket("test_train_market_model")) # _suite.addTest(TestMarket("test_search_semantics")) # _suite.addTest(TestMarket("test_stat_search")) return _suite From 29454fa39ecf031cf6a1932d8e1a59722948a2a0 Mon Sep 17 00:00:00 2001 From: liuht Date: Wed, 8 Nov 2023 15:31:24 +0800 Subject: [PATCH 19/90] [FIX] fix hetero market root path --- .../market/hetergeneous/organizer/__init__.py | 9 +++++---- .../market/hetergeneous/organizer/config.py | 17 ++++++++--------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/learnware/market/hetergeneous/organizer/__init__.py b/learnware/market/hetergeneous/organizer/__init__.py index 671aa50..6164ca7 100644 --- a/learnware/market/hetergeneous/organizer/__init__.py +++ b/learnware/market/hetergeneous/organizer/__init__.py @@ -27,11 +27,12 @@ logger = get_module_logger("hetero_market") class HeteroMapTableOrganizer(EasyOrganizer): def reload_market(self, rebuild=False, auto_update_limit=100): - self.market_store_path = os.path.join(conf.market_root_path, self.market_id) + self.market_store_path = os.path.join(conf.hetero_root_path, self.market_id) self.market_mapping_path = os.path.join(self.market_store_path, conf.market_model_path) self.learnware_pool_path = 
os.path.join(self.market_store_path, "learnware_pool") self.learnware_zip_pool_path = os.path.join(self.market_store_path, "zips") self.learnware_folder_pool_path = os.path.join(self.market_store_path, "unzipped_learnwares") + self.hetero_mappings_path = os.path.join(self.market_store_path, conf.heter_mapping_path) self.learnware_list = {} # id:learnware self.learnware_zip_list = {} self.learnware_folder_list = {} @@ -55,6 +56,8 @@ class HeteroMapTableOrganizer(EasyOrganizer): os.makedirs(self.learnware_pool_path, exist_ok=True) os.makedirs(self.learnware_zip_pool_path, exist_ok=True) os.makedirs(self.learnware_folder_pool_path, exist_ok=True) + os.makedirs(self.hetero_mappings_path, exist_ok=True) + ( self.learnware_list, self.learnware_zip_list, @@ -162,10 +165,8 @@ class HeteroMapTableOrganizer(EasyOrganizer): def _update_learnware_list(self, learnware_list: List[Learnware]): try: - hetero_mappings_save_path = os.path.join(self.market_store_path, "hetero_mappings") - os.makedirs(hetero_mappings_save_path, exist_ok=True) for learnware in learnware_list: - hetero_spec_path = os.path.join(hetero_mappings_save_path, f"{learnware.id}.npy") + hetero_spec_path = os.path.join(self.hetero_mappings_path, f"{learnware.id}.npy") self._update_learnware_specification(learnware, save_path=hetero_spec_path) except Exception as err: logger.warning(f"Update learnware HeteroSpecification failed! 
Due to {err}") diff --git a/learnware/market/hetergeneous/organizer/config.py b/learnware/market/hetergeneous/organizer/config.py index 2510e3a..c9a4330 100644 --- a/learnware/market/hetergeneous/organizer/config.py +++ b/learnware/market/hetergeneous/organizer/config.py @@ -3,26 +3,25 @@ import copy import json import logging import os -from pathlib import Path from ....config import Config -ROOT_PATH = Path(__file__).resolve().parent.parent -HETERO_ROOT_DIRPATH = os.path.join(ROOT_PATH, ".learnware") +ROOT_PATH = os.path.join(os.path.expanduser("~"), ".learnware") +HETERO_ROOT_PATH = os.path.join(ROOT_PATH, "heterogeneous") PACKAGE_DIRPATH = os.path.dirname(os.path.abspath(__file__)) -LEARNWARE_POOL_PATH = os.path.join(HETERO_ROOT_DIRPATH, "learnware_pool") +LEARNWARE_POOL_PATH = os.path.join(HETERO_ROOT_PATH, "learnware_pool") LEARNWARE_ZIP_POOL_PATH = os.path.join(LEARNWARE_POOL_PATH, "zips") LEARNWARE_FOLDER_POOL_PATH = os.path.join(LEARNWARE_POOL_PATH, "learnwares") -DATABASE_PATH = os.path.join(HETERO_ROOT_DIRPATH, "database") -STDOUT_PATH = os.path.join(HETERO_ROOT_DIRPATH, "stdout") +DATABASE_PATH = os.path.join(HETERO_ROOT_PATH, "database") +STDOUT_PATH = os.path.join(HETERO_ROOT_PATH, "stdout") # relative paths TRAINING_ARGS_NAME = "training_args.json" MODEL_PATH = "pytorch_model.bin" TOKENIZER_DIR = "tokenizer" -HETERO_MAPPING_PATH = "hetero_mapping" +HETERO_MAPPING_PATH = "hetero_mappings" # TODO: Delete them later # os.makedirs(HETERO_ROOT_DIRPATH, exist_ok=True) @@ -30,12 +29,12 @@ HETERO_MAPPING_PATH = "hetero_mapping" # os.makedirs(STDOUT_PATH, exist_ok=True) _DEFAULT_CONFIG = { - "root_path": HETERO_ROOT_DIRPATH, + "root_path": ROOT_PATH, + "hetero_root_path": HETERO_ROOT_PATH, "package_path": PACKAGE_DIRPATH, "stdout_path": STDOUT_PATH, "logging_level": logging.INFO, "logging_outfile": None, - "market_root_path": HETERO_ROOT_DIRPATH, "market_model_path": MODEL_PATH, "market_training_args_path": TRAINING_ARGS_NAME, "market_tokenizer_path": 
TOKENIZER_DIR, From a0ef47af38a44b980c3f0c870f8560076b34d3c1 Mon Sep 17 00:00:00 2001 From: Peng Tan Date: Wed, 8 Nov 2023 16:29:29 +0800 Subject: [PATCH 20/90] [MNT] add test_search_semantics test --- .../test_hetero_market/test_hetero.py | 54 +++++++++---------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/tests/test_market/test_hetero_market/test_hetero.py b/tests/test_market/test_hetero_market/test_hetero.py index 1ad854f..7d66478 100644 --- a/tests/test_market/test_hetero_market/test_hetero.py +++ b/tests/test_market/test_hetero_market/test_hetero.py @@ -21,7 +21,7 @@ curr_root = os.path.dirname(os.path.abspath(__file__)) user_semantic = { "Data": {"Values": ["Table"], "Type": "Class"}, "Task": { - "Values": ["Classification"], + "Values": ["Regression"], "Type": "Class", }, "Library": {"Values": ["Scikit-learn"], "Type": "Class"}, @@ -173,35 +173,35 @@ class TestMarket(unittest.TestCase): organizer=hetero_market.learnware_organizer organizer.train() - # def test_search_semantics(self, learnware_num=5): - # easy_market = self.test_upload_delete_learnware(learnware_num, delete=False) - # print("Total Item:", len(easy_market)) - # assert len(easy_market) == self.learnware_num, f"The number of learnwares must be {self.learnware_num}!" + def test_search_semantics(self, learnware_num=5): + hetero_market = self.test_upload_delete_learnware(learnware_num, delete=False) + print("Total Item:", len(hetero_market)) + assert len(hetero_market) == self.learnware_num, f"The number of learnwares must be {self.learnware_num}!" 
- # semantic_spec = copy.deepcopy(user_semantic) - # semantic_spec["Name"]["Values"] = f"learnware_{learnware_num - 1}" + semantic_spec = copy.deepcopy(user_semantic) + semantic_spec["Name"]["Values"] = f"learnware_{learnware_num - 1}" - # user_info = BaseUserInfo(semantic_spec=semantic_spec) - # _, single_learnware_list, _, _ = easy_market.search_learnware(user_info) + user_info = BaseUserInfo(semantic_spec=semantic_spec) + _, single_learnware_list, _, _ = hetero_market.search_learnware(user_info) - # print("User info:", user_info.get_semantic_spec()) - # print(f"Search result:") - # assert len(single_learnware_list) == 1, f"Exact semantic search failed!" - # for learnware in single_learnware_list: - # semantic_spec1 = learnware.get_specification().get_semantic_spec() - # print("Choose learnware:", learnware.id, semantic_spec1) - # assert semantic_spec1["Name"]["Values"] == semantic_spec["Name"]["Values"], f"Exact semantic search failed!" + print("User info:", user_info.get_semantic_spec()) + print(f"Search result:") + assert len(single_learnware_list) == 1, f"Exact semantic search failed!" + for learnware in single_learnware_list: + semantic_spec1 = learnware.get_specification().get_semantic_spec() + print("Choose learnware:", learnware.id, semantic_spec1) + assert semantic_spec1["Name"]["Values"] == semantic_spec["Name"]["Values"], f"Exact semantic search failed!" - # semantic_spec["Name"]["Values"] = "laernwaer" - # user_info = BaseUserInfo(semantic_spec=semantic_spec) - # _, single_learnware_list, _, _ = easy_market.search_learnware(user_info) + semantic_spec["Name"]["Values"] = "laernwaer" + user_info = BaseUserInfo(semantic_spec=semantic_spec) + _, single_learnware_list, _, _ = hetero_market.search_learnware(user_info) - # print("User info:", user_info.get_semantic_spec()) - # print(f"Search result:") - # assert len(single_learnware_list) == self.learnware_num, f"Fuzzy semantic search failed!" 
- # for learnware in single_learnware_list: - # semantic_spec1 = learnware.get_specification().get_semantic_spec() - # print("Choose learnware:", learnware.id, semantic_spec1) + print("User info:", user_info.get_semantic_spec()) + print(f"Search result:") + assert len(single_learnware_list) == self.learnware_num, f"Fuzzy semantic search failed!" + for learnware in single_learnware_list: + semantic_spec1 = learnware.get_specification().get_semantic_spec() + print("Choose learnware:", learnware.id, semantic_spec1) # def test_stat_search(self, learnware_num=5): # easy_market = self.test_upload_delete_learnware(learnware_num, delete=False) @@ -245,8 +245,8 @@ def suite(): # _suite.addTest(TestMarket("test_prepare_learnware_randomly")) # _suite.addTest(TestMarket("test_generated_learnwares")) # _suite.addTest(TestMarket("test_upload_delete_learnware")) - _suite.addTest(TestMarket("test_train_market_model")) - # _suite.addTest(TestMarket("test_search_semantics")) + # _suite.addTest(TestMarket("test_train_market_model")) + _suite.addTest(TestMarket("test_search_semantics")) # _suite.addTest(TestMarket("test_stat_search")) return _suite From b7702aaad3a5ff61a66939e4512ac5b2b7e4138a Mon Sep 17 00:00:00 2001 From: Peng Tan Date: Wed, 8 Nov 2023 16:57:46 +0800 Subject: [PATCH 21/90] [FIX] fix bugs in searchers and add simple test_stat_search for homo test --- learnware/market/hetergeneous/searcher.py | 2 +- .../test_hetero_market/test_hetero.py | 75 ++++++++++--------- 2 files changed, 39 insertions(+), 38 deletions(-) diff --git a/learnware/market/hetergeneous/searcher.py b/learnware/market/hetergeneous/searcher.py index 6b1aa16..9d7e3a7 100644 --- a/learnware/market/hetergeneous/searcher.py +++ b/learnware/market/hetergeneous/searcher.py @@ -114,7 +114,7 @@ class HeteroSearcher(EasySearcher): return [], [], 0.0, [] if parse_specification_type(stat_specs=user_info.stat_info) is not None: - if user_info.semantic_spec["Input"]["Description"] is not None: + if "Input" in 
user_info.semantic_spec and user_info.semantic_spec["Input"]["Description"] is not None: return self.hetero_stat_searcher(learnware_list, user_info) else: return self.stat_searcher(learnware_list, user_info, max_search_num, search_method) diff --git a/tests/test_market/test_hetero_market/test_hetero.py b/tests/test_market/test_hetero_market/test_hetero.py index 7d66478..00dfa95 100644 --- a/tests/test_market/test_hetero_market/test_hetero.py +++ b/tests/test_market/test_hetero_market/test_hetero.py @@ -13,7 +13,7 @@ from learnware.client import LearnwareClient import learnware from learnware.market import instantiate_learnware_market, BaseUserInfo -import learnware.specification as specification +from learnware.specification import RKMETableSpecification, generate_rkme_spec from example_learnwares.config import input_shape_list, input_description_list, output_description_list curr_root = os.path.dirname(os.path.abspath(__file__)) @@ -74,7 +74,7 @@ class TestMarket(unittest.TestCase): joblib.dump(clf, os.path.join(dir_path, "ridge.pkl")) - spec = specification.generate_rkme_spec(X=X, gamma=0.1, cuda_idx=0) + spec = generate_rkme_spec(X=X, gamma=0.1, cuda_idx=0) spec.save(os.path.join(dir_path, "stat.json")) init_file = os.path.join(dir_path, "__init__.py") @@ -203,41 +203,42 @@ class TestMarket(unittest.TestCase): semantic_spec1 = learnware.get_specification().get_semantic_spec() print("Choose learnware:", learnware.id, semantic_spec1) - # def test_stat_search(self, learnware_num=5): - # easy_market = self.test_upload_delete_learnware(learnware_num, delete=False) - # print("Total Item:", len(easy_market)) - - # test_folder = os.path.join(curr_root, "test_stat") - - # for idx, zip_path in enumerate(self.zip_path_list): - # unzip_dir = os.path.join(test_folder, f"{idx}") - - # # unzip -o -q zip_path -d unzip_dir - # if os.path.exists(unzip_dir): - # rmtree(unzip_dir) - # os.makedirs(unzip_dir, exist_ok=True) - # with zipfile.ZipFile(zip_path, "r") as zip_obj: - # 
zip_obj.extractall(path=unzip_dir) - - # user_spec = specification.rkme.RKMETableSpecification() - # user_spec.load(os.path.join(unzip_dir, "svm.json")) - # user_info = BaseUserInfo(semantic_spec=user_semantic, stat_info={"RKMETableSpecification": user_spec}) - # ( - # sorted_score_list, - # single_learnware_list, - # mixture_score, - # mixture_learnware_list, - # ) = easy_market.search_learnware(user_info) + def test_stat_search(self, learnware_num=5): + hetero_market = self.test_upload_delete_learnware(learnware_num, delete=False) + print("Total Item:", len(hetero_market)) - # assert len(single_learnware_list) == self.learnware_num, f"Statistical search failed!" - # print(f"search result of user{idx}:") - # for score, learnware in zip(sorted_score_list, single_learnware_list): - # print(f"score: {score}, learnware_id: {learnware.id}") - # print(f"mixture_score: {mixture_score}\n") - # mixture_id = " ".join([learnware.id for learnware in mixture_learnware_list]) - # print(f"mixture_learnware: {mixture_id}\n") + test_folder = os.path.join(curr_root, "test_stat") - # rmtree(test_folder) # rm -r test_folder + for idx, zip_path in enumerate(self.zip_path_list): + unzip_dir = os.path.join(test_folder, f"{idx}") + + # unzip -o -q zip_path -d unzip_dir + if os.path.exists(unzip_dir): + rmtree(unzip_dir) + os.makedirs(unzip_dir, exist_ok=True) + with zipfile.ZipFile(zip_path, "r") as zip_obj: + zip_obj.extractall(path=unzip_dir) + + user_spec = RKMETableSpecification() + user_spec.load(os.path.join(unzip_dir, "stat.json")) + user_info = BaseUserInfo(semantic_spec=user_semantic, stat_info={"RKMETableSpecification": user_spec}) + ( + sorted_score_list, + single_learnware_list, + mixture_score, + mixture_learnware_list, + ) = hetero_market.search_learnware(user_info) + + target_spec_num=3 if idx%2==0 else 2 + assert len(single_learnware_list) == target_spec_num, f"Statistical search failed!" 
+ print(f"search result of user{idx}:") + for score, learnware in zip(sorted_score_list, single_learnware_list): + print(f"score: {score}, learnware_id: {learnware.id}") + print(f"mixture_score: {mixture_score}\n") + mixture_id = " ".join([learnware.id for learnware in mixture_learnware_list]) + print(f"mixture_learnware: {mixture_id}\n") + + rmtree(test_folder) # rm -r test_folder def suite(): @@ -246,8 +247,8 @@ def suite(): # _suite.addTest(TestMarket("test_generated_learnwares")) # _suite.addTest(TestMarket("test_upload_delete_learnware")) # _suite.addTest(TestMarket("test_train_market_model")) - _suite.addTest(TestMarket("test_search_semantics")) - # _suite.addTest(TestMarket("test_stat_search")) + # _suite.addTest(TestMarket("test_search_semantics")) + _suite.addTest(TestMarket("test_stat_search")) return _suite From 7eaec8a1ec7c34acd1561c487921238b93cdf143 Mon Sep 17 00:00:00 2001 From: Peng Tan Date: Wed, 8 Nov 2023 20:33:58 +0800 Subject: [PATCH 22/90] [ENH] add hetero test for test_stat_search --- .../test_hetero_market/test_hetero.py | 52 +++++++++++++++++-- 1 file changed, 49 insertions(+), 3 deletions(-) diff --git a/tests/test_market/test_hetero_market/test_hetero.py b/tests/test_market/test_hetero_market/test_hetero.py index 00dfa95..81b6493 100644 --- a/tests/test_market/test_hetero_market/test_hetero.py +++ b/tests/test_market/test_hetero_market/test_hetero.py @@ -1,4 +1,4 @@ -import sys +import torch import unittest import os import copy @@ -54,6 +54,7 @@ class TestMarket(unittest.TestCase): hetero_market = instantiate_learnware_market(market_id="hetero_toy", name="hetero", rebuild=True) return hetero_market + def test_prepare_learnware_randomly(self, learnware_num=5): self.zip_path_list = [] @@ -67,7 +68,7 @@ class TestMarket(unittest.TestCase): input_dim=input_shape_list[example_learnware_idx] example_learnware_name="example_learnwares/example_learnware_%d" % (example_learnware_idx) - X, y = make_regression(n_samples=5000, n_features=input_dim, 
noise=0.1, random_state=42) + X, y = make_regression(n_samples=5000, n_informative=15, n_features=input_dim, noise=0.1, random_state=42) clf=Ridge(alpha=1.0) clf.fit(X, y) @@ -172,6 +173,7 @@ class TestMarket(unittest.TestCase): organizer=hetero_market.learnware_organizer organizer.train() + return hetero_market def test_search_semantics(self, learnware_num=5): hetero_market = self.test_upload_delete_learnware(learnware_num, delete=False) @@ -204,9 +206,53 @@ class TestMarket(unittest.TestCase): print("Choose learnware:", learnware.id, semantic_spec1) def test_stat_search(self, learnware_num=5): - hetero_market = self.test_upload_delete_learnware(learnware_num, delete=False) + hetero_market = self.test_train_market_model(learnware_num) print("Total Item:", len(hetero_market)) + # hetero test + user_dim=15 + + test_folder = os.path.join(curr_root, "test_stat") + + for idx, zip_path in enumerate(self.zip_path_list): + unzip_dir = os.path.join(test_folder, f"{idx}") + + # unzip -o -q zip_path -d unzip_dir + if os.path.exists(unzip_dir): + rmtree(unzip_dir) + os.makedirs(unzip_dir, exist_ok=True) + with zipfile.ZipFile(zip_path, "r") as zip_obj: + zip_obj.extractall(path=unzip_dir) + + user_spec = RKMETableSpecification() + user_spec.load(os.path.join(unzip_dir, "stat.json")) + z=user_spec.get_z() + z=z[:,:user_dim] + device=user_spec.device + z=torch.tensor(z, device=device) + user_spec.z=z + + semantic_spec = copy.deepcopy(user_semantic) + semantic_spec["Input"]=copy.deepcopy(input_description_list[idx%2]) + semantic_spec["Input"]['Dimension']=user_dim + # keep only the first user_dim descriptions + semantic_spec["Input"]['Description']={key: semantic_spec["Input"]['Description'][str(key)] for key in range(user_dim)} + + user_info = BaseUserInfo(semantic_spec=semantic_spec, stat_info={"RKMETableSpecification": user_spec}) + ( + sorted_score_list, + single_learnware_list, + mixture_score, + mixture_learnware_list, + ) = hetero_market.search_learnware(user_info) + + 
print(f"search result of user{idx}:") + for score, learnware in zip(sorted_score_list, single_learnware_list): + print(f"score: {score}, learnware_id: {learnware.id}") + + rmtree(test_folder) # rm -r test_folder + + # homo test test_folder = os.path.join(curr_root, "test_stat") for idx, zip_path in enumerate(self.zip_path_list): From 2fbfb3948bd597a645bcc69dc629e5499ca81aa2 Mon Sep 17 00:00:00 2001 From: Peng Tan Date: Wed, 8 Nov 2023 21:12:57 +0800 Subject: [PATCH 23/90] [ENH] add test_model_reuse and fix bug in reuser package --- .../reuse/hetero_reuser/feature_alignment.py | 8 ++-- .../example_learnwares/config.py | 23 ++++++++++ .../test_hetero_market/test_hetero.py | 44 ++++++++++++++++++- 3 files changed, 69 insertions(+), 6 deletions(-) diff --git a/learnware/reuse/hetero_reuser/feature_alignment.py b/learnware/reuse/hetero_reuser/feature_alignment.py index 4cbe71d..112d749 100644 --- a/learnware/reuse/hetero_reuser/feature_alignment.py +++ b/learnware/reuse/hetero_reuser/feature_alignment.py @@ -9,7 +9,7 @@ from tqdm import trange from loguru import logger from learnware.learnware import Learnware -from learnware.specification import RKMEStatSpecification +from learnware.specification import RKMETableSpecification from learnware.specification.regular.table.rkme import choose_device from ..base import BaseReuser @@ -26,7 +26,7 @@ class FeatureAligner(BaseReuser): self.device = choose_device(cuda_idx=cuda_idx) def fit(self, user_rkme): - target_rkme=self.learnware.specification.get_stat_spec()["RKMEStatSpecification"] + target_rkme=self.learnware.specification.get_stat_spec()["RKMETableSpecification"] trainer=FeatureAlignmentTrainer(target_rkme=target_rkme, user_rkme=user_rkme, cuda_idx=self.cuda_idx, **self.align_arguments) self.align_model=trainer.model self.align_model.eval() @@ -91,8 +91,8 @@ class FeatureAlignmentTrainer(): def __init__( self, - target_rkme: RKMEStatSpecification, # (X, weight) - user_rkme: RKMEStatSpecification, # (X, weight) + 
target_rkme: RKMETableSpecification, # (X, weight) + user_rkme: RKMETableSpecification, # (X, weight) extra_labeled_data: Any = None, target_learnware: Learnware = None, num_epoch: int = 50, diff --git a/tests/test_market/test_hetero_market/example_learnwares/config.py b/tests/test_market/test_hetero_market/example_learnwares/config.py index 941109a..b4d4fb4 100644 --- a/tests/test_market/test_hetero_market/example_learnwares/config.py +++ b/tests/test_market/test_hetero_market/example_learnwares/config.py @@ -78,4 +78,27 @@ output_description_list=[ }, }, +] + +user_description_list=[ + { + "Dimension": 15, + "Description": { # medical description + "0": "Whether the patient is on thyroxine medication (0: No, 1: Yes)", + "1": "Whether the patient has been queried about thyroxine medication (0: No, 1: Yes)", + "2": "Whether the patient is on antithyroid medication (0: No, 1: Yes)", + "3": "Whether the patient has undergone thyroid surgery (0: No, 1: Yes)", + "4": "Whether the patient has been queried about hypothyroidism (0: No, 1: Yes)", + "5": "Whether the patient has been queried about hyperthyroidism (0: No, 1: Yes)", + "6": "Whether the patient is pregnant (0: No, 1: Yes)", + "7": "Whether the patient is sick (0: No, 1: Yes)", + "8": "Whether the patient has a tumor (0: No, 1: Yes)", + "9": "Whether the patient is taking lithium (0: No, 1: Yes)", + "10": "Whether the patient has a goitre (enlarged thyroid gland) (0: No, 1: Yes)", + "11": "Whether TSH (Thyroid Stimulating Hormone) level has been measured (0: No, 1: Yes)", + "12": "Whether T3 (Triiodothyronine) level has been measured (0: No, 1: Yes)", + "13": "Whether TT4 (Total Thyroxine) level has been measured (0: No, 1: Yes)", + "14": "Whether T4U (Thyroxine Utilization) level has been measured (0: No, 1: Yes)" + }, + } ] \ No newline at end of file diff --git a/tests/test_market/test_hetero_market/test_hetero.py b/tests/test_market/test_hetero_market/test_hetero.py index 81b6493..19c5ad5 100644 --- 
a/tests/test_market/test_hetero_market/test_hetero.py +++ b/tests/test_market/test_hetero_market/test_hetero.py @@ -10,11 +10,13 @@ from sklearn.datasets import make_regression from shutil import copyfile, rmtree from multiprocessing import Pool from learnware.client import LearnwareClient +from sklearn.metrics import mean_squared_error import learnware from learnware.market import instantiate_learnware_market, BaseUserInfo from learnware.specification import RKMETableSpecification, generate_rkme_spec -from example_learnwares.config import input_shape_list, input_description_list, output_description_list +from learnware.reuse import HeteroMapTableReuser +from example_learnwares.config import input_shape_list, input_description_list, output_description_list, user_description_list curr_root = os.path.dirname(os.path.abspath(__file__)) @@ -286,6 +288,43 @@ class TestMarket(unittest.TestCase): rmtree(test_folder) # rm -r test_folder + def test_model_reuse(self, learnware_num=5): + # generate toy regression problem + X, y = make_regression(n_samples=5000, n_informative=10, n_features=15, noise=0.1, random_state=0) + + # generate rkme + user_spec = generate_rkme_spec(X=X, gamma=0.1, cuda_idx=0) + + # generate specification + semantic_spec = copy.deepcopy(user_semantic) + semantic_spec["Input"] = user_description_list[0] + user_info=BaseUserInfo(semantic_spec=semantic_spec, stat_info={"RKMETableSpecification": user_spec}) + + # learnware market search + hetero_market = self.test_train_market_model(learnware_num) + ( + sorted_score_list, + single_learnware_list, + mixture_score, + mixture_learnware_list, + ) = hetero_market.search_learnware(user_info) + + # model reuse + print([learnware.id for learnware in single_learnware_list]) + reuser=HeteroMapTableReuser(single_learnware_list[0], task_type='regression') + reuser.fit(user_spec) + y_pred=reuser.predict(X) + + # calculate rmse + rmse=mean_squared_error(y, y_pred, squared=False) + print(f"rmse not finetune: {rmse}") + + 
# finetune + reuser.finetune(X[:100], y[:100]) + y_pred=reuser.predict(X) + rmse=mean_squared_error(y, y_pred, squared=False) + print(f"rmse finetune: {rmse}") + def suite(): _suite = unittest.TestSuite() @@ -294,7 +333,8 @@ def suite(): # _suite.addTest(TestMarket("test_upload_delete_learnware")) # _suite.addTest(TestMarket("test_train_market_model")) # _suite.addTest(TestMarket("test_search_semantics")) - _suite.addTest(TestMarket("test_stat_search")) + # _suite.addTest(TestMarket("test_stat_search")) + _suite.addTest(TestMarket("test_model_reuse")) return _suite From 68da04dc6bebd2483a79672eee6d081b5b942896 Mon Sep 17 00:00:00 2001 From: Peng Tan Date: Wed, 8 Nov 2023 21:15:19 +0800 Subject: [PATCH 24/90] [MNT] modify results print --- tests/test_market/test_hetero_market/test_hetero.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/test_market/test_hetero_market/test_hetero.py b/tests/test_market/test_hetero_market/test_hetero.py index 19c5ad5..2d367ac 100644 --- a/tests/test_market/test_hetero_market/test_hetero.py +++ b/tests/test_market/test_hetero_market/test_hetero.py @@ -309,8 +309,11 @@ class TestMarket(unittest.TestCase): mixture_learnware_list, ) = hetero_market.search_learnware(user_info) + # print search results + for score, learnware in zip(sorted_score_list, single_learnware_list): + print(f"score: {score}, learnware_id: {learnware.id}") + # model reuse - print([learnware.id for learnware in single_learnware_list]) reuser=HeteroMapTableReuser(single_learnware_list[0], task_type='regression') reuser.fit(user_spec) y_pred=reuser.predict(X) From 5af0b2fa9263929462d5395af5f43cde30614e57 Mon Sep 17 00:00:00 2001 From: liuht Date: Thu, 9 Nov 2023 13:47:21 +0800 Subject: [PATCH 25/90] [FIX] delete heterogeneous config file --- .../market/hetergeneous/organizer/config.py | 52 ------------------- .../organizer/hetero_mapping/__init__.py | 5 +- .../hetero_mapping/feature_extractor.py | 2 - .../organizer/hetero_mapping/trainer.py 
| 3 +- 4 files changed, 3 insertions(+), 59 deletions(-) delete mode 100644 learnware/market/hetergeneous/organizer/config.py diff --git a/learnware/market/hetergeneous/organizer/config.py b/learnware/market/hetergeneous/organizer/config.py deleted file mode 100644 index c9a4330..0000000 --- a/learnware/market/hetergeneous/organizer/config.py +++ /dev/null @@ -1,52 +0,0 @@ -# Name of the files used for checkpointing -import copy -import json -import logging -import os - -from ....config import Config - -ROOT_PATH = os.path.join(os.path.expanduser("~"), ".learnware") -HETERO_ROOT_PATH = os.path.join(ROOT_PATH, "heterogeneous") -PACKAGE_DIRPATH = os.path.dirname(os.path.abspath(__file__)) - -LEARNWARE_POOL_PATH = os.path.join(HETERO_ROOT_PATH, "learnware_pool") -LEARNWARE_ZIP_POOL_PATH = os.path.join(LEARNWARE_POOL_PATH, "zips") -LEARNWARE_FOLDER_POOL_PATH = os.path.join(LEARNWARE_POOL_PATH, "learnwares") - -DATABASE_PATH = os.path.join(HETERO_ROOT_PATH, "database") -STDOUT_PATH = os.path.join(HETERO_ROOT_PATH, "stdout") - -# relative paths -TRAINING_ARGS_NAME = "training_args.json" -MODEL_PATH = "pytorch_model.bin" -TOKENIZER_DIR = "tokenizer" -HETERO_MAPPING_PATH = "hetero_mappings" - -# TODO: Delete them later -# os.makedirs(HETERO_ROOT_DIRPATH, exist_ok=True) -# os.makedirs(DATABASE_PATH, exist_ok=True) -# os.makedirs(STDOUT_PATH, exist_ok=True) - -_DEFAULT_CONFIG = { - "root_path": ROOT_PATH, - "hetero_root_path": HETERO_ROOT_PATH, - "package_path": PACKAGE_DIRPATH, - "stdout_path": STDOUT_PATH, - "logging_level": logging.INFO, - "logging_outfile": None, - "market_model_path": MODEL_PATH, - "market_training_args_path": TRAINING_ARGS_NAME, - "market_tokenizer_path": TOKENIZER_DIR, - "heter_mapping_path": HETERO_MAPPING_PATH, - "learnware_pool_path": LEARNWARE_POOL_PATH, - "learnware_zip_pool_path": LEARNWARE_ZIP_POOL_PATH, - "learnware_folder_pool_path": LEARNWARE_FOLDER_POOL_PATH, - "learnware_folder_config": { - "yaml_file": "learnware.yaml", - "module_file": 
"__init__.py", - }, - "database_url": f"sqlite:///{DATABASE_PATH}", -} - -C = Config(_DEFAULT_CONFIG) diff --git a/learnware/market/hetergeneous/organizer/hetero_mapping/__init__.py b/learnware/market/hetergeneous/organizer/hetero_mapping/__init__.py index f0e9de5..69b7b99 100644 --- a/learnware/market/hetergeneous/organizer/hetero_mapping/__init__.py +++ b/learnware/market/hetergeneous/organizer/hetero_mapping/__init__.py @@ -8,7 +8,6 @@ import torch.nn.functional as F from torch import Tensor, nn from .....specification import HeteroSpecification, RKMETableSpecification -from ..config import C as conf from .feature_extractor import * from .trainer import Trainer, TransTabCollatorForCL @@ -99,7 +98,7 @@ class HeteroMapping(nn.Module): """ # load model weight state dict - market_model_path = os.path.join(checkpoint, conf.market_model_path) + market_model_path = os.path.join(checkpoint, "model.bin") model_info = torch.load(market_model_path, map_location="cpu") model = HeteroMapping(**model_info["model_args"]) model.load_state_dict(model_info["model_state_dict"], strict=False) @@ -126,7 +125,7 @@ class HeteroMapping(nn.Module): "model_args": self.model_args, # "feature_tokenizer": self.feature_tokenizer, } - torch.save(model_info, os.path.join(ckpt_dir, conf.market_model_path)) + torch.save(model_info, os.path.join(ckpt_dir, "model.bin")) def forward(self, x, y=None): # do positive sampling diff --git a/learnware/market/hetergeneous/organizer/hetero_mapping/feature_extractor.py b/learnware/market/hetergeneous/organizer/hetero_mapping/feature_extractor.py index 55d5805..1c23587 100644 --- a/learnware/market/hetergeneous/organizer/hetero_mapping/feature_extractor.py +++ b/learnware/market/hetergeneous/organizer/hetero_mapping/feature_extractor.py @@ -10,8 +10,6 @@ from loguru import logger from torch import Tensor, nn from transformers import BertTokenizerFast -from ..config import C as conf - class WordEmbedding(nn.Module): """ diff --git 
a/learnware/market/hetergeneous/organizer/hetero_mapping/trainer.py b/learnware/market/hetergeneous/organizer/hetero_mapping/trainer.py index 695d96e..3667f59 100644 --- a/learnware/market/hetergeneous/organizer/hetero_mapping/trainer.py +++ b/learnware/market/hetergeneous/organizer/hetero_mapping/trainer.py @@ -11,7 +11,6 @@ from torch import nn from torch.utils.data import DataLoader, Dataset from tqdm.autonotebook import trange -from ..config import C as conf from .feature_extractor import FeatureTokenizer @@ -101,7 +100,7 @@ class Trainer: if isinstance(v, int) or isinstance(v, str) or isinstance(v, float): train_args[k] = v with open( - os.path.join(output_dir, conf.market_training_args_path), + os.path.join(output_dir, "training_args.json"), "w", encoding="utf-8", ) as f: From 31bb1c276ce78548c59a604d126923d570b62643 Mon Sep 17 00:00:00 2001 From: bxdd Date: Thu, 9 Nov 2023 14:00:17 +0800 Subject: [PATCH 26/90] [FIX] fix syntax error --- learnware/market/hetergeneous/organizer/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/learnware/market/hetergeneous/organizer/__init__.py b/learnware/market/hetergeneous/organizer/__init__.py index 6164ca7..c202ccd 100644 --- a/learnware/market/hetergeneous/organizer/__init__.py +++ b/learnware/market/hetergeneous/organizer/__init__.py @@ -7,7 +7,7 @@ import tempfile import zipfile from collections import defaultdict from shutil import copyfile, rmtree -from typing import List +from typing import List, Tuple import pandas as pd from torch import nn From e7b7e869e2f9a71cdc0827cea0af79e519748e1a Mon Sep 17 00:00:00 2001 From: liuht Date: Thu, 9 Nov 2023 15:43:41 +0800 Subject: [PATCH 27/90] [FIX, ENH] async add_learnware, fix heter typo --- .gitignore | 3 +- learnware/market/__init__.py | 2 +- learnware/market/hetergeneous/database_ops.py | 177 ------------------ .../__init__.py | 0 .../organizer.py | 0 .../organizer/__init__.py | 85 ++++----- .../organizer/hetero_mapping/__init__.py | 0 
.../hetero_mapping/feature_extractor.py | 0 .../organizer/hetero_mapping/trainer.py | 0 .../searcher.py | 37 ++-- learnware/market/module.py | 2 +- learnware/specification/system/heter_table.py | 4 +- .../test_hetero_market/test_hetero.py | 5 +- 13 files changed, 74 insertions(+), 241 deletions(-) delete mode 100644 learnware/market/hetergeneous/database_ops.py rename learnware/market/{hetergeneous => heterogeneous}/__init__.py (100%) rename learnware/market/{hetergeneous => heterogeneous}/organizer.py (100%) rename learnware/market/{hetergeneous => heterogeneous}/organizer/__init__.py (74%) rename learnware/market/{hetergeneous => heterogeneous}/organizer/hetero_mapping/__init__.py (100%) rename learnware/market/{hetergeneous => heterogeneous}/organizer/hetero_mapping/feature_extractor.py (100%) rename learnware/market/{hetergeneous => heterogeneous}/organizer/hetero_mapping/trainer.py (100%) rename learnware/market/{hetergeneous => heterogeneous}/searcher.py (80%) diff --git a/.gitignore b/.gitignore index d22ea69..39ba56e 100644 --- a/.gitignore +++ b/.gitignore @@ -43,5 +43,4 @@ cache/ tmp/ learnware_pool/ PFS/ -data/ -learnware/market/hetergeneous/.learnware/* \ No newline at end of file +data/ \ No newline at end of file diff --git a/learnware/market/__init__.py b/learnware/market/__init__.py index be3e1ac..b850f5c 100644 --- a/learnware/market/__init__.py +++ b/learnware/market/__init__.py @@ -3,7 +3,7 @@ from .base import BaseUserInfo, LearnwareMarket, BaseChecker, BaseOrganizer, Bas from .evolve_anchor import EvolvedAnchoredOrganizer from .evolve import EvolvedOrganizer from .easy import EasyOrganizer, EasySearcher, EasySemanticChecker, EasyStatChecker -from .hetergeneous import HeteroMapTableOrganizer, HeteroSearcher +from .heterogeneous import HeteroMapTableOrganizer, HeteroSearcher from .classes import CondaChecker from .module import instantiate_learnware_market diff --git a/learnware/market/hetergeneous/database_ops.py 
b/learnware/market/hetergeneous/database_ops.py deleted file mode 100644 index 5d8461a..0000000 --- a/learnware/market/hetergeneous/database_ops.py +++ /dev/null @@ -1,177 +0,0 @@ -import json -import os - -from learnware.learnware import get_learnware_from_dirpath -from learnware.logger import get_module_logger -from sqlalchemy import (Column, DateTime, Integer, String, Text, create_engine, - text) -from sqlalchemy.ext.declarative import declarative_base - -logger = get_module_logger("database") -DeclarativeBase = declarative_base() - - -class Learnware(DeclarativeBase): - __tablename__ = "tb_learnware" - - id = Column(String(10), primary_key=True, nullable=False) - semantic_spec = Column(Text, nullable=False) - zip_path = Column(Text, nullable=False) - folder_path = Column(Text, nullable=False) - use_flag = Column(Text, nullable=False) - - pass - - -class DatabaseOperations(object): - def __init__(self, url: str, database_name: str): - if url.startswith("sqlite"): - url = os.path.join(url, f"{database_name}.db") - else: - url = f"{url}/{database_name}" - pass - - self.url = url - self.create_database_if_not_exists(url) - - pass - - def create_database_if_not_exists(self, url): - database_exists = True - - if url.startswith("sqlite"): - # it is sqlite - start = url.find(":///") - path = url[start + 4 :] - if os.path.exists(path): - database_exists = True - pass - else: - database_exists = False - os.makedirs(os.path.dirname(path), exist_ok=True) - pass - pass - elif self.url.startswith("postgresql"): - # it is postgresql - dbname_start = url.rfind("/") - dbname = url[dbname_start + 1 :] - url_no_dbname = url[:dbname_start] + "/postgres" - engine = create_engine(url_no_dbname) - - with engine.connect() as conn: - result = conn.execute(text("SELECT datname FROM pg_database;")) - db_list = set() - - for row in result.fetchall(): - db_list.add(row[0].lower()) - pass - - if dbname.lower() not in db_list: - database_exists = False - 
conn.execution_options(isolation_level="AUTOCOMMIT").execute( - text("CREATE DATABASE {0};".format(dbname)) - ) - pass - else: - database_exists = True - pass - pass - engine.dispose() - pass - else: - raise Exception(f"Unsupported database url: {self.url}") - pass - - self.engine = create_engine(url, future=True) - - if not database_exists: - DeclarativeBase.metadata.create_all(self.engine) - pass - pass - - def clear_learnware_table(self): - with self.engine.connect() as conn: - conn.execute(text("DELETE FROM tb_learnware;")) - conn.commit() - pass - pass - - def add_learnware(self, id: str, semantic_spec: dict, zip_path, folder_path, use_flag: str): - with self.engine.connect() as conn: - semantic_spec_str = json.dumps(semantic_spec) - conn.execute( - text( - ( - "INSERT INTO tb_learnware (id, semantic_spec, zip_path, folder_path, use_flag)" - "VALUES (:id, :semantic_spec, :zip_path, :folder_path, :use_flag);" - ) - ), - dict( - id=id, - semantic_spec=semantic_spec_str, - zip_path=zip_path, - folder_path=folder_path, - use_flag=use_flag, - ), - ) - conn.commit() - pass - pass - - def delete_learnware(self, id: str): - with self.engine.connect() as conn: - conn.execute(text("DELETE FROM tb_learnware WHERE id=:id;"), dict(id=id)) - conn.commit() - pass - pass - - def update_learnware_semantic_specification(self, id: str, semantic_spec: dict): - with self.engine.connect() as conn: - semantic_spec_str = json.dumps(semantic_spec) - r = conn.execute( - text("UPDATE tb_learnware SET semantic_spec=:semantic_spec WHERE id=:id;"), - dict(id=id, semantic_spec=semantic_spec_str), - ) - conn.commit() - pass - pass - - def update_learnware_use_flag(self, id: str, use_flag: str): - with self.engine.connect() as conn: - r = conn.execute( - text("UPDATE tb_learnware SET use_flag=:use_flag WHERE id=:id;"), - dict(id=id, use_flag=use_flag), - ) - conn.commit() - pass - pass - - def load_market(self): - with self.engine.connect() as conn: - cursor = conn.execute(text("SELECT id, 
semantic_spec, zip_path, folder_path, use_flag FROM tb_learnware;")) - - learnware_list = {} - zip_list = {} - folder_list = {} - use_flags = {} - max_count = 0 - - for id, semantic_spec, zip_path, folder_path, use_flag in cursor: - id = id.strip() - semantic_spec_dict = json.loads(semantic_spec) - new_learnware = get_learnware_from_dirpath( - id=id, semantic_spec=semantic_spec_dict, learnware_dirpath=folder_path - ) - logger.info(f"Load learnware: {id}") - learnware_list[id] = new_learnware - # assert new_learnware is not None - zip_list[id] = zip_path - folder_list[id] = folder_path - use_flags[id] = use_flag - max_count = max(max_count, int(id)) - pass - - return learnware_list, zip_list, folder_list, use_flags, max_count + 1 - pass - - pass diff --git a/learnware/market/hetergeneous/__init__.py b/learnware/market/heterogeneous/__init__.py similarity index 100% rename from learnware/market/hetergeneous/__init__.py rename to learnware/market/heterogeneous/__init__.py diff --git a/learnware/market/hetergeneous/organizer.py b/learnware/market/heterogeneous/organizer.py similarity index 100% rename from learnware/market/hetergeneous/organizer.py rename to learnware/market/heterogeneous/organizer.py diff --git a/learnware/market/hetergeneous/organizer/__init__.py b/learnware/market/heterogeneous/organizer/__init__.py similarity index 74% rename from learnware/market/hetergeneous/organizer/__init__.py rename to learnware/market/heterogeneous/organizer/__init__.py index c202ccd..9933c19 100644 --- a/learnware/market/hetergeneous/organizer/__init__.py +++ b/learnware/market/heterogeneous/organizer/__init__.py @@ -17,9 +17,9 @@ from ....learnware import Learnware, get_learnware_from_dirpath from ....logger import get_module_logger from ....specification.system import HeteroSpecification from ...base import BaseChecker, BaseUserInfo -from ...easy2 import EasyOrganizer -from ..database_ops import DatabaseOperations -from .config import C as conf +from ...easy import 
EasyOrganizer +from ...easy.database_ops import DatabaseOperations +from ....config import C as conf from .hetero_mapping import HeteroMapping, Trainer logger = get_module_logger("hetero_market") @@ -27,12 +27,12 @@ logger = get_module_logger("hetero_market") class HeteroMapTableOrganizer(EasyOrganizer): def reload_market(self, rebuild=False, auto_update_limit=100): - self.market_store_path = os.path.join(conf.hetero_root_path, self.market_id) - self.market_mapping_path = os.path.join(self.market_store_path, conf.market_model_path) + self.market_store_path = os.path.join(conf.root_path, self.market_id) + self.market_mapping_path = os.path.join(self.market_store_path, "model.bin") self.learnware_pool_path = os.path.join(self.market_store_path, "learnware_pool") self.learnware_zip_pool_path = os.path.join(self.market_store_path, "zips") self.learnware_folder_pool_path = os.path.join(self.market_store_path, "unzipped_learnwares") - self.hetero_mappings_path = os.path.join(self.market_store_path, conf.heter_mapping_path) + self.hetero_mappings_path = os.path.join(self.market_store_path, "hetero_mappings") self.learnware_list = {} # id:learnware self.learnware_zip_list = {} self.learnware_folder_list = {} @@ -41,8 +41,6 @@ class HeteroMapTableOrganizer(EasyOrganizer): self.dbops = DatabaseOperations(conf.database_url, "market_" + self.market_id) self.auto_update = False self.auto_update_limit = auto_update_limit - self.auto_update_lock = mp.Lock() - self.is_training_in_progress = mp.Value('i', 0) if rebuild: logger.warning("Warning! 
You are trying to clear current database!") @@ -75,7 +73,6 @@ class HeteroMapTableOrganizer(EasyOrganizer): self.market_mapping = HeteroMapping() def reset(self, market_id=None, auto_update=False, auto_update_limit=None, **kwargs): - # model training arguments(model architecture + optimization) set via self.reset self.auto_update = auto_update self.market_id = market_id self.training_args = kwargs @@ -126,42 +123,45 @@ class HeteroMapTableOrganizer(EasyOrganizer): self.learnware_zip_list[learnware_id] = target_zip_dir self.learnware_folder_list[learnware_id] = target_folder_dir self.use_flags[learnware_id] = learnwere_status - self.count += 1 - - with self.auto_update_lock: - if self.auto_update and not self.is_training_in_progress.value and self.count - self.last_trained_learnware_num >= self.auto_update_limit: - self.is_training_in_progress.value = 1 - curr_learnware_list = copy.deepcopy(self.learnware_list) - train_process = mp.Process(target=self.train, args=(curr_learnware_list.values(),)) - train_process.start() - # train_process.join() + self.count += 1 + + if self.auto_update and self.count - self.last_trained_learnware_num == self.auto_update_limit + 1: + logger.warning(f"Leanwares for training: {self.get_learnware_ids()}") + + updated_market_mapping = self.train( + learnware_list=self.learnware_list.values(), + save_dir=self.market_store_path, + **self.training_args + ) + + logger.warning(f"Market mapping train completed. 
Now update HeteroSpecification for {self.get_learnware_ids()}") + + self.market_mapping = updated_market_mapping + self._update_learnware_list(self.learnware_list.values()) + self.last_trained_learnware_num = self.count return learnware_id, learnwere_status - def train(self, learnware_list: List[Learnware] = None): - learnware_list = learnware_list or self.learnware_list.values() - logger.warning(f"Leanwares for training: {[learnware.id for learnware in learnware_list]}") - allset = self._learnwares_to_dataframes(learnware_list) - self.market_mapping = HeteroMapping(**self.training_args) + @staticmethod + def train(learnware_list: List[Learnware] = None, save_dir: str = None, **kwargs): + allset = HeteroMapTableOrganizer._learnwares_to_dataframes(learnware_list) + market_mapping = HeteroMapping(**kwargs) market_mapping_trainer = Trainer( - model=self.market_mapping, + model=market_mapping, train_set_list=allset, - collate_fn=self.market_mapping.collate_fn, - **self.training_args, + collate_fn=market_mapping.collate_fn, + **kwargs, ) - market_mapping_trainer.train() - # auto save whenever market model retrained - market_mapping_trainer.save_model(output_dir=self.market_store_path) - - # essential hetero-mapping update for each market learnware when market model retrained - self._update_learnware_list(learnware_list) - self.last_trained_learnware_num = self.count + market_mapping_trainer.train() + market_mapping_trainer.save_model(output_dir=save_dir) - logger.warning(f"Updataed Specification For: {[learnware.id for learnware in learnware_list]}") + return market_mapping - with self.auto_update_lock: - self.is_training_in_progress.value = 0 + ############################################ + # save_model & generateing new specification + # should be moved out of train thread + ############################################ def _update_learnware_list(self, learnware_list: List[Learnware]): try: @@ -169,7 +169,7 @@ class HeteroMapTableOrganizer(EasyOrganizer): 
hetero_spec_path = os.path.join(self.hetero_mappings_path, f"{learnware.id}.npy") self._update_learnware_specification(learnware, save_path=hetero_spec_path) except Exception as err: - logger.warning(f"Update learnware HeteroSpecification failed! Due to {err}") + logger.warning(f"Update HeteroSpecification failed! Due to {err}") def _update_learnware_specification(self, learnware: Learnware, save_path: str) -> Learnware: specification = learnware.specification @@ -178,16 +178,16 @@ class HeteroMapTableOrganizer(EasyOrganizer): learnware_hetero_spec = self.market_mapping.hetero_mapping(learnware_rkme, learnware_features) learnware.update_stat_spec("HeteroSpecification", learnware_hetero_spec) - # custom hetero spec save path? learnware_hetero_spec.save(save_path) def generate_hetero_map_spec(self, user_info: BaseUserInfo) -> HeteroSpecification: user_rkme = user_info.stat_info["RKMETableSpecification"] - user_features = user_info.semantic_spec["Input"]["Description"].values() + user_features = user_info.get_semantic_spec()["Input"]["Description"].values() user_hetero_spec = self.market_mapping.hetero_mapping(user_rkme, user_features) return user_hetero_spec - def _learnwares_to_dataframes(self, learnware_list: List[Learnware]) -> List[pd.DataFrame]: + @staticmethod + def _learnwares_to_dataframes(learnware_list: List[Learnware]) -> List[pd.DataFrame]: learnware_df_dict = defaultdict(list) for learnware in learnware_list: specification = learnware.get_specification() @@ -198,7 +198,4 @@ class HeteroMapTableOrganizer(EasyOrganizer): learnware_df_dict[tuple(sorted(learnware_features))].append(learnware_df) merged_dfs = [pd.concat(dfs) for dfs in learnware_df_dict.values()] - return merged_dfs - - def save(self, save_path): - return NotImplementedError \ No newline at end of file + return merged_dfs \ No newline at end of file diff --git a/learnware/market/hetergeneous/organizer/hetero_mapping/__init__.py 
b/learnware/market/heterogeneous/organizer/hetero_mapping/__init__.py similarity index 100% rename from learnware/market/hetergeneous/organizer/hetero_mapping/__init__.py rename to learnware/market/heterogeneous/organizer/hetero_mapping/__init__.py diff --git a/learnware/market/hetergeneous/organizer/hetero_mapping/feature_extractor.py b/learnware/market/heterogeneous/organizer/hetero_mapping/feature_extractor.py similarity index 100% rename from learnware/market/hetergeneous/organizer/hetero_mapping/feature_extractor.py rename to learnware/market/heterogeneous/organizer/hetero_mapping/feature_extractor.py diff --git a/learnware/market/hetergeneous/organizer/hetero_mapping/trainer.py b/learnware/market/heterogeneous/organizer/hetero_mapping/trainer.py similarity index 100% rename from learnware/market/hetergeneous/organizer/hetero_mapping/trainer.py rename to learnware/market/heterogeneous/organizer/hetero_mapping/trainer.py diff --git a/learnware/market/hetergeneous/searcher.py b/learnware/market/heterogeneous/searcher.py similarity index 80% rename from learnware/market/hetergeneous/searcher.py rename to learnware/market/heterogeneous/searcher.py index 9d7e3a7..4ed3444 100644 --- a/learnware/market/hetergeneous/searcher.py +++ b/learnware/market/heterogeneous/searcher.py @@ -1,10 +1,12 @@ from typing import Tuple, List, Union +import numpy as np + from ...learnware import Learnware from ...logger import get_module_logger from ...specification import HeteroSpecification from ..base import BaseSearcher, BaseUserInfo -from ..easy2 import EasySearcher +from ..easy import EasySearcher from ..utils import parse_specification_type from .organizer import HeteroMapTableOrganizer @@ -38,7 +40,7 @@ class HeteroMapTableSearcher(EasySearcher): ) -> Tuple[List[float], List[Learnware]]: hetero_spec_list = [learnware.specification.get_stat_spec_by_name("HeteroSpecification") for learnware in learnware_list] mmd_dist_list = [] - for hetero_spec in hetero_spec_list: + for idx, 
hetero_spec in enumerate(hetero_spec_list): mmd_dist = hetero_spec.dist(user_hetero_spec) mmd_dist_list.append(mmd_dist) @@ -83,15 +85,6 @@ class HeteroMapTableSearcher(EasySearcher): logger.info(f"After filter by hetero spec, learnware_list length is {len(single_learnware_list)}") return sorted_score_list, single_learnware_list, None, None - - # for learnware in learnware_list: - # learnware_hetero_spec = learnware.specification.get_stat_spec_by_name("HeteroSpecification") - # mmd_dist = learnware_hetero_spec.dist(user_hetero_spec) - # if target_learnware is None or mmd_dist < min_dist: - # min_dist = mmd_dist - # target_learnware = learnware - # return target_learnware - def reset(self, organizer): self.learnware_oganizer = organizer @@ -103,6 +96,26 @@ class HeteroSearcher(EasySearcher): def reset(self, organizer): super().reset(organizer) self.hetero_stat_searcher.reset(organizer) + + @staticmethod + def check_user_info(user_info: BaseUserInfo): + try: + user_stat_spec = user_info.get_stat_info("RKMETableSpecification") + user_input_shape = user_stat_spec.get_z().shape[1] + + user_input_description = user_info.get_semantic_spec()["Input"] + + user_description_dim = int(user_input_description["Dimension"]) + user_description_feature_num = len(user_input_description["Description"]) + + if user_input_shape != user_description_dim or user_input_shape != user_description_feature_num: + logger.warning("User data feature dimensions mismatch with semantic specification") + return False + + return True + except: + logger.info(f"No heterogeneous search information provided. 
Use homogeneous search instead.") + return False def __call__( self, user_info: BaseUserInfo, check_status: int = None, max_search_num: int = 5, search_method: str = "greedy" @@ -114,7 +127,7 @@ class HeteroSearcher(EasySearcher): return [], [], 0.0, [] if parse_specification_type(stat_specs=user_info.stat_info) is not None: - if "Input" in user_info.semantic_spec and user_info.semantic_spec["Input"]["Description"] is not None: + if self.check_user_info(user_info): return self.hetero_stat_searcher(learnware_list, user_info) else: return self.stat_searcher(learnware_list, user_info, max_search_num, search_method) diff --git a/learnware/market/module.py b/learnware/market/module.py index f0903e4..945bbaf 100644 --- a/learnware/market/module.py +++ b/learnware/market/module.py @@ -1,6 +1,6 @@ from .base import LearnwareMarket from .easy import EasyOrganizer, EasySearcher, EasySemanticChecker, EasyStatChecker -from .hetergeneous import HeteroMapTableOrganizer, HeteroSearcher +from .heterogeneous import HeteroMapTableOrganizer, HeteroSearcher MARKET_CONFIG = { "easy": { diff --git a/learnware/specification/system/heter_table.py b/learnware/specification/system/heter_table.py index f721d8f..a574daf 100644 --- a/learnware/specification/system/heter_table.py +++ b/learnware/specification/system/heter_table.py @@ -29,10 +29,10 @@ class HeteroSpecification(SystemStatsSpecification): super(HeteroSpecification, self).__init__(type=self.__class__.__name__) def get_z(self) -> np.ndarray: - return self.z.detach().cpu().numpy + return self.z.detach().cpu().numpy() def get_beta(self) -> np.ndarray: - return self.beta.detach().cpu().numpy + return self.beta.detach().cpu().numpy() def generate_stat_spec_from_system(self, heter_embedding: np.ndarray, rkme_spec: RKMETableSpecification): self.beta = rkme_spec.beta.to(self.device) diff --git a/tests/test_market/test_hetero_market/test_hetero.py b/tests/test_market/test_hetero_market/test_hetero.py index 2d367ac..83d4d95 100644 --- 
a/tests/test_market/test_hetero_market/test_hetero.py +++ b/tests/test_market/test_hetero_market/test_hetero.py @@ -155,6 +155,7 @@ class TestMarket(unittest.TestCase): hetero_market = self._init_learnware_market() self.test_prepare_learnware_randomly(learnware_num) self.learnware_num = learnware_num + hetero_market.learnware_organizer.reset(auto_update=True, auto_update_limit=learnware_num) print("Total Item:", len(hetero_market)) assert len(hetero_market) == 0, f"The market should be empty!" @@ -173,8 +174,8 @@ class TestMarket(unittest.TestCase): print("Available ids After Uploading Learnwares:", curr_inds) assert len(curr_inds) == self.learnware_num, f"The number of learnwares must be {self.learnware_num}!" - organizer=hetero_market.learnware_organizer - organizer.train() + # organizer=hetero_market.learnware_organizer + # organizer.train(hetero_market.learnware_organizer.learnware_list.values()) return hetero_market def test_search_semantics(self, learnware_num=5): From 46d84c1f728293ec3d8ccfbed1c02dbbc52f2268 Mon Sep 17 00:00:00 2001 From: Peng Tan Date: Fri, 10 Nov 2023 15:21:47 +0800 Subject: [PATCH 28/90] [MNT] enhance check_user_info and verify it with test --- learnware/market/heterogeneous/searcher.py | 5 +++++ learnware/market/module.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/learnware/market/heterogeneous/searcher.py b/learnware/market/heterogeneous/searcher.py index 4ed3444..aee6c5b 100644 --- a/learnware/market/heterogeneous/searcher.py +++ b/learnware/market/heterogeneous/searcher.py @@ -105,6 +105,11 @@ class HeteroSearcher(EasySearcher): user_input_description = user_info.get_semantic_spec()["Input"] + user_task_type=user_info.get_semantic_spec()["Task"]["Values"] + if user_task_type not in [["Classification"], ["Regression"]]: + logger.warning("User doesn't provide correct task type, it must be either Classification or Regression") + return False + user_description_dim = int(user_input_description["Dimension"]) 
user_description_feature_num = len(user_input_description["Description"]) diff --git a/learnware/market/module.py b/learnware/market/module.py index 945bbaf..d48e03e 100644 --- a/learnware/market/module.py +++ b/learnware/market/module.py @@ -11,7 +11,7 @@ MARKET_CONFIG = { "hetero": { "organizer": HeteroMapTableOrganizer(), "searcher": HeteroSearcher(), - "checker_list": [EasySemanticChecker(), EasyStatChecker()] + "checker_list": [] } } From 08a8eb5f444cee07c5401848358cd3a6cbb275b5 Mon Sep 17 00:00:00 2001 From: Peng Tan Date: Fri, 10 Nov 2023 15:33:46 +0800 Subject: [PATCH 29/90] Merge branch 'main' into feature/hetero --- tests/test_market/test_hetero_market/test_hetero.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_market/test_hetero_market/test_hetero.py b/tests/test_market/test_hetero_market/test_hetero.py index 83d4d95..075c0f7 100644 --- a/tests/test_market/test_hetero_market/test_hetero.py +++ b/tests/test_market/test_hetero_market/test_hetero.py @@ -334,10 +334,10 @@ def suite(): _suite = unittest.TestSuite() # _suite.addTest(TestMarket("test_prepare_learnware_randomly")) # _suite.addTest(TestMarket("test_generated_learnwares")) - # _suite.addTest(TestMarket("test_upload_delete_learnware")) - # _suite.addTest(TestMarket("test_train_market_model")) - # _suite.addTest(TestMarket("test_search_semantics")) - # _suite.addTest(TestMarket("test_stat_search")) + _suite.addTest(TestMarket("test_upload_delete_learnware")) + _suite.addTest(TestMarket("test_train_market_model")) + _suite.addTest(TestMarket("test_search_semantics")) + _suite.addTest(TestMarket("test_stat_search")) _suite.addTest(TestMarket("test_model_reuse")) return _suite From 9cb01fedc5c367b3e815fa26b5be3975e61d82c5 Mon Sep 17 00:00:00 2001 From: Peng Tan Date: Fri, 10 Nov 2023 16:10:52 +0800 Subject: [PATCH 30/90] [MNT] enhance user_info checker, it can provide more accurate info; modify the test accordingly. 
--- learnware/market/heterogeneous/searcher.py | 17 +++++++------- .../test_hetero_market/test_hetero.py | 23 +++++++++++++++---- 2 files changed, 27 insertions(+), 13 deletions(-) diff --git a/learnware/market/heterogeneous/searcher.py b/learnware/market/heterogeneous/searcher.py index aee6c5b..2d0f418 100644 --- a/learnware/market/heterogeneous/searcher.py +++ b/learnware/market/heterogeneous/searcher.py @@ -103,23 +103,22 @@ class HeteroSearcher(EasySearcher): user_stat_spec = user_info.get_stat_info("RKMETableSpecification") user_input_shape = user_stat_spec.get_z().shape[1] - user_input_description = user_info.get_semantic_spec()["Input"] - - user_task_type=user_info.get_semantic_spec()["Task"]["Values"] + user_task_type = user_info.get_semantic_spec().get("Task", {}).get("Values") if user_task_type not in [["Classification"], ["Regression"]]: - logger.warning("User doesn't provide correct task type, it must be either Classification or Regression") + logger.warning("User doesn't provide correct task type, it must be either Classification or Regression.") return False - user_description_dim = int(user_input_description["Dimension"]) - user_description_feature_num = len(user_input_description["Description"]) + user_input_description = user_info.get_semantic_spec().get("Input", {}) + user_description_dim = int(user_input_description.get("Dimension", 0)) + user_description_feature_num = len(user_input_description.get("Description", [])) if user_input_shape != user_description_dim or user_input_shape != user_description_feature_num: - logger.warning("User data feature dimensions mismatch with semantic specification") + logger.warning("User data feature dimensions mismatch with semantic specification.") return False return True - except: - logger.info(f"No heterogeneous search information provided. Use homogeneous search instead.") + except Exception as e: + logger.info(f"Invalid heterogeneous search information provided. Use homogeneous search instead. 
Error: {e}") return False def __call__( diff --git a/tests/test_market/test_hetero_market/test_hetero.py b/tests/test_market/test_hetero_market/test_hetero.py index 075c0f7..1642fe9 100644 --- a/tests/test_market/test_hetero_market/test_hetero.py +++ b/tests/test_market/test_hetero_market/test_hetero.py @@ -253,6 +253,21 @@ class TestMarket(unittest.TestCase): for score, learnware in zip(sorted_score_list, single_learnware_list): print(f"score: {score}, learnware_id: {learnware.id}") + # delete key "Task" in semantic_spec, use homo search and print WARNING INFO with "User doesn't provide correct task type" + print("delele key 'Task' test:") + semantic_spec.pop("Task") + + # repeat search + user_info = BaseUserInfo(semantic_spec=semantic_spec, stat_info={"RKMETableSpecification": user_spec}) + ( + sorted_score_list, + single_learnware_list, + mixture_score, + mixture_learnware_list, + ) = hetero_market.search_learnware(user_info) + + assert(len(single_learnware_list)==0), f"Statistical search failed!" + rmtree(test_folder) # rm -r test_folder # homo test @@ -334,11 +349,11 @@ def suite(): _suite = unittest.TestSuite() # _suite.addTest(TestMarket("test_prepare_learnware_randomly")) # _suite.addTest(TestMarket("test_generated_learnwares")) - _suite.addTest(TestMarket("test_upload_delete_learnware")) - _suite.addTest(TestMarket("test_train_market_model")) - _suite.addTest(TestMarket("test_search_semantics")) + # _suite.addTest(TestMarket("test_upload_delete_learnware")) + # _suite.addTest(TestMarket("test_train_market_model")) + # _suite.addTest(TestMarket("test_search_semantics")) _suite.addTest(TestMarket("test_stat_search")) - _suite.addTest(TestMarket("test_model_reuse")) + # _suite.addTest(TestMarket("test_model_reuse")) return _suite From e4c1c15675df0e86ff85e6b73d1b3aaaa1bf78dd Mon Sep 17 00:00:00 2001 From: Peng Tan Date: Fri, 10 Nov 2023 16:18:30 +0800 Subject: [PATCH 31/90] [MNT] add cases for test_stat_search to check the logger info for wrong cases. 
--- .../test_hetero_market/test_hetero.py | 25 ++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/tests/test_market/test_hetero_market/test_hetero.py b/tests/test_market/test_hetero_market/test_hetero.py index 1642fe9..db16e78 100644 --- a/tests/test_market/test_hetero_market/test_hetero.py +++ b/tests/test_market/test_hetero_market/test_hetero.py @@ -213,6 +213,7 @@ class TestMarket(unittest.TestCase): print("Total Item:", len(hetero_market)) # hetero test + print("+++++ HETERO TEST ++++++") user_dim=15 test_folder = os.path.join(curr_root, "test_stat") @@ -235,6 +236,7 @@ class TestMarket(unittest.TestCase): z=torch.tensor(z, device=device) user_spec.z=z + print(">> normal case test:") semantic_spec = copy.deepcopy(user_semantic) semantic_spec["Input"]=copy.deepcopy(input_description_list[idx%2]) semantic_spec["Input"]['Dimension']=user_dim @@ -254,7 +256,7 @@ class TestMarket(unittest.TestCase): print(f"score: {score}, learnware_id: {learnware.id}") # delete key "Task" in semantic_spec, use homo search and print WARNING INFO with "User doesn't provide correct task type" - print("delele key 'Task' test:") + print(">> delele key 'Task' test:") semantic_spec.pop("Task") # repeat search @@ -268,9 +270,30 @@ class TestMarket(unittest.TestCase): assert(len(single_learnware_list)==0), f"Statistical search failed!" + # modify semantic info with mismatch dim, use homo search and print "User data feature dimensions mismatch with semantic specification." 
+ print(">> mismatch dim test") + semantic_spec = copy.deepcopy(user_semantic) + semantic_spec["Input"]=copy.deepcopy(input_description_list[idx%2]) + semantic_spec["Input"]['Dimension']=user_dim-2 + # keep only the first user_dim descriptions + semantic_spec["Input"]['Description']={key: semantic_spec["Input"]['Description'][str(key)] for key in range(user_dim)} + + # repeat search + user_info = BaseUserInfo(semantic_spec=semantic_spec, stat_info={"RKMETableSpecification": user_spec}) + ( + sorted_score_list, + single_learnware_list, + mixture_score, + mixture_learnware_list, + ) = hetero_market.search_learnware(user_info) + + assert(len(single_learnware_list)==0), f"Statistical search failed!" + + rmtree(test_folder) # rm -r test_folder # homo test + print("\n+++++ HOMO TEST ++++++") test_folder = os.path.join(curr_root, "test_stat") for idx, zip_path in enumerate(self.zip_path_list): From 02d47d53f6cd4a53efed1a6b6e5d782c29c049f9 Mon Sep 17 00:00:00 2001 From: Peng Tan Date: Fri, 10 Nov 2023 16:30:51 +0800 Subject: [PATCH 32/90] [MNT] modify check_user_info --- learnware/market/heterogeneous/searcher.py | 4 ++-- .../test_hetero_market/test_hetero.py | 18 +++++++++++++++--- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/learnware/market/heterogeneous/searcher.py b/learnware/market/heterogeneous/searcher.py index 2d0f418..ebab847 100644 --- a/learnware/market/heterogeneous/searcher.py +++ b/learnware/market/heterogeneous/searcher.py @@ -103,12 +103,12 @@ class HeteroSearcher(EasySearcher): user_stat_spec = user_info.get_stat_info("RKMETableSpecification") user_input_shape = user_stat_spec.get_z().shape[1] - user_task_type = user_info.get_semantic_spec().get("Task", {}).get("Values") + user_task_type = user_info.get_semantic_spec()["Task"]["Values"] if user_task_type not in [["Classification"], ["Regression"]]: logger.warning("User doesn't provide correct task type, it must be either Classification or Regression.") return False - 
user_input_description = user_info.get_semantic_spec().get("Input", {}) + user_input_description = user_info.get_semantic_spec()["Input"] user_description_dim = int(user_input_description.get("Dimension", 0)) user_description_feature_num = len(user_input_description.get("Description", [])) diff --git a/tests/test_market/test_hetero_market/test_hetero.py b/tests/test_market/test_hetero_market/test_hetero.py index db16e78..27e11f7 100644 --- a/tests/test_market/test_hetero_market/test_hetero.py +++ b/tests/test_market/test_hetero_market/test_hetero.py @@ -255,11 +255,25 @@ class TestMarket(unittest.TestCase): for score, learnware in zip(sorted_score_list, single_learnware_list): print(f"score: {score}, learnware_id: {learnware.id}") + # empty value of key "Task" in semantic_spec, use homo search and print + print(">> test for key 'Task' has empty 'Values':") + semantic_spec["Task"]={"Values":{}} + + user_info = BaseUserInfo(semantic_spec=semantic_spec, stat_info={"RKMETableSpecification": user_spec}) + ( + sorted_score_list, + single_learnware_list, + mixture_score, + mixture_learnware_list, + ) = hetero_market.search_learnware(user_info) + + assert(len(single_learnware_list)==0), f"Statistical search failed!" 
+ + # delete key "Task" in semantic_spec, use homo search and print WARNING INFO with "User doesn't provide correct task type" print(">> delele key 'Task' test:") semantic_spec.pop("Task") - # repeat search user_info = BaseUserInfo(semantic_spec=semantic_spec, stat_info={"RKMETableSpecification": user_spec}) ( sorted_score_list, @@ -275,10 +289,8 @@ class TestMarket(unittest.TestCase): semantic_spec = copy.deepcopy(user_semantic) semantic_spec["Input"]=copy.deepcopy(input_description_list[idx%2]) semantic_spec["Input"]['Dimension']=user_dim-2 - # keep only the first user_dim descriptions semantic_spec["Input"]['Description']={key: semantic_spec["Input"]['Description'][str(key)] for key in range(user_dim)} - # repeat search user_info = BaseUserInfo(semantic_spec=semantic_spec, stat_info={"RKMETableSpecification": user_spec}) ( sorted_score_list, From f694a6a8a144692f182e3ff30c7c3c8f811a6df2 Mon Sep 17 00:00:00 2001 From: Peng Tan Date: Fri, 10 Nov 2023 16:32:55 +0800 Subject: [PATCH 33/90] [MNT] modify details for check_user_info --- learnware/market/heterogeneous/searcher.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/learnware/market/heterogeneous/searcher.py b/learnware/market/heterogeneous/searcher.py index ebab847..9782cf7 100644 --- a/learnware/market/heterogeneous/searcher.py +++ b/learnware/market/heterogeneous/searcher.py @@ -109,8 +109,8 @@ class HeteroSearcher(EasySearcher): return False user_input_description = user_info.get_semantic_spec()["Input"] - user_description_dim = int(user_input_description.get("Dimension", 0)) - user_description_feature_num = len(user_input_description.get("Description", [])) + user_description_dim = int(user_input_description["Dimension"]) + user_description_feature_num = len(user_input_description["Description"]) if user_input_shape != user_description_dim or user_input_shape != user_description_feature_num: logger.warning("User data feature dimensions mismatch with semantic specification.") From 
e65f43c168ecfd8eb7d9497040f39c533ca1fee7 Mon Sep 17 00:00:00 2001 From: liuht Date: Fri, 10 Nov 2023 19:16:19 +0800 Subject: [PATCH 34/90] [ENH] modify add_learnware --- .../heterogeneous/organizer/__init__.py | 115 +++++++++++------- 1 file changed, 68 insertions(+), 47 deletions(-) diff --git a/learnware/market/heterogeneous/organizer/__init__.py b/learnware/market/heterogeneous/organizer/__init__.py index 9933c19..fdc73fe 100644 --- a/learnware/market/heterogeneous/organizer/__init__.py +++ b/learnware/market/heterogeneous/organizer/__init__.py @@ -37,7 +37,8 @@ class HeteroMapTableOrganizer(EasyOrganizer): self.learnware_zip_list = {} self.learnware_folder_list = {} self.count = 0 - self.last_trained_learnware_num = 0 + self.training_count = 1 + self.last_training_count = 0 self.dbops = DatabaseOperations(conf.database_url, "market_" + self.market_id) self.auto_update = False self.auto_update_limit = auto_update_limit @@ -65,12 +66,25 @@ class HeteroMapTableOrganizer(EasyOrganizer): ) = self.dbops.load_market() if os.path.exists(self.market_mapping_path): - logger.info(f"Loading Market Mapping from Default Checkpoint {self.market_mapping_path}") + logger.info(f"Reload market mapping from checkpoint {self.market_mapping_path}") self.market_mapping = HeteroMapping.load(checkpoint=self.market_store_path) - # self._update_learnware_list(self.learnware_list) + if not rebuild: + if os.path.exists(self.hetero_mappings_path): + for hetero_json_path in os.listdir(self.hetero_mappings_path): + idx = hetero_json_path.split('.')[0] + hetero_spec = HeteroSpecification() + hetero_spec.load(os.path.join(self.hetero_mappings_path, f"{idx}.json")) + try: + self.learnware_list[idx].update_stat_spec("HeteroSpecification", hetero_spec) + except: + logger.warning(f"Learnware ID {idx} NOT Found!") + else: + logger.info("No HeteroSpecifications to reload. 
Use loaded market mapping to regenerate.") + self._update_learnware_by_ids(self.learnware_list.keys()) else: - logger.warning(f"No Existing Market Mapping!!") + logger.warning(f"No market mapping to reload!!") self.market_mapping = HeteroMapping() + # rmtree(self.hetero_mappings_path) def reset(self, market_id=None, auto_update=False, auto_update_limit=None, **kwargs): self.auto_update = auto_update @@ -81,6 +95,11 @@ class HeteroMapTableOrganizer(EasyOrganizer): def add_learnware( self, zip_path: str, semantic_spec: dict, check_status: int, learnware_id: str = None ) -> Tuple[str, int]: + if check_status == BaseChecker.INVALID_LEARNWARE: + logger.warning("Learnware is invalid!") + return None, BaseChecker.INVALID_LEARNWARE + + semantic_spec = copy.deepcopy(semantic_spec) logger.info("Get new learnware from %s" % (zip_path)) learnware_id = "%08d" % (self.count) if learnware_id is None else learnware_id @@ -118,32 +137,34 @@ class HeteroMapTableOrganizer(EasyOrganizer): use_flag=learnwere_status, ) - self._update_learnware_list([new_learnware]) self.learnware_list[learnware_id] = new_learnware self.learnware_zip_list[learnware_id] = target_zip_dir self.learnware_folder_list[learnware_id] = target_folder_dir self.use_flags[learnware_id] = learnwere_status + self._update_learnware_by_ids([learnware_id]) self.count += 1 + self.training_count += ([learnware_id] == self._get_table_type_learnware_ids([learnware_id])) - if self.auto_update and self.count - self.last_trained_learnware_num == self.auto_update_limit + 1: - logger.warning(f"Leanwares for training: {self.get_learnware_ids()}") + if self.auto_update and self.training_count - self.last_training_count == self.auto_update_limit + 1: + training_learnware_ids = self._get_table_type_learnware_ids(self.get_learnware_ids()) + training_learnwares = self.get_learnware_by_ids(training_learnware_ids) + logger.warning(f"Leanwares for training: {training_learnware_ids}") updated_market_mapping = self.train( - 
learnware_list=self.learnware_list.values(), + learnware_list=training_learnwares, save_dir=self.market_store_path, **self.training_args ) - logger.warning(f"Market mapping train completed. Now update HeteroSpecification for {self.get_learnware_ids()}") - + logger.warning(f"Market mapping train completed. Now update HeteroSpecification for {training_learnware_ids}") self.market_mapping = updated_market_mapping - self._update_learnware_list(self.learnware_list.values()) - self.last_trained_learnware_num = self.count + self._update_learnware_by_ids(training_learnware_ids) + self.last_training_count = len(training_learnware_ids) return learnware_id, learnwere_status @staticmethod - def train(learnware_list: List[Learnware] = None, save_dir: str = None, **kwargs): + def train(learnware_list: List[Learnware], save_dir: str, **kwargs): allset = HeteroMapTableOrganizer._learnwares_to_dataframes(learnware_list) market_mapping = HeteroMapping(**kwargs) market_mapping_trainer = Trainer( @@ -157,45 +178,45 @@ class HeteroMapTableOrganizer(EasyOrganizer): market_mapping_trainer.save_model(output_dir=save_dir) return market_mapping - - ############################################ - # save_model & generateing new specification - # should be moved out of train thread - ############################################ - - def _update_learnware_list(self, learnware_list: List[Learnware]): - try: - for learnware in learnware_list: - hetero_spec_path = os.path.join(self.hetero_mappings_path, f"{learnware.id}.npy") - self._update_learnware_specification(learnware, save_path=hetero_spec_path) - except Exception as err: - logger.warning(f"Update HeteroSpecification failed! 
Due to {err}") - - def _update_learnware_specification(self, learnware: Learnware, save_path: str) -> Learnware: - specification = learnware.specification - learnware_rkme = specification.get_stat_spec()["RKMETableSpecification"] - learnware_features = specification.get_semantic_spec()["Input"]["Description"].values() - learnware_hetero_spec = self.market_mapping.hetero_mapping(learnware_rkme, learnware_features) - learnware.update_stat_spec("HeteroSpecification", learnware_hetero_spec) - - learnware_hetero_spec.save(save_path) - + + def _update_learnware_by_ids(self, ids: List[str]): + ids = self._get_table_type_learnware_ids(ids) + for id in ids: + try: + spec = self.learnware_list[id].get_specification() + semantic_spec, stat_spec = spec.get_semantic_spec(), spec.get_stat_spec()["RKMETableSpecification"] + features = semantic_spec["Input"]["Description"].values() + hetero_spec = self.market_mapping.hetero_mapping(stat_spec, features) + self.learnware_list[id].update_stat_spec("HeteroSpecification", hetero_spec) + + save_path = os.path.join(self.hetero_mappings_path, f"{id}.json") + hetero_spec.save(save_path) + except Exception as err: + logger.warning(f"Learnware {id} generate HeteroSpecification failed! 
Due to {err}") + def generate_hetero_map_spec(self, user_info: BaseUserInfo) -> HeteroSpecification: - user_rkme = user_info.stat_info["RKMETableSpecification"] + user_stat_spec = user_info.stat_info["RKMETableSpecification"] user_features = user_info.get_semantic_spec()["Input"]["Description"].values() - user_hetero_spec = self.market_mapping.hetero_mapping(user_rkme, user_features) + + user_hetero_spec = self.market_mapping.hetero_mapping(user_stat_spec, user_features) return user_hetero_spec @staticmethod def _learnwares_to_dataframes(learnware_list: List[Learnware]) -> List[pd.DataFrame]: learnware_df_dict = defaultdict(list) for learnware in learnware_list: - specification = learnware.get_specification() - learnware_rkme = specification.get_stat_spec()["RKMETableSpecification"] - learnware_features = specification.get_semantic_spec()["Input"]["Description"] - learnware_df = pd.DataFrame(data=learnware_rkme.get_z(), columns=learnware_features.values()) - - learnware_df_dict[tuple(sorted(learnware_features))].append(learnware_df) - - merged_dfs = [pd.concat(dfs) for dfs in learnware_df_dict.values()] - return merged_dfs \ No newline at end of file + spec = learnware.get_specification() + stat_spec = spec.get_stat_spec()["RKMETableSpecification"] + features = spec.get_semantic_spec()["Input"]["Description"] + learnware_df = pd.DataFrame(data=stat_spec.get_z(), columns=features.values()) + learnware_df_dict[tuple(sorted(features))].append(learnware_df) + + return [pd.concat(dfs) for dfs in learnware_df_dict.values()] + + def _get_table_type_learnware_ids(self, ids: List[str]) -> List[str]: + ret = [] + for id in ids: + semantic_spec = self.learnware_list[id].get_specification().get_semantic_spec() + if semantic_spec["Data"]["Values"][0] == "Table": + ret.append(id) + return ret \ No newline at end of file From 238f26345978d4dd7565a3abfe5fa0a58a52aa39 Mon Sep 17 00:00:00 2001 From: Peng Tan Date: Fri, 10 Nov 2023 20:21:54 +0800 Subject: [PATCH 35/90] [ENH] enhance 
FeatureAugmentReuser for classification; add corresponding test; add explanations. --- learnware/reuse/feature_augment_reuser.py | 104 ++++++++++++++---- learnware/reuse/hetero_reuser/__init__.py | 10 +- .../reuse/hetero_reuser/feature_alignment.py | 6 +- .../test_hetero_market/test_hetero.py | 11 +- tests/test_workflow/test_workflow.py | 16 ++- 5 files changed, 101 insertions(+), 46 deletions(-) diff --git a/learnware/reuse/feature_augment_reuser.py b/learnware/reuse/feature_augment_reuser.py index 2f1c835..af98a0a 100644 --- a/learnware/reuse/feature_augment_reuser.py +++ b/learnware/reuse/feature_augment_reuser.py @@ -1,38 +1,95 @@ -from typing import List import numpy as np -from sklearn.linear_model import RidgeCV - +from sklearn.linear_model import RidgeCV, LogisticRegressionCV from .base import BaseReuser from learnware.learnware import Learnware - class FeatureAugmentReuser(BaseReuser): - def __init__(self, learnware: Learnware = None, task_type: str = None): - self.learnware=learnware - assert task_type in ["classification", "regression"] - self.task_type=task_type - - def predict(self, x_test: np.ndarray) -> np.ndarray: - x_test=self._fill_data(x_test) - y_pred=self.learnware.predict(x_test) - x_test_aug=np.concatenate((x_test, y_pred.reshape(-1, 1)), axis=1) - y_pred_aug=self.output_aligner.predict(x_test_aug) + """ + FeatureAugmentReuser is a class for augmenting features using predictions of a given learnware model and applying regression or classification on the augmented dataset. + + This class supports two modes: + - "regression": Uses RidgeCV for regression tasks. + - "classification": Uses LogisticRegressionCV for classification tasks. + """ + + def __init__(self, learnware: Learnware = None, mode: str = None): + """ + Initializes the FeatureAugmentReuser with a learnware model and a mode. + + Parameters + ---------- + learnware : Learnware + A learnware model used for initial predictions. 
+ mode : str + The mode of operation, either "regression" or "classification". + """ + self.learnware = learnware + assert mode in ["classification", "regression"], "Mode must be either 'classification' or 'regression'" + self.mode = mode + + def predict(self, user_data: np.ndarray) -> np.ndarray: + """ + Predicts the output for user data using the trained output aligner model. + + Parameters + ---------- + user_data : np.ndarray + Input data for making predictions. + + Returns + ------- + np.ndarray + Predicted output from the output aligner model. + """ + user_data = self._fill_data(user_data) + y_pred = self.learnware.predict(user_data) + user_data_aug = np.concatenate((user_data, y_pred.reshape(-1, 1)), axis=1) + y_pred_aug = self.output_aligner.predict(user_data_aug) return y_pred_aug - def fit(self, x_train, y_train): - x_train=self._fill_data(x_train) - y_pred=self.learnware.predict(x_train) - x_train_aug=np.concatenate((x_train, y_pred.reshape(-1, 1)), axis=1) - if self.task_type=="regression": + def fit(self, x_train: np.ndarray, y_train: np.ndarray): + """ + Trains the output aligner model using the training data augmented with predictions from the learnware model. + + Parameters + ---------- + x_train : np.ndarray + Training data features. + y_train : np.ndarray + Training data labels. 
+ """ + x_train = self._fill_data(x_train) + y_pred = self.learnware.predict(x_train) + x_train_aug = np.concatenate((x_train, y_pred.reshape(-1, 1)), axis=1) + if self.mode == "regression": alpha_list = [0.01, 0.1, 1.0, 10, 100] ridge_cv = RidgeCV(alphas=alpha_list, store_cv_values=True) ridge_cv.fit(x_train_aug, y_train) - self.output_aligner=ridge_cv - elif self.task_type=="classification": - raise NotImplementedError("Not implemented yet!") + self.output_aligner = ridge_cv + elif self.mode == "classification": + self.output_aligner = LogisticRegressionCV() + self.output_aligner.fit(x_train_aug, y_train) def _fill_data(self, X: np.ndarray): + """ + Fills missing data (NaN, Inf) in the input array with the mean of the column. + + Parameters + ---------- + X : np.ndarray + Input data array that may contain missing values. + + Returns + ------- + np.ndarray + Data array with missing values filled. + + Raises + ------ + ValueError + If a column in X contains only exceptional values (NaN, Inf). 
+ """ X[np.isinf(X) | np.isneginf(X) | np.isposinf(X) | np.isneginf(X)] = np.nan if np.any(np.isnan(X)): for col in range(X.shape[1]): @@ -40,7 +97,6 @@ class FeatureAugmentReuser(BaseReuser): if np.any(is_nan): if np.all(is_nan): raise ValueError(f"All values in column {col} are exceptional, e.g., NaN and Inf.") - # Fill np.nan with np.nanmean col_mean = np.nanmean(X[:, col]) X[:, col] = np.where(is_nan, col_mean, X[:, col]) - return X \ No newline at end of file + return X diff --git a/learnware/reuse/hetero_reuser/__init__.py b/learnware/reuse/hetero_reuser/__init__.py index 43523df..69d24a1 100644 --- a/learnware/reuse/hetero_reuser/__init__.py +++ b/learnware/reuse/hetero_reuser/__init__.py @@ -6,20 +6,20 @@ from ..feature_augment_reuser import FeatureAugmentReuser class HeteroMapTableReuser(BaseReuser): - def __init__(self, learnware: Learnware = None, task_type: str = None, cuda_idx=0, **align_arguments): + def __init__(self, learnware: Learnware = None, mode: str = None, cuda_idx=0, **align_arguments): self.learnware=learnware - assert task_type in ["classification", "regression"] - self.task_type=task_type + assert mode in ["classification", "regression"] + self.mode=mode self.cuda_idx=cuda_idx self.align_arguments=align_arguments def fit(self, user_rkme): - self.feature_aligner=FeatureAligner(learnware=self.learnware, task_type=self.task_type, cuda_idx=self.cuda_idx, **self.align_arguments) + self.feature_aligner=FeatureAligner(learnware=self.learnware, mode=self.mode, cuda_idx=self.cuda_idx, **self.align_arguments) self.feature_aligner.fit(user_rkme) self.reuser=self.feature_aligner def finetune(self, x_train,y_train): - self.reuser=FeatureAugmentReuser(learnware=self.feature_aligner, task_type=self.task_type) + self.reuser=FeatureAugmentReuser(learnware=self.feature_aligner, mode=self.mode) self.reuser.fit(x_train, y_train) def predict(self, user_data): diff --git a/learnware/reuse/hetero_reuser/feature_alignment.py 
b/learnware/reuse/hetero_reuser/feature_alignment.py index 112d749..2bb3f71 100644 --- a/learnware/reuse/hetero_reuser/feature_alignment.py +++ b/learnware/reuse/hetero_reuser/feature_alignment.py @@ -17,10 +17,10 @@ from ..base import BaseReuser class FeatureAligner(BaseReuser): - def __init__(self, learnware: Learnware = None, task_type: str = None, cuda_idx=0, **align_arguments): + def __init__(self, learnware: Learnware = None, mode: str = None, cuda_idx=0, **align_arguments): self.learnware=learnware - assert task_type in ["classification", "regression"] - self.task_type=task_type + assert mode in ["classification", "regression"] + self.mode=mode self.align_arguments=align_arguments self.cuda_idx=cuda_idx self.device = choose_device(cuda_idx=cuda_idx) diff --git a/tests/test_market/test_hetero_market/test_hetero.py b/tests/test_market/test_hetero_market/test_hetero.py index 27e11f7..a2d7c3d 100644 --- a/tests/test_market/test_hetero_market/test_hetero.py +++ b/tests/test_market/test_hetero_market/test_hetero.py @@ -365,15 +365,8 @@ class TestMarket(unittest.TestCase): print(f"score: {score}, learnware_id: {learnware.id}") # model reuse - reuser=HeteroMapTableReuser(single_learnware_list[0], task_type='regression') + reuser=HeteroMapTableReuser(single_learnware_list[0], mode='regression') reuser.fit(user_spec) - y_pred=reuser.predict(X) - - # calculate rmse - rmse=mean_squared_error(y, y_pred, squared=False) - print(f"rmse not finetune: {rmse}") - - # finetune reuser.finetune(X[:100], y[:100]) y_pred=reuser.predict(X) rmse=mean_squared_error(y, y_pred, squared=False) @@ -388,7 +381,7 @@ def suite(): # _suite.addTest(TestMarket("test_train_market_model")) # _suite.addTest(TestMarket("test_search_semantics")) _suite.addTest(TestMarket("test_stat_search")) - # _suite.addTest(TestMarket("test_model_reuse")) + _suite.addTest(TestMarket("test_model_reuse")) return _suite diff --git a/tests/test_workflow/test_workflow.py b/tests/test_workflow/test_workflow.py index 
fac8348..ef41449 100644 --- a/tests/test_workflow/test_workflow.py +++ b/tests/test_workflow/test_workflow.py @@ -13,7 +13,7 @@ from shutil import copyfile, rmtree import learnware from learnware.market import instantiate_learnware_market, BaseUserInfo from learnware.specification import RKMETableSpecification, generate_rkme_spec -from learnware.reuse import JobSelectorReuser, AveragingReuser, EnsemblePruningReuser +from learnware.reuse import JobSelectorReuser, AveragingReuser, EnsemblePruningReuser, FeatureAugmentReuser curr_root = os.path.dirname(os.path.abspath(__file__)) @@ -219,17 +219,23 @@ class TestWorkflow(unittest.TestCase): reuse_ensemble.fit(train_X[-200:], train_y[-200:]) ensemble_pruning_predict_y = reuse_ensemble.predict(user_data=data_X) + # Use feature augment reuser to reuse the searched learnwares to make prediction + reuse_feature_augment = FeatureAugmentReuser(learnware=reuse_ensemble, mode="classification") + reuse_feature_augment.fit(train_X[-200:], train_y[-200:]) + feature_augment_predict_y = reuse_feature_augment.predict(user_data=data_X) + print("Job Selector Acc:", np.sum(np.argmax(job_selector_predict_y, axis=1) == data_y) / len(data_y)) print("Averaging Reuser Acc:", np.sum(np.argmax(ensemble_predict_y, axis=1) == data_y) / len(data_y)) print("Ensemble Pruning Reuser Acc:", np.sum(ensemble_pruning_predict_y == data_y) / len(data_y)) + print("Feature Augment Reuser Acc:", np.sum(feature_augment_predict_y == data_y) / len(data_y)) def suite(): _suite = unittest.TestSuite() - _suite.addTest(TestWorkflow("test_prepare_learnware_randomly")) - _suite.addTest(TestWorkflow("test_upload_delete_learnware")) - _suite.addTest(TestWorkflow("test_search_semantics")) - _suite.addTest(TestWorkflow("test_stat_search")) + # _suite.addTest(TestWorkflow("test_prepare_learnware_randomly")) + # _suite.addTest(TestWorkflow("test_upload_delete_learnware")) + # _suite.addTest(TestWorkflow("test_search_semantics")) + # 
_suite.addTest(TestWorkflow("test_stat_search")) _suite.addTest(TestWorkflow("test_learnware_reuse")) return _suite From 8a7e43422aa90399a188edb8e504bd31f0dbb0ed Mon Sep 17 00:00:00 2001 From: Peng Tan Date: Fri, 10 Nov 2023 21:12:47 +0800 Subject: [PATCH 36/90] [DOC] add docs for hetero_reuser --- learnware/reuse/feature_augment_reuser.py | 8 +- learnware/reuse/hetero_reuser/__init__.py | 80 +++++- .../reuse/hetero_reuser/feature_alignment.py | 227 ++++++++++++++++-- 3 files changed, 278 insertions(+), 37 deletions(-) diff --git a/learnware/reuse/feature_augment_reuser.py b/learnware/reuse/feature_augment_reuser.py index af98a0a..eaf6c43 100644 --- a/learnware/reuse/feature_augment_reuser.py +++ b/learnware/reuse/feature_augment_reuser.py @@ -15,7 +15,7 @@ class FeatureAugmentReuser(BaseReuser): def __init__(self, learnware: Learnware = None, mode: str = None): """ - Initializes the FeatureAugmentReuser with a learnware model and a mode. + Initialize the FeatureAugmentReuser with a learnware model and a mode. Parameters ---------- @@ -30,7 +30,7 @@ class FeatureAugmentReuser(BaseReuser): def predict(self, user_data: np.ndarray) -> np.ndarray: """ - Predicts the output for user data using the trained output aligner model. + Predict the output for user data using the trained output aligner model. Parameters ---------- @@ -50,7 +50,7 @@ class FeatureAugmentReuser(BaseReuser): def fit(self, x_train: np.ndarray, y_train: np.ndarray): """ - Trains the output aligner model using the training data augmented with predictions from the learnware model. + Train the output aligner model using the training data augmented with predictions from the learnware model. Parameters ---------- @@ -73,7 +73,7 @@ class FeatureAugmentReuser(BaseReuser): def _fill_data(self, X: np.ndarray): """ - Fills missing data (NaN, Inf) in the input array with the mean of the column. + Fill missing data (NaN, Inf) in the input array with the mean of the column. 
Parameters ---------- diff --git a/learnware/reuse/hetero_reuser/__init__.py b/learnware/reuse/hetero_reuser/__init__.py index 69d24a1..91edd7a 100644 --- a/learnware/reuse/hetero_reuser/__init__.py +++ b/learnware/reuse/hetero_reuser/__init__.py @@ -5,22 +5,82 @@ from ..feature_augment_reuser import FeatureAugmentReuser class HeteroMapTableReuser(BaseReuser): + """ + HeteroMapTableReuser is a class designed for reusing learnware models with feature alignment and augmentation. + It can handle both classification and regression tasks and supports fine-tuning on additional training data. + + Attributes + ---------- + learnware : Learnware + The learnware model to be reused. + mode : str + The mode of operation, either "classification" or "regression". + cuda_idx : int + Index of the CUDA device to be used for computations. + align_arguments : dict + Additional arguments for feature alignment. + """ def __init__(self, learnware: Learnware = None, mode: str = None, cuda_idx=0, **align_arguments): - self.learnware=learnware - assert mode in ["classification", "regression"] - self.mode=mode - self.cuda_idx=cuda_idx - self.align_arguments=align_arguments + """ + Initialize the HeteroMapTableReuser with a learnware model, mode, CUDA device index, and alignment arguments. + + Parameters + ---------- + learnware : Learnware + A learnware model used for initial predictions. + mode : str + The mode of operation, either "regression" or "classification". + cuda_idx : int + The index of the CUDA device for computations. + align_arguments : dict + Additional arguments to be passed to the feature alignment process. 
+ """ + self.learnware = learnware + assert mode in ["classification", "regression"], "Mode must be either 'classification' or 'regression'" + self.mode = mode + self.cuda_idx = cuda_idx + self.align_arguments = align_arguments def fit(self, user_rkme): - self.feature_aligner=FeatureAligner(learnware=self.learnware, mode=self.mode, cuda_idx=self.cuda_idx, **self.align_arguments) + """ + Fit the feature aligner using the user RKME (Reduced Kernel Mean Embedding) specification. + + Parameters + ---------- + user_rkme : RKMETableSpecification + The RKME specification from the user dataset. + """ + self.feature_aligner = FeatureAligner(learnware=self.learnware, mode=self.mode, cuda_idx=self.cuda_idx, **self.align_arguments) self.feature_aligner.fit(user_rkme) - self.reuser=self.feature_aligner + self.reuser = self.feature_aligner - def finetune(self, x_train,y_train): - self.reuser=FeatureAugmentReuser(learnware=self.feature_aligner, mode=self.mode) + def finetune(self, x_train, y_train): + """ + Fine-tune the feature aligner using additional training data. + + Parameters + ---------- + x_train : ndarray + Training data features. + y_train : ndarray + Training data labels. + """ + self.reuser = FeatureAugmentReuser(learnware=self.feature_aligner, mode=self.mode) self.reuser.fit(x_train, y_train) def predict(self, user_data): - return self.reuser.predict(user_data) \ No newline at end of file + """ + Predict the output for user data using the feature aligner or the fine-tuned model. + + Parameters + ---------- + user_data : ndarray + Input data for making predictions. + + Returns + ------- + ndarray + Predicted output from the model.
+ """ + return self.reuser.predict(user_data) diff --git a/learnware/reuse/hetero_reuser/feature_alignment.py b/learnware/reuse/hetero_reuser/feature_alignment.py index 2bb3f71..def7764 100644 --- a/learnware/reuse/hetero_reuser/feature_alignment.py +++ b/learnware/reuse/hetero_reuser/feature_alignment.py @@ -16,28 +16,98 @@ from ..base import BaseReuser class FeatureAligner(BaseReuser): + """ + FeatureAligner is a class for aligning features from a user dataset with a target dataset using a learnware model. + It supports both classification and regression tasks and uses a feature alignment trainer for alignment. + + Attributes + ---------- + learnware : Learnware + The learnware model used for final prediction. + mode : str + Operation mode, either "classification" or "regression". + align_arguments : dict + Additional arguments for the feature alignment trainer. + cuda_idx : int + Index of the CUDA device to be used for computations. + device : torch.device + The device (CPU or CUDA) on which computations will be performed. + """ def __init__(self, learnware: Learnware = None, mode: str = None, cuda_idx=0, **align_arguments): - self.learnware=learnware - assert mode in ["classification", "regression"] - self.mode=mode - self.align_arguments=align_arguments - self.cuda_idx=cuda_idx + """ + Initialize the FeatureAligner with a learnware model, mode, CUDA device index, and alignment arguments. + + Parameters + ---------- + learnware : Learnware + A learnware model used for initial predictions. + mode : str + The mode of operation, either "regression" or "classification". + cuda_idx : int + The index of the CUDA device for computations. + align_arguments : dict + Additional arguments to be passed to the feature alignment trainer. 
+ """ + self.learnware = learnware + assert mode in ["classification", "regression"], "Mode must be either 'classification' or 'regression'" + self.mode = mode + self.align_arguments = align_arguments + self.cuda_idx = cuda_idx self.device = choose_device(cuda_idx=cuda_idx) - def fit(self, user_rkme): - target_rkme=self.learnware.specification.get_stat_spec()["RKMETableSpecification"] - trainer=FeatureAlignmentTrainer(target_rkme=target_rkme, user_rkme=user_rkme, cuda_idx=self.cuda_idx, **self.align_arguments) - self.align_model=trainer.model + def fit(self, user_rkme: RKMETableSpecification): + """ + Fit the align model using the RKME (Reduced Kernel Mean Embedding) specifications from the learnware model. + + Parameters + ---------- + user_rkme : RKMETableSpecification + The RKME specification from the user dataset. + """ + target_rkme = self.learnware.specification.get_stat_spec()["RKMETableSpecification"] + trainer = FeatureAlignmentTrainer(target_rkme=target_rkme, user_rkme=user_rkme, cuda_idx=self.cuda_idx, **self.align_arguments) + self.align_model = trainer.model self.align_model.eval() def predict(self, user_data: ndarray) -> ndarray: - user_data=self._fill_data(user_data) - transformed_user_data=self.align_model(torch.tensor(user_data, device=self.device).float()).detach().cpu().numpy() - y_pred=self.learnware.predict(transformed_user_data) + """ + Predict the output for user data using the aligned model and learnware model. + + Parameters + ---------- + user_data : ndarray + Input data for making predictions. + + Returns + ------- + ndarray + Predicted output from the learnware model after alignment.
+ """ + user_data = self._fill_data(user_data) + transformed_user_data = self.align_model(torch.tensor(user_data, device=self.device).float()).detach().cpu().numpy() + y_pred = self.learnware.predict(transformed_user_data) return y_pred - + def _fill_data(self, X: np.ndarray): + """ + Fill missing data (NaN, Inf) in the input array with the mean of the column. + + Parameters + ---------- + X : np.ndarray + Input data array that may contain missing values. + + Returns + ------- + np.ndarray + Data array with missing values filled. + + Raises + ------ + ValueError + If a column in X contains only exceptional values (NaN, Inf). + """ X[np.isinf(X) | np.isneginf(X) | np.isposinf(X) | np.isneginf(X)] = np.nan if np.any(np.isnan(X)): for col in range(X.shape[1]): @@ -45,15 +115,38 @@ class FeatureAligner(BaseReuser): if np.any(is_nan): if np.all(is_nan): raise ValueError(f"All values in column {col} are exceptional, e.g., NaN and Inf.") - # Fill np.nan with np.nanmean col_mean = np.nanmean(X[:, col]) X[:, col] = np.where(is_nan, col_mean, X[:, col]) return X + class FeatureAlignmentModel(nn.Module): + """ + FeatureAlignmentModel is a neural network module designed for feature alignment tasks. + It consists of multiple fully connected (dense) layers, optional dropout and batch normalization layers, + and supports different activation functions. + """ - def __init__(self, input_dim, output_dim, hidden_dims=[1024], activation="relu", dropout_ratio=0, use_bn=False): + def __init__(self, input_dim: int, output_dim: int, hidden_dims: list = [1024], activation: str = "relu", dropout_ratio: float = 0, use_bn: bool = False): + """ + Initialize the FeatureAlignmentModel. + + Parameters + ---------- + input_dim : int + The dimensionality of the input features. + output_dim : int + The dimensionality of the output features. + hidden_dims : List[int], optional + A list specifying the number of units in each hidden layer. + activation : str, optional + The activation function to use. 
Supported options are "relu", "gelu", "selu", and "leakyrelu". + dropout_ratio : float, optional + The dropout ratio applied to each layer (0 means no dropout). + use_bn : bool, optional + Whether to use batch normalization after each fully connected layer. + """ super().__init__() dims = [input_dim] + hidden_dims + [output_dim] self.fc_list = nn.ModuleList() @@ -78,16 +171,66 @@ class FeatureAlignmentModel(nn.Module): else: self.activation = F.relu - def forward(self, x): + def forward(self, x: torch.Tensor) -> torch.Tensor: + """ + Forward pass of the model. + + Parameters + ---------- + x : torch.Tensor + Input tensor. + + Returns + ------- + torch.Tensor + Output tensor after passing through the model. + """ if len(self.fc_list) > 0: for fc, drop in zip(self.fc_list, self.drop_list): - x = fc(x) - x = self.activation(x) - x = drop(x) - return self.final_fc(x) + x = fc(x) # Apply fully connected layer + x = self.activation(x) # Apply activation function + x = drop(x) # Apply dropout + return self.final_fc(x) # Return output from final fully connected layer class FeatureAlignmentTrainer(): + """ + FeatureAlignmentTrainer is a class designed to train a neural network for aligning features from a user dataset + to a target dataset. It utilizes Maximum Mean Discrepancy (MMD) as the loss function for training. + + Attributes + ---------- + target_rkme : RKMETableSpecification + The RKME (Reduced Kernel Mean Embedding) specification of the target dataset. + user_rkme : RKMETableSpecification + The RKME specification of the user dataset. + extra_labeled_data : Any, optional + Additional labeled data for training, if available. + target_learnware : Learnware, optional + The learnware model used for the target dataset. + num_epoch : int + The number of training epochs. + lr : float + Learning rate for the optimizer. + gamma : float + The gamma parameter for the Gaussian kernel in MMD computation.
+ network_type : str + Type of the neural network used for feature alignment. + optimizer_type : str + Type of optimizer to be used in training ('Adam' or 'SGD'). + hidden_dims : List[int] + A list specifying the number of units in each hidden layer. + activation : str + The activation function to use in the network. + dropout_ratio : float + The dropout ratio applied to each layer. + use_bn : bool + Whether to use batch normalization after each fully connected layer. + const : float + A constant value used in training. + cuda_idx : int + Index of the CUDA device to be used for computations. + """ def __init__( self, @@ -107,7 +250,8 @@ class FeatureAlignmentTrainer(): const: float = 1e1, cuda_idx: int = 0 ): - """Training the base mapping network + """ + Initialize the FeatureAlignmentTrainer with the specified parameters. """ self.target_rkme = target_rkme self.user_rkme = user_rkme @@ -129,19 +273,56 @@ class FeatureAlignmentTrainer(): else: self.train() - def gaussian_kernel(self, x1, x2): + def gaussian_kernel(self, x1: torch.Tensor, x2: torch.Tensor): + """ + Compute the Gaussian kernel between two sets of samples. + + Parameters + ---------- + x1 : torch.Tensor + First set of samples. + x2 : torch.Tensor + Second set of samples. + + Returns + ------- + torch.Tensor + The computed Gaussian kernel matrix. + """ x1 = x1.double() x2 = x2.double() X12norm = torch.sum(x1**2, 1, keepdim=True) - 2 * x1 @ x2.T + torch.sum(x2**2, 1, keepdim=True).T return torch.exp(-X12norm * self.args["gamma"]) - def compute_mmd(self, user_X, user_weight, target_X, target_weight): + def compute_mmd(self, user_X: torch.Tensor, user_weight: torch.Tensor, target_X: torch.Tensor, target_weight: torch.Tensor) -> torch.Tensor: + """ + Compute the Maximum Mean Discrepancy (MMD) between the user and target datasets. + + Parameters + ---------- + user_X : torch.Tensor + Transformed user data. + user_weight : torch.Tensor + Weights of the user data. + target_X : torch.Tensor + Target data. 
+ target_weight : torch.Tensor + Weights of the target data. + + Returns + ------- + torch.Tensor + The computed MMD loss. + """ term1 = torch.sum(self.gaussian_kernel(user_X, user_X) * (user_weight.T @ user_weight)) term2 = torch.sum(self.gaussian_kernel(user_X, target_X) * (user_weight.T @ target_weight)) term3 = torch.sum(self.gaussian_kernel(target_X, target_X) * (target_weight.T @ target_weight)) return term1 - 2 * term2 + term3 def train(self): + """ + Train the feature alignment model using MMD as the loss function. + """ args = self.args input_dim = self.user_rkme.get_z().shape[1] output_dim = self.target_rkme.get_z().shape[1] From bb5996c3c5ad49c70a5a48abb4eb62c52fbdf02a Mon Sep 17 00:00:00 2001 From: Gene Date: Sat, 11 Nov 2023 12:40:38 +0800 Subject: [PATCH 37/90] [MNT] modify details about class name and logger --- .../heterogeneous/organizer/__init__.py | 49 ++++++++++--------- .../organizer/hetero_mapping/__init__.py | 6 +-- .../hetero_mapping/feature_extractor.py | 1 - .../organizer/hetero_mapping/trainer.py | 4 +- learnware/market/heterogeneous/searcher.py | 38 +++++++------- .../reuse/hetero_reuser/feature_alignment.py | 45 +++++++++++------ learnware/specification/__init__.py | 4 +- learnware/specification/regular/__init__.py | 2 +- learnware/specification/regular/base.py | 2 +- learnware/specification/regular/image/rkme.py | 4 +- learnware/specification/regular/table/rkme.py | 4 +- learnware/specification/system/__init__.py | 2 +- learnware/specification/system/base.py | 6 +-- learnware/specification/system/heter_table.py | 10 ++-- 14 files changed, 98 insertions(+), 79 deletions(-) diff --git a/learnware/market/heterogeneous/organizer/__init__.py b/learnware/market/heterogeneous/organizer/__init__.py index fdc73fe..51cd12a 100644 --- a/learnware/market/heterogeneous/organizer/__init__.py +++ b/learnware/market/heterogeneous/organizer/__init__.py @@ -15,7 +15,7 @@ import torch.multiprocessing as mp from ....learnware import Learnware, 
get_learnware_from_dirpath from ....logger import get_module_logger -from ....specification.system import HeteroSpecification +from ....specification.system import HeteroMapTableSpecification from ...base import BaseChecker, BaseUserInfo from ...easy import EasyOrganizer from ...easy.database_ops import DatabaseOperations @@ -71,15 +71,15 @@ class HeteroMapTableOrganizer(EasyOrganizer): if not rebuild: if os.path.exists(self.hetero_mappings_path): for hetero_json_path in os.listdir(self.hetero_mappings_path): - idx = hetero_json_path.split('.')[0] - hetero_spec = HeteroSpecification() + idx = hetero_json_path.split(".")[0] + hetero_spec = HeteroMapTableSpecification() hetero_spec.load(os.path.join(self.hetero_mappings_path, f"{idx}.json")) try: - self.learnware_list[idx].update_stat_spec("HeteroSpecification", hetero_spec) + self.learnware_list[idx].update_stat_spec("HeteroMapTableSpecification", hetero_spec) except: logger.warning(f"Learnware ID {idx} NOT Found!") else: - logger.info("No HeteroSpecifications to reload. Use loaded market mapping to regenerate.") + logger.info("No HeteroMapTableSpecification to reload. 
Use loaded market mapping to regenerate.") self._update_learnware_by_ids(self.learnware_list.keys()) else: logger.warning(f"No market mapping to reload!!") @@ -90,7 +90,8 @@ class HeteroMapTableOrganizer(EasyOrganizer): self.auto_update = auto_update self.market_id = market_id self.training_args = kwargs - if auto_update_limit is not None: self.auto_update_limit = auto_update_limit + if auto_update_limit is not None: + self.auto_update_limit = auto_update_limit def add_learnware( self, zip_path: str, semantic_spec: dict, check_status: int, learnware_id: str = None @@ -98,7 +99,7 @@ class HeteroMapTableOrganizer(EasyOrganizer): if check_status == BaseChecker.INVALID_LEARNWARE: logger.warning("Learnware is invalid!") return None, BaseChecker.INVALID_LEARNWARE - + semantic_spec = copy.deepcopy(semantic_spec) logger.info("Get new learnware from %s" % (zip_path)) @@ -123,7 +124,7 @@ class HeteroMapTableOrganizer(EasyOrganizer): except: pass return None, BaseChecker.INVALID_LEARNWARE - + if new_learnware is None: return None, BaseChecker.INVALID_LEARNWARE @@ -143,7 +144,7 @@ class HeteroMapTableOrganizer(EasyOrganizer): self.use_flags[learnware_id] = learnwere_status self._update_learnware_by_ids([learnware_id]) self.count += 1 - self.training_count += ([learnware_id] == self._get_table_type_learnware_ids([learnware_id])) + self.training_count += [learnware_id] == self._get_table_type_learnware_ids([learnware_id]) if self.auto_update and self.training_count - self.last_training_count == self.auto_update_limit + 1: training_learnware_ids = self._get_table_type_learnware_ids(self.get_learnware_ids()) @@ -151,16 +152,16 @@ class HeteroMapTableOrganizer(EasyOrganizer): logger.warning(f"Leanwares for training: {training_learnware_ids}") updated_market_mapping = self.train( - learnware_list=training_learnwares, - save_dir=self.market_store_path, - **self.training_args + learnware_list=training_learnwares, save_dir=self.market_store_path, **self.training_args + ) + + 
logger.warning( + f"Market mapping train completed. Now update HeteroMapTableSpecification for {training_learnware_ids}" ) - - logger.warning(f"Market mapping train completed. Now update HeteroSpecification for {training_learnware_ids}") self.market_mapping = updated_market_mapping self._update_learnware_by_ids(training_learnware_ids) self.last_training_count = len(training_learnware_ids) - + return learnware_id, learnwere_status @staticmethod @@ -178,7 +179,7 @@ class HeteroMapTableOrganizer(EasyOrganizer): market_mapping_trainer.save_model(output_dir=save_dir) return market_mapping - + def _update_learnware_by_ids(self, ids: List[str]): ids = self._get_table_type_learnware_ids(ids) for id in ids: @@ -187,14 +188,14 @@ class HeteroMapTableOrganizer(EasyOrganizer): semantic_spec, stat_spec = spec.get_semantic_spec(), spec.get_stat_spec()["RKMETableSpecification"] features = semantic_spec["Input"]["Description"].values() hetero_spec = self.market_mapping.hetero_mapping(stat_spec, features) - self.learnware_list[id].update_stat_spec("HeteroSpecification", hetero_spec) - + self.learnware_list[id].update_stat_spec("HeteroMapTableSpecification", hetero_spec) + save_path = os.path.join(self.hetero_mappings_path, f"{id}.json") hetero_spec.save(save_path) except Exception as err: - logger.warning(f"Learnware {id} generate HeteroSpecification failed! Due to {err}") - - def generate_hetero_map_spec(self, user_info: BaseUserInfo) -> HeteroSpecification: + logger.warning(f"Learnware {id} generate HeteroMapTableSpecification failed! 
Due to {err}") + + def generate_hetero_map_spec(self, user_info: BaseUserInfo) -> HeteroMapTableSpecification: user_stat_spec = user_info.stat_info["RKMETableSpecification"] user_features = user_info.get_semantic_spec()["Input"]["Description"].values() @@ -210,13 +211,13 @@ class HeteroMapTableOrganizer(EasyOrganizer): features = spec.get_semantic_spec()["Input"]["Description"] learnware_df = pd.DataFrame(data=stat_spec.get_z(), columns=features.values()) learnware_df_dict[tuple(sorted(features))].append(learnware_df) - + return [pd.concat(dfs) for dfs in learnware_df_dict.values()] - + def _get_table_type_learnware_ids(self, ids: List[str]) -> List[str]: ret = [] for id in ids: semantic_spec = self.learnware_list[id].get_specification().get_semantic_spec() if semantic_spec["Data"]["Values"][0] == "Table": ret.append(id) - return ret \ No newline at end of file + return ret diff --git a/learnware/market/heterogeneous/organizer/hetero_mapping/__init__.py b/learnware/market/heterogeneous/organizer/hetero_mapping/__init__.py index 69b7b99..231dfff 100644 --- a/learnware/market/heterogeneous/organizer/hetero_mapping/__init__.py +++ b/learnware/market/heterogeneous/organizer/hetero_mapping/__init__.py @@ -7,7 +7,7 @@ import torch import torch.nn.functional as F from torch import Tensor, nn -from .....specification import HeteroSpecification, RKMETableSpecification +from .....specification import HeteroMapTableSpecification, RKMETableSpecification from .feature_extractor import * from .trainer import Trainer, TransTabCollatorForCL @@ -147,8 +147,8 @@ class HeteroMapping(nn.Module): loss = self._self_supervised_contrastive_loss(feat_x_multiview) return loss - def hetero_mapping(self, rkme_spec: RKMETableSpecification, cols: List[str]) -> HeteroSpecification: - hetero_spec = HeteroSpecification() + def hetero_mapping(self, rkme_spec: RKMETableSpecification, cols: List[str]) -> HeteroMapTableSpecification: + hetero_spec = HeteroMapTableSpecification() hetero_input_df = 
pd.DataFrame(data=rkme_spec.get_z(), columns=cols) hetero_embedding = self._extract_batch_features(hetero_input_df) hetero_spec.generate_stat_spec_from_system(hetero_embedding, rkme_spec) diff --git a/learnware/market/heterogeneous/organizer/hetero_mapping/feature_extractor.py b/learnware/market/heterogeneous/organizer/hetero_mapping/feature_extractor.py index 1c23587..66eea79 100644 --- a/learnware/market/heterogeneous/organizer/hetero_mapping/feature_extractor.py +++ b/learnware/market/heterogeneous/organizer/hetero_mapping/feature_extractor.py @@ -6,7 +6,6 @@ from typing import Dict import numpy as np import torch import torch.nn.init as nn_init -from loguru import logger from torch import Tensor, nn from transformers import BertTokenizerFast diff --git a/learnware/market/heterogeneous/organizer/hetero_mapping/trainer.py b/learnware/market/heterogeneous/organizer/hetero_mapping/trainer.py index 3667f59..e9083a0 100644 --- a/learnware/market/heterogeneous/organizer/hetero_mapping/trainer.py +++ b/learnware/market/heterogeneous/organizer/hetero_mapping/trainer.py @@ -6,12 +6,14 @@ import time import numpy as np import pandas as pd import torch -from loguru import logger from torch import nn from torch.utils.data import DataLoader, Dataset from tqdm.autonotebook import trange from .feature_extractor import FeatureTokenizer +from .....logger import get_module_logger + +logger = get_module_logger("hetero_mapping_trainer") class Trainer: diff --git a/learnware/market/heterogeneous/searcher.py b/learnware/market/heterogeneous/searcher.py index 9782cf7..c30ad1e 100644 --- a/learnware/market/heterogeneous/searcher.py +++ b/learnware/market/heterogeneous/searcher.py @@ -4,7 +4,7 @@ import numpy as np from ...learnware import Learnware from ...logger import get_module_logger -from ...specification import HeteroSpecification +from ...specification import HeteroMapTableSpecification from ..base import BaseSearcher, BaseUserInfo from ..easy import EasySearcher from ..utils 
import parse_specification_type @@ -34,28 +34,28 @@ class HeteroMapTableSearcher(EasySearcher): return [(max_dist - dist) / (max_dist - dist_epsilon) for dist in dist_list] def _search_by_hetero_spec_single( - self, - learnware_list: List[Learnware], - user_hetero_spec: HeteroSpecification + self, learnware_list: List[Learnware], user_hetero_spec: HeteroMapTableSpecification ) -> Tuple[List[float], List[Learnware]]: - hetero_spec_list = [learnware.specification.get_stat_spec_by_name("HeteroSpecification") for learnware in learnware_list] + hetero_spec_list = [ + learnware.specification.get_stat_spec_by_name("HeteroMapTableSpecification") for learnware in learnware_list + ] mmd_dist_list = [] for idx, hetero_spec in enumerate(hetero_spec_list): mmd_dist = hetero_spec.dist(user_hetero_spec) mmd_dist_list.append(mmd_dist) - + sorted_idx_list = sorted(range(len(learnware_list)), key=lambda k: mmd_dist_list[k]) sorted_dist_list = [mmd_dist_list[idx] for idx in sorted_idx_list] sorted_learnware_list = [learnware_list[idx] for idx in sorted_idx_list] return sorted_dist_list, sorted_learnware_list - + def _filter_by_hetero_spec_single( self, sorted_score_list: List[float], learnware_list: List[Learnware], filter_score: float = 0.5, - min_num: int = 5 + min_num: int = 5, ) -> Tuple[List[float], List[Learnware]]: idx = min(min_num, len(learnware_list)) while idx < len(learnware_list): @@ -64,11 +64,10 @@ class HeteroMapTableSearcher(EasySearcher): idx += 1 return sorted_score_list[:idx], learnware_list[:idx] - def __call__( - self, - learnware_list: List[Learnware], - user_info: BaseUserInfo, + self, + learnware_list: List[Learnware], + user_info: BaseUserInfo, ) -> Tuple[List[float], List[Learnware], float, List[Learnware]]: # todo: use specially assigned search_gamma for calculating mmd dist user_hetero_spec = self.learnware_oganizer.generate_hetero_map_spec(user_info) @@ -88,6 +87,7 @@ class HeteroMapTableSearcher(EasySearcher): def reset(self, organizer): 
self.learnware_oganizer = organizer + class HeteroSearcher(EasySearcher): def __init__(self, organizer: HeteroMapTableOrganizer = None): super(HeteroSearcher, self).__init__(organizer) @@ -96,7 +96,7 @@ class HeteroSearcher(EasySearcher): def reset(self, organizer): super().reset(organizer) self.hetero_stat_searcher.reset(organizer) - + @staticmethod def check_user_info(user_info: BaseUserInfo): try: @@ -105,7 +105,9 @@ class HeteroSearcher(EasySearcher): user_task_type = user_info.get_semantic_spec()["Task"]["Values"] if user_task_type not in [["Classification"], ["Regression"]]: - logger.warning("User doesn't provide correct task type, it must be either Classification or Regression.") + logger.warning( + "User doesn't provide correct task type, it must be either Classification or Regression." + ) return False user_input_description = user_info.get_semantic_spec()["Input"] @@ -115,10 +117,12 @@ class HeteroSearcher(EasySearcher): if user_input_shape != user_description_dim or user_input_shape != user_description_feature_num: logger.warning("User data feature dimensions mismatch with semantic specification.") return False - + return True except Exception as e: - logger.info(f"Invalid heterogeneous search information provided. Use homogeneous search instead. Error: {e}") + logger.info( + f"Invalid heterogeneous search information provided. Use homogeneous search instead. 
Error: {e}" + ) return False def __call__( @@ -136,4 +140,4 @@ class HeteroSearcher(EasySearcher): else: return self.stat_searcher(learnware_list, user_info, max_search_num, search_method) else: - return None, learnware_list, 0.0, None \ No newline at end of file + return None, learnware_list, 0.0, None diff --git a/learnware/reuse/hetero_reuser/feature_alignment.py b/learnware/reuse/hetero_reuser/feature_alignment.py index def7764..7f81187 100644 --- a/learnware/reuse/hetero_reuser/feature_alignment.py +++ b/learnware/reuse/hetero_reuser/feature_alignment.py @@ -6,13 +6,15 @@ import torch.nn.functional as F import torch import time from tqdm import trange -from loguru import logger from learnware.learnware import Learnware from learnware.specification import RKMETableSpecification from learnware.specification.regular.table.rkme import choose_device from ..base import BaseReuser +from ...logger import get_module_logger + +logger = get_module_logger("hetero_feature_alignment") class FeatureAligner(BaseReuser): @@ -66,7 +68,9 @@ class FeatureAligner(BaseReuser): The RKME specification from the user dataset. """ target_rkme = self.learnware.specification.get_stat_spec()["RKMETableSpecification"] - trainer = FeatureAlignmentTrainer(target_rkme=target_rkme, user_rkme=user_rkme, cuda_idx=self.cuda_idx, **self.align_arguments) + trainer = FeatureAlignmentTrainer( + target_rkme=target_rkme, user_rkme=user_rkme, cuda_idx=self.cuda_idx, **self.align_arguments + ) self.align_model = trainer.model self.align_model.eval() @@ -85,7 +89,9 @@ class FeatureAligner(BaseReuser): Predicted output from the learnware model after alignment. 
""" user_data = self._fill_data(user_data) - transformed_user_data = self.align_model(torch.tensor(user_data, device=self.device).float()).detach().cpu().numpy() + transformed_user_data = ( + self.align_model(torch.tensor(user_data, device=self.device).float()).detach().cpu().numpy() + ) y_pred = self.learnware.predict(transformed_user_data) return y_pred @@ -120,7 +126,6 @@ class FeatureAligner(BaseReuser): return X - class FeatureAlignmentModel(nn.Module): """ FeatureAlignmentModel is a neural network module designed for feature alignment tasks. @@ -128,7 +133,15 @@ class FeatureAlignmentModel(nn.Module): and supports different activation functions. """ - def __init__(self, input_dim: int, output_dim: int, hidden_dims: list = [1024], activation: str = "relu", dropout_ratio: float = 0, use_bn: bool = False): + def __init__( + self, + input_dim: int, + output_dim: int, + hidden_dims: list = [1024], + activation: str = "relu", + dropout_ratio: float = 0, + use_bn: bool = False, + ): """ Initialize the FeatureAlignmentModel. @@ -187,13 +200,13 @@ class FeatureAlignmentModel(nn.Module): """ if len(self.fc_list) > 0: for fc, drop in zip(self.fc_list, self.drop_list): - x = fc(x) # Apply fully connected layer + x = fc(x) # Apply fully connected layer x = self.activation(x) # Apply activation function - x = drop(x) # Apply dropout + x = drop(x) # Apply dropout return self.final_fc(x) # Return output from final fully connected layer - -class FeatureAlignmentTrainer(): + +class FeatureAlignmentTrainer: """ FeatureAlignmentTrainer is a class designed to train a neural network for aligning features from a user dataset to a target dataset. It utilizes Maximum Mean Discrepancy (MMD) as the loss function for training. @@ -248,7 +261,7 @@ class FeatureAlignmentTrainer(): dropout_ratio: float = 0, use_bn: bool = False, const: float = 1e1, - cuda_idx: int = 0 + cuda_idx: int = 0, ): """ Initialize the FeatureAlignmentTrainer with the specified parameters. 
@@ -266,7 +279,7 @@ class FeatureAlignmentTrainer(): } self.network_type = network_type self.optimizer_type = optimizer_type - self.const=const + self.const = const self.device = choose_device(cuda_idx=cuda_idx) if extra_labeled_data is not None and target_learnware is not None: self.train_with_labeled_data(extra_labeled_data[0], extra_labeled_data[1], target_learnware) @@ -294,7 +307,9 @@ class FeatureAlignmentTrainer(): X12norm = torch.sum(x1**2, 1, keepdim=True) - 2 * x1 @ x2.T + torch.sum(x2**2, 1, keepdim=True).T return torch.exp(-X12norm * self.args["gamma"]) - def compute_mmd(self, user_X: torch.Tensor, user_weight: torch.Tensor, target_X: torch.Tensor, target_weight: torch.Tensor) -> torch.Tensor: + def compute_mmd( + self, user_X: torch.Tensor, user_weight: torch.Tensor, target_X: torch.Tensor, target_weight: torch.Tensor + ) -> torch.Tensor: """ Compute the Maximum Mean Discrepancy (MMD) between the user and target datasets. @@ -327,7 +342,9 @@ class FeatureAlignmentTrainer(): input_dim = self.user_rkme.get_z().shape[1] output_dim = self.target_rkme.get_z().shape[1] - user_model=FeatureAlignmentModel(input_dim, output_dim, args["hidden_dims"], args["activation"], args["dropout_ratio"], args["use_bn"]) + user_model = FeatureAlignmentModel( + input_dim, output_dim, args["hidden_dims"], args["activation"], args["dropout_ratio"], args["use_bn"] + ) # device = torch.device("cuda" if torch.cuda.is_available() else "cpu") user_model.to(self.device) @@ -355,4 +372,4 @@ class FeatureAlignmentTrainer(): ) self.model = user_model - logger.info("training complete, cost {:.1f} secs.".format(time.time() - start_time)) \ No newline at end of file + logger.info("training complete, cost {:.1f} secs.".format(time.time() - start_time)) diff --git a/learnware/specification/__init__.py b/learnware/specification/__init__.py index fae0c7c..0332890 100644 --- a/learnware/specification/__init__.py +++ b/learnware/specification/__init__.py @@ -1,13 +1,13 @@ from .base import 
Specification, BaseStatSpecification from .regular import ( - RegularStatsSpecification, + RegularStatSpecification, RKMEStatSpecification, RKMETableSpecification, RKMEImageSpecification, RKMETextSpecification, ) -from .system import HeteroSpecification +from .system import HeteroMapTableSpecification from ..utils import is_torch_avaliable diff --git a/learnware/specification/regular/__init__.py b/learnware/specification/regular/__init__.py index 9d46114..bd01aa9 100644 --- a/learnware/specification/regular/__init__.py +++ b/learnware/specification/regular/__init__.py @@ -1,4 +1,4 @@ -from .base import RegularStatsSpecification +from .base import RegularStatSpecification from ...utils import is_torch_avaliable from .text import RKMETextSpecification diff --git a/learnware/specification/regular/base.py b/learnware/specification/regular/base.py index 6916177..1960f0d 100644 --- a/learnware/specification/regular/base.py +++ b/learnware/specification/regular/base.py @@ -3,7 +3,7 @@ from __future__ import annotations from ..base import BaseStatSpecification -class RegularStatsSpecification(BaseStatSpecification): +class RegularStatSpecification(BaseStatSpecification): def generate_stat_spec(self, **kwargs): self.generate_stat_spec_from_data(**kwargs) diff --git a/learnware/specification/regular/image/rkme.py b/learnware/specification/regular/image/rkme.py index 4421f91..e935afa 100644 --- a/learnware/specification/regular/image/rkme.py +++ b/learnware/specification/regular/image/rkme.py @@ -17,11 +17,11 @@ from torchvision.transforms import Resize from tqdm import tqdm from . 
import cnn_gp -from ..base import RegularStatsSpecification +from ..base import RegularStatSpecification from ..table.rkme import solve_qp, choose_device, setup_seed -class RKMEImageSpecification(RegularStatsSpecification): +class RKMEImageSpecification(RegularStatSpecification): # INNER_PRODUCT_COUNT = 0 IMAGE_WIDTH = 32 diff --git a/learnware/specification/regular/table/rkme.py b/learnware/specification/regular/table/rkme.py index 17aedc1..d0f113b 100644 --- a/learnware/specification/regular/table/rkme.py +++ b/learnware/specification/regular/table/rkme.py @@ -20,7 +20,7 @@ try: except ImportError: _FAISS_INSTALLED = False -from ..base import RegularStatsSpecification +from ..base import RegularStatSpecification from ....logger import get_module_logger logger = get_module_logger("rkme") @@ -31,7 +31,7 @@ if not _FAISS_INSTALLED: ) -class RKMETableSpecification(RegularStatsSpecification): +class RKMETableSpecification(RegularStatSpecification): """Reduced Kernel Mean Embedding (RKME) Specification""" def __init__(self, gamma: float = 0.1, cuda_idx: int = -1): diff --git a/learnware/specification/system/__init__.py b/learnware/specification/system/__init__.py index 1a8b6ca..e57a155 100644 --- a/learnware/specification/system/__init__.py +++ b/learnware/specification/system/__init__.py @@ -1 +1 @@ -from .heter_table import HeteroSpecification +from .heter_table import HeteroMapTableSpecification diff --git a/learnware/specification/system/base.py b/learnware/specification/system/base.py index 12a7226..281f682 100644 --- a/learnware/specification/system/base.py +++ b/learnware/specification/system/base.py @@ -1,11 +1,7 @@ -from __future__ import annotations - -from loguru import logger - from ..base import BaseStatSpecification -class SystemStatsSpecification(BaseStatSpecification): +class SystemStatSpecification(BaseStatSpecification): def generate_stat_spec(self, **kwargs): self.generate_stat_spec_from_system(**kwargs) diff --git 
a/learnware/specification/system/heter_table.py b/learnware/specification/system/heter_table.py index a574daf..5329a06 100644 --- a/learnware/specification/system/heter_table.py +++ b/learnware/specification/system/heter_table.py @@ -10,10 +10,10 @@ import torch from ..regular import RKMETableSpecification from ..regular.table.rkme import choose_device, setup_seed, torch_rbf_kernel -from .base import SystemStatsSpecification +from .base import SystemStatSpecification -class HeteroSpecification(SystemStatsSpecification): +class HeteroMapTableSpecification(SystemStatSpecification): """Heterogeneous Embedding Specification""" def __init__(self, gamma: float = 0.1, cuda_idx: int = -1): @@ -26,7 +26,7 @@ class HeteroSpecification(SystemStatsSpecification): torch.cuda.empty_cache() self.device = choose_device(cuda_idx=cuda_idx) setup_seed(0) - super(HeteroSpecification, self).__init__(type=self.__class__.__name__) + super(HeteroMapTableSpecification, self).__init__(type=self.__class__.__name__) def get_z(self) -> np.ndarray: return self.z.detach().cpu().numpy() @@ -38,7 +38,7 @@ class HeteroSpecification(SystemStatsSpecification): self.beta = rkme_spec.beta.to(self.device) self.z = torch.from_numpy(heter_embedding).double().to(self.device) - def inner_prod(self, Embed2: HeteroSpecification) -> float: + def inner_prod(self, Embed2: HeteroMapTableSpecification) -> float: beta_1 = self.beta.reshape(1, -1).double().to(self.device) beta_2 = Embed2.beta.reshape(1, -1).double().to(self.device) Z1 = self.z.double().reshape(self.z.shape[0], -1).to(self.device) @@ -47,7 +47,7 @@ class HeteroSpecification(SystemStatsSpecification): return float(v) - def dist(self, Embed2: HeteroSpecification, omit_term1: bool = False) -> float: + def dist(self, Embed2: HeteroMapTableSpecification, omit_term1: bool = False) -> float: term1 = 0 if omit_term1 else self.inner_prod(self) term2 = self.inner_prod(Embed2) term3 = Embed2.inner_prod(Embed2) From 73d6c916c1a73f7d1e681cb5015428d9c1599af1 Mon 
Sep 17 00:00:00 2001 From: Peng Tan Date: Sat, 11 Nov 2023 16:53:45 +0800 Subject: [PATCH 38/90] [FIX] fix bugs for logistic regression not converge in default max_iter iterations --- learnware/reuse/feature_augment_reuser.py | 2 +- tests/test_market/test_hetero_market/test_hetero.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/learnware/reuse/feature_augment_reuser.py b/learnware/reuse/feature_augment_reuser.py index eaf6c43..f0c5000 100644 --- a/learnware/reuse/feature_augment_reuser.py +++ b/learnware/reuse/feature_augment_reuser.py @@ -68,7 +68,7 @@ class FeatureAugmentReuser(BaseReuser): ridge_cv.fit(x_train_aug, y_train) self.output_aligner = ridge_cv elif self.mode == "classification": - self.output_aligner = LogisticRegressionCV() + self.output_aligner = LogisticRegressionCV(cv=5, max_iter=1000, random_state=0, multi_class="auto") self.output_aligner.fit(x_train_aug, y_train) def _fill_data(self, X: np.ndarray): diff --git a/tests/test_market/test_hetero_market/test_hetero.py b/tests/test_market/test_hetero_market/test_hetero.py index a2d7c3d..1d73b2c 100644 --- a/tests/test_market/test_hetero_market/test_hetero.py +++ b/tests/test_market/test_hetero_market/test_hetero.py @@ -255,7 +255,7 @@ class TestMarket(unittest.TestCase): for score, learnware in zip(sorted_score_list, single_learnware_list): print(f"score: {score}, learnware_id: {learnware.id}") - # empty value of key "Task" in semantic_spec, use homo search and print + # empty value of key "Task" in semantic_spec, use homo search and print invalid semantic_spec print(">> test for key 'Task' has empty 'Values':") semantic_spec["Task"]={"Values":{}} From 186f7e576aefb31ffdf145a4618d20889eb367c1 Mon Sep 17 00:00:00 2001 From: Gene Date: Sat, 11 Nov 2023 23:29:20 +0800 Subject: [PATCH 39/90] [MNT] remove extra organizer --- learnware/market/heterogeneous/organizer.py | 97 --------------------- 1 file changed, 97 deletions(-) delete mode 100644 
learnware/market/heterogeneous/organizer.py diff --git a/learnware/market/heterogeneous/organizer.py b/learnware/market/heterogeneous/organizer.py deleted file mode 100644 index 15e9ea4..0000000 --- a/learnware/market/heterogeneous/organizer.py +++ /dev/null @@ -1,97 +0,0 @@ -from typing import List - -import numpy as np - -from ...learnware import Learnware -from ..evolve.organizer import EvolvedOrganizer - - -class MappingFunction: - def __init__(self) -> None: - pass - - def transform(X: np.ndarray) -> np.ndarray: - """transform the data in one feature space to another feature space. - - Parameters - ---------- - X : np.ndarray - data in one feature space - - Returns - ------- - np.ndarray - transformed data in other feature space - """ - pass - - -class HeterogeneousOrganizer(EvolvedOrganizer): - """Organize learnwares with heterogeneous feature spaces, organizer version with evolved learnwares""" - - def __init__(self, *args, **kwargs): - super(HeterogeneousOrganizer, self).__init__(*args, **kwargs) - self.mapping_function_list = {} - - def _mapping_function_list_initialization(self, learnware_list: List[Learnware]): - """Initialize mapping functions with all submitted learnwares - - Parameters - ---------- - learnware_list : List[Learnware] - list of learnwares - """ - self.mapping_function_list = self.learn_mapping_functions(learnware_list) - - def learn_mapping_functions(self, learnware_list: List[Learnware]) -> List[MappingFunction]: - """Use all statistical specifications of submitted learnwares to generate mapping functions from each original feature space to subsapce and vice verse. - - Parameters - ---------- - learnware_list : List[Learnware] - list of learnwares - - Returns - ------- - List[MappingFunction] - list of mapping functions - """ - pass - - def transform_original_to_subspace( - self, original_feature_space_idx: int, original_feature: np.ndarray - ) -> np.ndarray: - """Transform feature in a original feature space to the subspace. 
- - Parameters - ---------- - original_feature_space_idx : int - index of the original feature space - original_feature : np.ndarray - data in the original feature space - - Returns - ------- - np.ndarray - mapped data in the subspace - """ - pass - - def transform_subspace_to_original( - self, original_feature_space_idx: int, subspace_feature: np.ndarray - ) -> np.ndarray: - """Transform feature in the subspace to a original feature space. - - Parameters - ---------- - original_feature_space_idx : int - index of the original feature space - subspace_feature : np.ndarray - data in the subspace - - Returns - ------- - np.ndarray - mapped data in the original feature space - """ - pass From e8ba40cf0fc9cc218f27dc1afa14b87045ea7d28 Mon Sep 17 00:00:00 2001 From: Gene Date: Sat, 11 Nov 2023 23:35:26 +0800 Subject: [PATCH 40/90] [MNT] add checker list --- learnware/market/module.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/learnware/market/module.py b/learnware/market/module.py index 290755a..d1cd304 100644 --- a/learnware/market/module.py +++ b/learnware/market/module.py @@ -11,10 +11,10 @@ def get_market_config(): "checker_list": [EasySemanticChecker(), EasyStatChecker()], }, "hetero": { - "organizer": HeteroMapTableOrganizer(), - "searcher": HeteroSearcher(), - "checker_list": [] - } + "organizer": HeteroMapTableOrganizer(), + "searcher": HeteroSearcher(), + "checker_list": [EasySemanticChecker(), EasyStatChecker()], + }, } return market_config From 05b6fd16c9dd725495643cdec9dabd9aab49baea Mon Sep 17 00:00:00 2001 From: Gene Date: Sat, 11 Nov 2023 23:45:27 +0800 Subject: [PATCH 41/90] [MNT] modify details and format code --- learnware/reuse/feature_augment_reuser.py | 1 + learnware/reuse/hetero_reuser/__init__.py | 4 +- .../example_learnwares/config.py | 30 +++---- .../example_learnware_0/__init__.py | 8 +- .../example_learnware_0/learnware.yaml | 0 .../example_learnware_0/requirements.txt | 0 .../example_learnware_1/__init__.py | 8 +- 
.../example_learnware_1/learnware.yaml | 0 .../example_learnware_1/requirements.txt | 0 .../test_hetero_market/test_hetero.py | 85 +++++++++++-------- tests/test_workflow/test_workflow.py | 8 +- 11 files changed, 77 insertions(+), 67 deletions(-) rename tests/{test_market => }/test_hetero_market/example_learnwares/config.py (90%) rename tests/{test_market => }/test_hetero_market/example_learnwares/example_learnware_0/__init__.py (82%) rename tests/{test_market => }/test_hetero_market/example_learnwares/example_learnware_0/learnware.yaml (100%) rename tests/{test_market => }/test_hetero_market/example_learnwares/example_learnware_0/requirements.txt (100%) rename tests/{test_market => }/test_hetero_market/example_learnwares/example_learnware_1/__init__.py (82%) rename tests/{test_market => }/test_hetero_market/example_learnwares/example_learnware_1/learnware.yaml (100%) rename tests/{test_market => }/test_hetero_market/example_learnwares/example_learnware_1/requirements.txt (100%) rename tests/{test_market => }/test_hetero_market/test_hetero.py (85%) diff --git a/learnware/reuse/feature_augment_reuser.py b/learnware/reuse/feature_augment_reuser.py index f0c5000..b0e5ee6 100644 --- a/learnware/reuse/feature_augment_reuser.py +++ b/learnware/reuse/feature_augment_reuser.py @@ -4,6 +4,7 @@ from sklearn.linear_model import RidgeCV, LogisticRegressionCV from .base import BaseReuser from learnware.learnware import Learnware + class FeatureAugmentReuser(BaseReuser): """ FeatureAugmentReuser is a class for augmenting features using predictions of a given learnware model and applying regression or classification on the augmented dataset. 
diff --git a/learnware/reuse/hetero_reuser/__init__.py b/learnware/reuse/hetero_reuser/__init__.py index 91edd7a..4a252ec 100644 --- a/learnware/reuse/hetero_reuser/__init__.py +++ b/learnware/reuse/hetero_reuser/__init__.py @@ -51,7 +51,9 @@ class HeteroMapTableReuser(BaseReuser): user_rkme : RKMETableSpecification The RKME specification from the user dataset. """ - self.feature_aligner = FeatureAligner(learnware=self.learnware, mode=self.mode, cuda_idx=self.cuda_idx, **self.align_arguments) + self.feature_aligner = FeatureAligner( + learnware=self.learnware, mode=self.mode, cuda_idx=self.cuda_idx, **self.align_arguments + ) self.feature_aligner.fit(user_rkme) self.reuser = self.feature_aligner diff --git a/tests/test_market/test_hetero_market/example_learnwares/config.py b/tests/test_hetero_market/example_learnwares/config.py similarity index 90% rename from tests/test_market/test_hetero_market/example_learnwares/config.py rename to tests/test_hetero_market/example_learnwares/config.py index b4d4fb4..1816b4c 100644 --- a/tests/test_market/test_hetero_market/example_learnwares/config.py +++ b/tests/test_hetero_market/example_learnwares/config.py @@ -1,9 +1,9 @@ -input_shape_list=[20, 30] # 20-input shape of example learnware 0, 30-input shape of example learnware 1 +input_shape_list = [20, 30] # 20-input shape of example learnware 0, 30-input shape of example learnware 1 -input_description_list=[ +input_description_list = [ { "Dimension": 20, - "Description": { # medical description + "Description": { # medical description "0": "baseline value: Baseline Fetal Heart Rate (FHR)", "1": "accelerations: Number of accelerations per second", "2": "fetal_movement: Number of fetal movements per second", @@ -23,12 +23,12 @@ input_description_list=[ "16": "histogram_mode: Hist mode", "17": "histogram_mean: Hist mean", "18": "histogram_median: Hist Median", - "19": "histogram_variance: Hist variance" + "19": "histogram_variance: Hist variance", }, }, { "Dimension": 30, - 
"Description": { # business description + "Description": { # business description "0": "This is a consecutive month number, used for convenience. For example, January 2013 is 0, February 2013 is 1,..., October 2015 is 33.", "1": "This is the unique identifier for each shop.", "2": "This is the unique identifier for each item.", @@ -58,32 +58,28 @@ input_description_list=[ "26": "This is the average count of items of the same subtype sold in the shop one month ago.", "27": "This is the average count of items sold in the same city one month ago.", "28": "This is the average count of this type of item sold in the same city one month ago.", - "29": "This is the average count of items of the same type sold one month ago." + "29": "This is the average count of items of the same type sold one month ago.", }, }, - ] -output_description_list=[ +output_description_list = [ { "Dimension": 1, - "Description": { # medical description - "0": "length of stay: Length of hospital stay (days)" - }, + "Description": {"0": "length of stay: Length of hospital stay (days)"}, # medical description }, { "Dimension": 1, - "Description": { # business description + "Description": { # business description "0": "sales of the item in the next day: Number of items sold in the next day" }, }, - ] -user_description_list=[ +user_description_list = [ { "Dimension": 15, - "Description": { # medical description + "Description": { # medical description "0": "Whether the patient is on thyroxine medication (0: No, 1: Yes)", "1": "Whether the patient has been queried about thyroxine medication (0: No, 1: Yes)", "2": "Whether the patient is on antithyroid medication (0: No, 1: Yes)", @@ -98,7 +94,7 @@ user_description_list=[ "11": "Whether TSH (Thyroid Stimulating Hormone) level has been measured (0: No, 1: Yes)", "12": "Whether T3 (Triiodothyronine) level has been measured (0: No, 1: Yes)", "13": "Whether TT4 (Total Thyroxine) level has been measured (0: No, 1: Yes)", - "14": "Whether T4U (Thyroxine 
Utilization) level has been measured (0: No, 1: Yes)" + "14": "Whether T4U (Thyroxine Utilization) level has been measured (0: No, 1: Yes)", }, } -] \ No newline at end of file +] diff --git a/tests/test_market/test_hetero_market/example_learnwares/example_learnware_0/__init__.py b/tests/test_hetero_market/example_learnwares/example_learnware_0/__init__.py similarity index 82% rename from tests/test_market/test_hetero_market/example_learnwares/example_learnware_0/__init__.py rename to tests/test_hetero_market/example_learnwares/example_learnware_0/__init__.py index e9c6cf0..ea21917 100644 --- a/tests/test_market/test_hetero_market/example_learnwares/example_learnware_0/__init__.py +++ b/tests/test_hetero_market/example_learnwares/example_learnware_0/__init__.py @@ -8,15 +8,15 @@ class MyModel(BaseModel): def __init__(self): super(MyModel, self).__init__(input_shape=(20,), output_shape=(1,)) dir_path = os.path.dirname(os.path.abspath(__file__)) - model_path=os.path.join(dir_path, "ridge.pkl") + model_path = os.path.join(dir_path, "ridge.pkl") model = joblib.load(model_path) - self.model=model + self.model = model def fit(self, X: np.ndarray, y: np.ndarray): pass def predict(self, X: np.ndarray) -> np.ndarray: return self.model.predict(X) - + def finetune(self, X: np.ndarray, y: np.ndarray): - pass \ No newline at end of file + pass diff --git a/tests/test_market/test_hetero_market/example_learnwares/example_learnware_0/learnware.yaml b/tests/test_hetero_market/example_learnwares/example_learnware_0/learnware.yaml similarity index 100% rename from tests/test_market/test_hetero_market/example_learnwares/example_learnware_0/learnware.yaml rename to tests/test_hetero_market/example_learnwares/example_learnware_0/learnware.yaml diff --git a/tests/test_market/test_hetero_market/example_learnwares/example_learnware_0/requirements.txt b/tests/test_hetero_market/example_learnwares/example_learnware_0/requirements.txt similarity index 100% rename from 
tests/test_market/test_hetero_market/example_learnwares/example_learnware_0/requirements.txt rename to tests/test_hetero_market/example_learnwares/example_learnware_0/requirements.txt diff --git a/tests/test_market/test_hetero_market/example_learnwares/example_learnware_1/__init__.py b/tests/test_hetero_market/example_learnwares/example_learnware_1/__init__.py similarity index 82% rename from tests/test_market/test_hetero_market/example_learnwares/example_learnware_1/__init__.py rename to tests/test_hetero_market/example_learnwares/example_learnware_1/__init__.py index 934e352..11fb9e0 100644 --- a/tests/test_market/test_hetero_market/example_learnwares/example_learnware_1/__init__.py +++ b/tests/test_hetero_market/example_learnwares/example_learnware_1/__init__.py @@ -8,15 +8,15 @@ class MyModel(BaseModel): def __init__(self): super(MyModel, self).__init__(input_shape=(30,), output_shape=(1,)) dir_path = os.path.dirname(os.path.abspath(__file__)) - model_path=os.path.join(dir_path, "ridge.pkl") + model_path = os.path.join(dir_path, "ridge.pkl") model = joblib.load(model_path) - self.model=model + self.model = model def fit(self, X: np.ndarray, y: np.ndarray): pass def predict(self, X: np.ndarray) -> np.ndarray: return self.model.predict(X) - + def finetune(self, X: np.ndarray, y: np.ndarray): - pass \ No newline at end of file + pass diff --git a/tests/test_market/test_hetero_market/example_learnwares/example_learnware_1/learnware.yaml b/tests/test_hetero_market/example_learnwares/example_learnware_1/learnware.yaml similarity index 100% rename from tests/test_market/test_hetero_market/example_learnwares/example_learnware_1/learnware.yaml rename to tests/test_hetero_market/example_learnwares/example_learnware_1/learnware.yaml diff --git a/tests/test_market/test_hetero_market/example_learnwares/example_learnware_1/requirements.txt b/tests/test_hetero_market/example_learnwares/example_learnware_1/requirements.txt similarity index 100% rename from 
tests/test_market/test_hetero_market/example_learnwares/example_learnware_1/requirements.txt rename to tests/test_hetero_market/example_learnwares/example_learnware_1/requirements.txt diff --git a/tests/test_market/test_hetero_market/test_hetero.py b/tests/test_hetero_market/test_hetero.py similarity index 85% rename from tests/test_market/test_hetero_market/test_hetero.py rename to tests/test_hetero_market/test_hetero.py index 1d73b2c..c42cd97 100644 --- a/tests/test_market/test_hetero_market/test_hetero.py +++ b/tests/test_hetero_market/test_hetero.py @@ -16,7 +16,12 @@ import learnware from learnware.market import instantiate_learnware_market, BaseUserInfo from learnware.specification import RKMETableSpecification, generate_rkme_spec from learnware.reuse import HeteroMapTableReuser -from example_learnwares.config import input_shape_list, input_description_list, output_description_list, user_description_list +from example_learnwares.config import ( + input_shape_list, + input_description_list, + output_description_list, + user_description_list, +) curr_root = os.path.dirname(os.path.abspath(__file__)) @@ -32,6 +37,7 @@ user_semantic = { "Name": {"Values": "", "Type": "String"}, } + def check_learnware(learnware_name, dir_path=os.path.join(curr_root, "learnware_pool")): print(f"Checking Learnware: {learnware_name}") zip_file_path = os.path.join(dir_path, learnware_name) @@ -56,7 +62,6 @@ class TestMarket(unittest.TestCase): hetero_market = instantiate_learnware_market(market_id="hetero_toy", name="hetero", rebuild=True) return hetero_market - def test_prepare_learnware_randomly(self, learnware_num=5): self.zip_path_list = [] @@ -66,13 +71,13 @@ class TestMarket(unittest.TestCase): print("Preparing Learnware: %d" % (i)) - example_learnware_idx=i%2 - input_dim=input_shape_list[example_learnware_idx] - example_learnware_name="example_learnwares/example_learnware_%d" % (example_learnware_idx) + example_learnware_idx = i % 2 + input_dim = 
input_shape_list[example_learnware_idx] + example_learnware_name = "example_learnwares/example_learnware_%d" % (example_learnware_idx) X, y = make_regression(n_samples=5000, n_informative=15, n_features=input_dim, noise=0.1, random_state=42) - clf=Ridge(alpha=1.0) + clf = Ridge(alpha=1.0) clf.fit(X, y) joblib.dump(clf, os.path.join(dir_path, "ridge.pkl")) @@ -86,7 +91,9 @@ class TestMarket(unittest.TestCase): ) # cp example_init.py init_file yaml_file = os.path.join(dir_path, "learnware.yaml") - copyfile(os.path.join(curr_root, example_learnware_name, "learnware.yaml"), yaml_file) # cp example.yaml yaml_file + copyfile( + os.path.join(curr_root, example_learnware_name, "learnware.yaml"), yaml_file + ) # cp example.yaml yaml_file env_file = os.path.join(dir_path, "requirements.txt") copyfile(os.path.join(curr_root, example_learnware_name, "requirements.txt"), env_file) @@ -143,14 +150,16 @@ class TestMarket(unittest.TestCase): for learnware_id in curr_inds: hetero_market.delete_learnware(learnware_id) self.learnware_num -= 1 - assert len(hetero_market) == self.learnware_num, f"The number of learnwares must be {self.learnware_num}!" + assert ( + len(hetero_market) == self.learnware_num + ), f"The number of learnwares must be {self.learnware_num}!" curr_inds = hetero_market.get_learnware_ids() print("Available ids After Deleting Learnwares:", curr_inds) assert len(curr_inds) == 0, f"The market should be empty!" 
return hetero_market - + def test_train_market_model(self, learnware_num=5): hetero_market = self._init_learnware_market() self.test_prepare_learnware_randomly(learnware_num) @@ -214,7 +223,7 @@ class TestMarket(unittest.TestCase): # hetero test print("+++++ HETERO TEST ++++++") - user_dim=15 + user_dim = 15 test_folder = os.path.join(curr_root, "test_stat") @@ -230,18 +239,20 @@ class TestMarket(unittest.TestCase): user_spec = RKMETableSpecification() user_spec.load(os.path.join(unzip_dir, "stat.json")) - z=user_spec.get_z() - z=z[:,:user_dim] - device=user_spec.device - z=torch.tensor(z, device=device) - user_spec.z=z + z = user_spec.get_z() + z = z[:, :user_dim] + device = user_spec.device + z = torch.tensor(z, device=device) + user_spec.z = z print(">> normal case test:") semantic_spec = copy.deepcopy(user_semantic) - semantic_spec["Input"]=copy.deepcopy(input_description_list[idx%2]) - semantic_spec["Input"]['Dimension']=user_dim + semantic_spec["Input"] = copy.deepcopy(input_description_list[idx % 2]) + semantic_spec["Input"]["Dimension"] = user_dim # keep only the first user_dim descriptions - semantic_spec["Input"]['Description']={key: semantic_spec["Input"]['Description'][str(key)] for key in range(user_dim)} + semantic_spec["Input"]["Description"] = { + key: semantic_spec["Input"]["Description"][str(key)] for key in range(user_dim) + } user_info = BaseUserInfo(semantic_spec=semantic_spec, stat_info={"RKMETableSpecification": user_spec}) ( @@ -257,7 +268,7 @@ class TestMarket(unittest.TestCase): # empty value of key "Task" in semantic_spec, use homo search and print invalid semantic_spec print(">> test for key 'Task' has empty 'Values':") - semantic_spec["Task"]={"Values":{}} + semantic_spec["Task"] = {"Values": {}} user_info = BaseUserInfo(semantic_spec=semantic_spec, stat_info={"RKMETableSpecification": user_spec}) ( @@ -267,8 +278,7 @@ class TestMarket(unittest.TestCase): mixture_learnware_list, ) = hetero_market.search_learnware(user_info) - 
assert(len(single_learnware_list)==0), f"Statistical search failed!" - + assert len(single_learnware_list) == 0, f"Statistical search failed!" # delete key "Task" in semantic_spec, use homo search and print WARNING INFO with "User doesn't provide correct task type" print(">> delele key 'Task' test:") @@ -282,14 +292,16 @@ class TestMarket(unittest.TestCase): mixture_learnware_list, ) = hetero_market.search_learnware(user_info) - assert(len(single_learnware_list)==0), f"Statistical search failed!" + assert len(single_learnware_list) == 0, f"Statistical search failed!" # modify semantic info with mismatch dim, use homo search and print "User data feature dimensions mismatch with semantic specification." print(">> mismatch dim test") semantic_spec = copy.deepcopy(user_semantic) - semantic_spec["Input"]=copy.deepcopy(input_description_list[idx%2]) - semantic_spec["Input"]['Dimension']=user_dim-2 - semantic_spec["Input"]['Description']={key: semantic_spec["Input"]['Description'][str(key)] for key in range(user_dim)} + semantic_spec["Input"] = copy.deepcopy(input_description_list[idx % 2]) + semantic_spec["Input"]["Dimension"] = user_dim - 2 + semantic_spec["Input"]["Description"] = { + key: semantic_spec["Input"]["Description"][str(key)] for key in range(user_dim) + } user_info = BaseUserInfo(semantic_spec=semantic_spec, stat_info={"RKMETableSpecification": user_spec}) ( @@ -299,8 +311,7 @@ class TestMarket(unittest.TestCase): mixture_learnware_list, ) = hetero_market.search_learnware(user_info) - assert(len(single_learnware_list)==0), f"Statistical search failed!" - + assert len(single_learnware_list) == 0, f"Statistical search failed!" rmtree(test_folder) # rm -r test_folder @@ -328,7 +339,7 @@ class TestMarket(unittest.TestCase): mixture_learnware_list, ) = hetero_market.search_learnware(user_info) - target_spec_num=3 if idx%2==0 else 2 + target_spec_num = 3 if idx % 2 == 0 else 2 assert len(single_learnware_list) == target_spec_num, f"Statistical search failed!" 
print(f"search result of user{idx}:") for score, learnware in zip(sorted_score_list, single_learnware_list): @@ -349,7 +360,7 @@ class TestMarket(unittest.TestCase): # generate specification semantic_spec = copy.deepcopy(user_semantic) semantic_spec["Input"] = user_description_list[0] - user_info=BaseUserInfo(semantic_spec=semantic_spec, stat_info={"RKMETableSpecification": user_spec}) + user_info = BaseUserInfo(semantic_spec=semantic_spec, stat_info={"RKMETableSpecification": user_spec}) # learnware market search hetero_market = self.test_train_market_model(learnware_num) @@ -365,21 +376,21 @@ class TestMarket(unittest.TestCase): print(f"score: {score}, learnware_id: {learnware.id}") # model reuse - reuser=HeteroMapTableReuser(single_learnware_list[0], mode='regression') + reuser = HeteroMapTableReuser(single_learnware_list[0], mode="regression") reuser.fit(user_spec) reuser.finetune(X[:100], y[:100]) - y_pred=reuser.predict(X) - rmse=mean_squared_error(y, y_pred, squared=False) + y_pred = reuser.predict(X) + rmse = mean_squared_error(y, y_pred, squared=False) print(f"rmse finetune: {rmse}") def suite(): _suite = unittest.TestSuite() - # _suite.addTest(TestMarket("test_prepare_learnware_randomly")) - # _suite.addTest(TestMarket("test_generated_learnwares")) - # _suite.addTest(TestMarket("test_upload_delete_learnware")) - # _suite.addTest(TestMarket("test_train_market_model")) - # _suite.addTest(TestMarket("test_search_semantics")) + _suite.addTest(TestMarket("test_prepare_learnware_randomly")) + _suite.addTest(TestMarket("test_generated_learnwares")) + _suite.addTest(TestMarket("test_upload_delete_learnware")) + _suite.addTest(TestMarket("test_train_market_model")) + _suite.addTest(TestMarket("test_search_semantics")) _suite.addTest(TestMarket("test_stat_search")) _suite.addTest(TestMarket("test_model_reuse")) return _suite diff --git a/tests/test_workflow/test_workflow.py b/tests/test_workflow/test_workflow.py index ef41449..000aa15 100644 --- 
a/tests/test_workflow/test_workflow.py +++ b/tests/test_workflow/test_workflow.py @@ -232,10 +232,10 @@ class TestWorkflow(unittest.TestCase): def suite(): _suite = unittest.TestSuite() - # _suite.addTest(TestWorkflow("test_prepare_learnware_randomly")) - # _suite.addTest(TestWorkflow("test_upload_delete_learnware")) - # _suite.addTest(TestWorkflow("test_search_semantics")) - # _suite.addTest(TestWorkflow("test_stat_search")) + _suite.addTest(TestWorkflow("test_prepare_learnware_randomly")) + _suite.addTest(TestWorkflow("test_upload_delete_learnware")) + _suite.addTest(TestWorkflow("test_search_semantics")) + _suite.addTest(TestWorkflow("test_stat_search")) _suite.addTest(TestWorkflow("test_learnware_reuse")) return _suite From e0e5a05de26b05d8257b19f9a7e40975d26ed13b Mon Sep 17 00:00:00 2001 From: Gene Date: Sun, 12 Nov 2023 00:24:13 +0800 Subject: [PATCH 42/90] [MNT] add a dependency and modify import --- learnware/market/easy/searcher.py | 1 - .../heterogeneous/organizer/hetero_mapping/__init__.py | 2 -- learnware/market/heterogeneous/searcher.py | 2 +- learnware/reuse/averaging.py | 2 +- learnware/reuse/ensemble_pruning.py | 2 +- learnware/reuse/feature_augment_reuser.py | 2 +- learnware/reuse/hetero_reuser/__init__.py | 4 ++-- learnware/reuse/hetero_reuser/feature_alignment.py | 6 +++--- setup.py | 1 + 9 files changed, 10 insertions(+), 12 deletions(-) diff --git a/learnware/market/easy/searcher.py b/learnware/market/easy/searcher.py index 73bd593..5340c0e 100644 --- a/learnware/market/easy/searcher.py +++ b/learnware/market/easy/searcher.py @@ -435,7 +435,6 @@ class EasyStatSearcher(BaseSearcher): ): continue - # TODO: must we check dim for Text and Image specification? 
rkme_dim = str(list(rkme.get_z().shape)[1:]) if rkme_dim == user_rkme_dim: filtered_learnware_list.append(learnware) diff --git a/learnware/market/heterogeneous/organizer/hetero_mapping/__init__.py b/learnware/market/heterogeneous/organizer/hetero_mapping/__init__.py index 231dfff..f37e3e7 100644 --- a/learnware/market/heterogeneous/organizer/hetero_mapping/__init__.py +++ b/learnware/market/heterogeneous/organizer/hetero_mapping/__init__.py @@ -59,7 +59,6 @@ class HeteroMapping(nn.Module): device=device, ) - ##todo: BUG!!!!!! self.encoder = TransformerMultiLayer( hidden_dim=hidden_dim, num_layer=num_layer, @@ -404,7 +403,6 @@ class TransformerMultiLayer(nn.Module): use_layer_norm=True, activation=activation, ) - ##todo: BUG!!!!!! stacked_transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layer - 1) self.transformer_encoder.append(stacked_transformer) diff --git a/learnware/market/heterogeneous/searcher.py b/learnware/market/heterogeneous/searcher.py index c30ad1e..4bf43fc 100644 --- a/learnware/market/heterogeneous/searcher.py +++ b/learnware/market/heterogeneous/searcher.py @@ -69,7 +69,7 @@ class HeteroMapTableSearcher(EasySearcher): learnware_list: List[Learnware], user_info: BaseUserInfo, ) -> Tuple[List[float], List[Learnware], float, List[Learnware]]: - # todo: use specially assigned search_gamma for calculating mmd dist + # TODO: use specially assigned search_gamma for calculating mmd dist user_hetero_spec = self.learnware_oganizer.generate_hetero_map_spec(user_info) logger.info(f"After semantic search, learnware_list length is {len(learnware_list)}") diff --git a/learnware/reuse/averaging.py b/learnware/reuse/averaging.py index 80ca8ae..c12083d 100644 --- a/learnware/reuse/averaging.py +++ b/learnware/reuse/averaging.py @@ -4,7 +4,7 @@ from typing import List from scipy.special import softmax -from learnware.learnware import Learnware +from ..learnware import Learnware from .base import BaseReuser from ..logger import get_module_logger diff 
--git a/learnware/reuse/ensemble_pruning.py b/learnware/reuse/ensemble_pruning.py index 6001880..bb7db40 100644 --- a/learnware/reuse/ensemble_pruning.py +++ b/learnware/reuse/ensemble_pruning.py @@ -4,7 +4,7 @@ import numpy as np import geatpy as ea from typing import List -from learnware.learnware import Learnware +from ..learnware import Learnware from .base import BaseReuser from ..logger import get_module_logger diff --git a/learnware/reuse/feature_augment_reuser.py b/learnware/reuse/feature_augment_reuser.py index b0e5ee6..3844db8 100644 --- a/learnware/reuse/feature_augment_reuser.py +++ b/learnware/reuse/feature_augment_reuser.py @@ -2,7 +2,7 @@ import numpy as np from sklearn.linear_model import RidgeCV, LogisticRegressionCV from .base import BaseReuser -from learnware.learnware import Learnware +from ..learnware import Learnware class FeatureAugmentReuser(BaseReuser): diff --git a/learnware/reuse/hetero_reuser/__init__.py b/learnware/reuse/hetero_reuser/__init__.py index 4a252ec..9bbbdec 100644 --- a/learnware/reuse/hetero_reuser/__init__.py +++ b/learnware/reuse/hetero_reuser/__init__.py @@ -1,5 +1,5 @@ -from learnware.learnware import Learnware -from learnware.reuse.base import BaseReuser +from ...learnware import Learnware +from ..base import BaseReuser from .feature_alignment import FeatureAligner from ..feature_augment_reuser import FeatureAugmentReuser diff --git a/learnware/reuse/hetero_reuser/feature_alignment.py b/learnware/reuse/hetero_reuser/feature_alignment.py index 7f81187..2506b0f 100644 --- a/learnware/reuse/hetero_reuser/feature_alignment.py +++ b/learnware/reuse/hetero_reuser/feature_alignment.py @@ -7,9 +7,9 @@ import torch import time from tqdm import trange -from learnware.learnware import Learnware -from learnware.specification import RKMETableSpecification -from learnware.specification.regular.table.rkme import choose_device +from ...learnware import Learnware +from ...specification import RKMETableSpecification +from 
...specification.regular.table.rkme import choose_device from ..base import BaseReuser from ...logger import get_module_logger diff --git a/setup.py b/setup.py index 2c0df29..705000d 100644 --- a/setup.py +++ b/setup.py @@ -66,6 +66,7 @@ REQUIRED = [ "rapidfuzz>=3.4.0", "langdetect>=1.0.9", "huggingface-hub<0.18", + "transformers>=4.34.1", "portalocker>=2.0.0", "qpsolvers[clarabel]>=4.0.1", ] From 9c0a93bb7bc4406a869aaf1b4269a1e53df26c8e Mon Sep 17 00:00:00 2001 From: Gene Date: Sun, 12 Nov 2023 01:49:48 +0800 Subject: [PATCH 43/90] [MNT] modify details about specification, reuse, tests --- examples/dataset_m5_workflow/main.py | 6 +- examples/dataset_pfs_workflow/main.py | 6 +- examples/workflow_by_code/main.py | 6 +- learnware/reuse/__init__.py | 17 +++-- ...e_augment_reuser.py => feature_augment.py} | 51 +++++++++++--- learnware/reuse/hetero_reuser/__init__.py | 16 +++-- ...{feature_alignment.py => feature_align.py} | 69 ++++++++----------- learnware/reuse/job_selector.py | 8 +-- learnware/specification/__init__.py | 4 +- learnware/specification/module.py | 15 ++-- learnware/specification/regular/text/rkme.py | 1 + learnware/specification/system/__init__.py | 2 +- .../{heter_table.py => hetero_table.py} | 8 +-- tests/test_hetero_market/test_hetero.py | 6 +- tests/test_specification/test_rkme.py | 4 +- tests/test_workflow/test_workflow.py | 6 +- 16 files changed, 127 insertions(+), 98 deletions(-) rename learnware/reuse/{feature_augment_reuser.py => feature_augment.py} (71%) rename learnware/reuse/hetero_reuser/{feature_alignment.py => feature_align.py} (86%) rename learnware/specification/system/{heter_table.py => hetero_table.py} (98%) diff --git a/examples/dataset_m5_workflow/main.py b/examples/dataset_m5_workflow/main.py index bc8a369..74338b9 100644 --- a/examples/dataset_m5_workflow/main.py +++ b/examples/dataset_m5_workflow/main.py @@ -9,7 +9,7 @@ from shutil import copyfile, rmtree import learnware from learnware.market import instantiate_learnware_market, 
BaseUserInfo from learnware.reuse import JobSelectorReuser, AveragingReuser -from learnware.specification import generate_rkme_spec +from learnware.specification import generate_rkme_table_spec from m5 import DataLoader from learnware.logger import get_module_logger @@ -98,7 +98,7 @@ class M5DatasetWorkflow: for idx in tqdm(idx_list): train_x, train_y, test_x, test_y = m5.get_idx_data(idx) st = time.time() - spec = generate_rkme_spec(X=train_x, gamma=0.1, cuda_idx=0) + spec = generate_rkme_table_spec(X=train_x, gamma=0.1, cuda_idx=0) ed = time.time() logger.info("Stat spec generated in %.3f s" % (ed - st)) @@ -150,7 +150,7 @@ class M5DatasetWorkflow: for idx in idx_list: train_x, train_y, test_x, test_y = m5.get_idx_data(idx) - user_spec = generate_rkme_spec(X=test_x, gamma=0.1, cuda_idx=0) + user_spec = generate_rkme_table_spec(X=test_x, gamma=0.1, cuda_idx=0) user_spec_path = f"./user_spec/user_{idx}.json" user_spec.save(user_spec_path) diff --git a/examples/dataset_pfs_workflow/main.py b/examples/dataset_pfs_workflow/main.py index e0a8fac..356cdbc 100644 --- a/examples/dataset_pfs_workflow/main.py +++ b/examples/dataset_pfs_workflow/main.py @@ -9,7 +9,7 @@ from shutil import copyfile, rmtree import learnware from learnware.market import instantiate_learnware_market, BaseUserInfo from learnware.reuse import JobSelectorReuser, AveragingReuser -from learnware.specification import generate_rkme_spec +from learnware.specification import generate_rkme_table_spec from pfs import Dataloader from learnware.logger import get_module_logger @@ -95,7 +95,7 @@ class PFSDatasetWorkflow: for idx in tqdm(idx_list): train_x, train_y, test_x, test_y = pfs.get_idx_data(idx) st = time.time() - spec = generate_rkme_spec(X=train_x, gamma=0.1, cuda_idx=0) + spec = generate_rkme_table_spec(X=train_x, gamma=0.1, cuda_idx=0) ed = time.time() logger.info("Stat spec generated in %.3f s" % (ed - st)) @@ -147,7 +147,7 @@ class PFSDatasetWorkflow: for idx in idx_list: train_x, train_y, test_x, 
test_y = pfs.get_idx_data(idx) - user_spec = generate_rkme_spec(X=test_x, gamma=0.1, cuda_idx=0) + user_spec = generate_rkme_table_spec(X=test_x, gamma=0.1, cuda_idx=0) user_spec_path = f"./user_spec/user_{idx}.json" user_spec.save(user_spec_path) diff --git a/examples/workflow_by_code/main.py b/examples/workflow_by_code/main.py index 40c6e62..8a08a61 100644 --- a/examples/workflow_by_code/main.py +++ b/examples/workflow_by_code/main.py @@ -12,7 +12,7 @@ from shutil import copyfile, rmtree import learnware from learnware.market import instantiate_learnware_market, BaseUserInfo from learnware.reuse import JobSelectorReuser, AveragingReuser -from learnware.specification import generate_rkme_spec, RKMETableSpecification +from learnware.specification import generate_rkme_table_spec, RKMETableSpecification curr_root = os.path.dirname(os.path.abspath(__file__)) @@ -53,7 +53,7 @@ class LearnwareMarketWorkflow: joblib.dump(clf, os.path.join(dir_path, "svm.pkl")) - spec = generate_rkme_spec(X=data_X, gamma=0.1, cuda_idx=0) + spec = generate_rkme_table_spec(X=data_X, gamma=0.1, cuda_idx=0) spec.save(os.path.join(dir_path, "svm.json")) init_file = os.path.join(dir_path, "__init__.py") @@ -173,7 +173,7 @@ class LearnwareMarketWorkflow: X, y = load_digits(return_X_y=True) _, data_X, _, data_y = train_test_split(X, y, test_size=0.3, shuffle=True) - stat_spec = generate_rkme_spec(X=data_X, gamma=0.1, cuda_idx=0) + stat_spec = generate_rkme_table_spec(X=data_X, gamma=0.1, cuda_idx=0) user_info = BaseUserInfo(semantic_spec=user_semantic, stat_info={"RKMETableSpecification": stat_spec}) _, _, _, mixture_learnware_list = easy_market.search_learnware(user_info) diff --git a/learnware/reuse/__init__.py b/learnware/reuse/__init__.py index d8040a9..93a4153 100644 --- a/learnware/reuse/__init__.py +++ b/learnware/reuse/__init__.py @@ -12,9 +12,16 @@ else: if not is_torch_avaliable(verbose=False): AveragingReuser = None - logger.warning("AveragingReuser is skipped due to 'torch' is not 
installed!") + FeatureAugmentReuser = None + HeteroMapTableReuser = None + FeatureAlignReuser = None + logger.warning( + "[AveragingReuser, FeatureAugmentReuser, HeteroMapTableReuser, FeatureAlignReuser] is skipped due to 'torch' is not installed!" + ) else: from .averaging import AveragingReuser + from .feature_augment import FeatureAugmentReuser + from .hetero_reuser import HeteroMapTableReuser, FeatureAlignReuser if not is_lightgbm_avaliable(verbose=False) or not is_torch_avaliable(verbose=False): JobSelectorReuser = None @@ -32,11 +39,3 @@ if not is_lightgbm_avaliable(verbose=False) or not is_torch_avaliable(verbose=Fa logger.warning(f"JobSelectorReuser is skipped due to {uninstall_packages} is not installed!") else: from .job_selector import JobSelectorReuser - -if not is_torch_avaliable(verbose=False): - HeteroMapTableReuser = None - logger.warning("FeatureAugmentReuser is skipped due to 'torch' is not installed!") -else: - from .hetero_reuser import HeteroMapTableReuser - -from .feature_augment_reuser import FeatureAugmentReuser diff --git a/learnware/reuse/feature_augment_reuser.py b/learnware/reuse/feature_augment.py similarity index 71% rename from learnware/reuse/feature_augment_reuser.py rename to learnware/reuse/feature_augment.py index 3844db8..653bfa7 100644 --- a/learnware/reuse/feature_augment_reuser.py +++ b/learnware/reuse/feature_augment.py @@ -1,6 +1,8 @@ +import torch import numpy as np - +from typing import Union from sklearn.linear_model import RidgeCV, LogisticRegressionCV + from .base import BaseReuser from ..learnware import Learnware @@ -14,13 +16,13 @@ class FeatureAugmentReuser(BaseReuser): - "classification": Uses LogisticRegressionCV for classification tasks. """ - def __init__(self, learnware: Learnware = None, mode: str = None): + def __init__(self, learnware: Union[Learnware, BaseReuser] = None, mode: str = None): """ Initialize the FeatureAugmentReuser with a learnware model and a mode. 
Parameters ---------- - learnware : Learnware + learnware : Union[Learnware, BaseReuser] A learnware model used for initial predictions. mode : str The mode of operation, either "regression" or "classification". @@ -28,6 +30,7 @@ class FeatureAugmentReuser(BaseReuser): self.learnware = learnware assert mode in ["classification", "regression"], "Mode must be either 'classification' or 'regression'" self.mode = mode + self.output_aligner = None def predict(self, user_data: np.ndarray) -> np.ndarray: """ @@ -43,10 +46,12 @@ class FeatureAugmentReuser(BaseReuser): np.ndarray Predicted output from the output aligner model. """ + assert self.output_aligner is not None, "FeatureAugmentReuser is not trained by labeled data yet." + user_data = self._fill_data(user_data) - y_pred = self.learnware.predict(user_data) - user_data_aug = np.concatenate((user_data, y_pred.reshape(-1, 1)), axis=1) + user_data_aug = self._get_augment_data(user_data) y_pred_aug = self.output_aligner.predict(user_data_aug) + return y_pred_aug def fit(self, x_train: np.ndarray, y_train: np.ndarray): @@ -61,18 +66,18 @@ class FeatureAugmentReuser(BaseReuser): Training data labels. """ x_train = self._fill_data(x_train) - y_pred = self.learnware.predict(x_train) - x_train_aug = np.concatenate((x_train, y_pred.reshape(-1, 1)), axis=1) + x_train_aug = self._get_augment_data(x_train) + if self.mode == "regression": alpha_list = [0.01, 0.1, 1.0, 10, 100] ridge_cv = RidgeCV(alphas=alpha_list, store_cv_values=True) ridge_cv.fit(x_train_aug, y_train) self.output_aligner = ridge_cv - elif self.mode == "classification": + else: self.output_aligner = LogisticRegressionCV(cv=5, max_iter=1000, random_state=0, multi_class="auto") self.output_aligner.fit(x_train_aug, y_train) - def _fill_data(self, X: np.ndarray): + def _fill_data(self, X: np.ndarray) -> np.ndarray: """ Fill missing data (NaN, Inf) in the input array with the mean of the column. 
@@ -101,3 +106,31 @@ class FeatureAugmentReuser(BaseReuser): col_mean = np.nanmean(X[:, col]) X[:, col] = np.where(is_nan, col_mean, X[:, col]) return X + + def _get_augment_data(self, X: np.ndarray) -> np.ndarray: + """Get the augmented data with model output. + + Parameters + ---------- + X : np.ndarray + Input data. + + Returns + ------- + np.ndarray + Augment data with model output. + + Raises + ------ + TypeError + If the type of model output not in [np.ndarray, torch.Tensor]. + """ + y_pred = self.learnware.predict(X) + if isinstance(y_pred, torch.Tensor): + y_pred = y_pred.detach().cpu().numpy() + if not isinstance(y_pred, np.ndarray): + raise TypeError(f"Model output must be np.ndarray or torch.Tensor") + if len(y_pred.shape) == 1: + y_pred = y_pred.reshape(-1, 1) + + return np.concatenate((X, y_pred), axis=1) diff --git a/learnware/reuse/hetero_reuser/__init__.py b/learnware/reuse/hetero_reuser/__init__.py index 9bbbdec..c794524 100644 --- a/learnware/reuse/hetero_reuser/__init__.py +++ b/learnware/reuse/hetero_reuser/__init__.py @@ -1,7 +1,7 @@ from ...learnware import Learnware from ..base import BaseReuser -from .feature_alignment import FeatureAligner -from ..feature_augment_reuser import FeatureAugmentReuser +from .feature_align import FeatureAlignReuser +from ..feature_augment import FeatureAugmentReuser class HeteroMapTableReuser(BaseReuser): @@ -41,6 +41,8 @@ class HeteroMapTableReuser(BaseReuser): self.mode = mode self.cuda_idx = cuda_idx self.align_arguments = align_arguments + self.reuser = None + self.feature_align_reuser = None def fit(self, user_rkme): """ @@ -51,11 +53,11 @@ class HeteroMapTableReuser(BaseReuser): user_rkme : RKMETableSpecification The RKME specification from the user dataset. 
""" - self.feature_aligner = FeatureAligner( + self.feature_align_reuser = FeatureAlignReuser( learnware=self.learnware, mode=self.mode, cuda_idx=self.cuda_idx, **self.align_arguments ) - self.feature_aligner.fit(user_rkme) - self.reuser = self.feature_aligner + self.feature_align_reuser.fit(user_rkme) + self.reuser = self.feature_align_reuser def finetune(self, x_train, y_train): """ @@ -68,7 +70,8 @@ class HeteroMapTableReuser(BaseReuser): y_train : ndarray Training data labels. """ - self.reuser = FeatureAugmentReuser(learnware=self.feature_aligner, mode=self.mode) + assert self.feature_align_reuser is not None, "HeteroMapTableReuser must be fitted before fine-tuning." + self.reuser = FeatureAugmentReuser(learnware=self.feature_align_reuser, mode=self.mode) self.reuser.fit(x_train, y_train) def predict(self, user_data): @@ -85,4 +88,5 @@ class HeteroMapTableReuser(BaseReuser): ndarray Predicted output from the model. """ + assert self.reuser is not None, "HeteroMapTableReuser must be fitted before making predictions." 
return self.reuser.predict(user_data) diff --git a/learnware/reuse/hetero_reuser/feature_alignment.py b/learnware/reuse/hetero_reuser/feature_align.py similarity index 86% rename from learnware/reuse/hetero_reuser/feature_alignment.py rename to learnware/reuse/hetero_reuser/feature_align.py index 2506b0f..7bcd1f9 100644 --- a/learnware/reuse/hetero_reuser/feature_alignment.py +++ b/learnware/reuse/hetero_reuser/feature_align.py @@ -1,25 +1,23 @@ -from typing import List, Any +import time +import torch import numpy as np -from numpy import ndarray import torch.nn as nn -import torch.nn.functional as F -import torch -import time +from typing import List from tqdm import trange +import torch.nn.functional as F +from ..base import BaseReuser +from ...logger import get_module_logger from ...learnware import Learnware from ...specification import RKMETableSpecification from ...specification.regular.table.rkme import choose_device -from ..base import BaseReuser -from ...logger import get_module_logger +logger = get_module_logger("hetero_feature_align") -logger = get_module_logger("hetero_feature_alignment") - -class FeatureAligner(BaseReuser): +class FeatureAlignReuser(BaseReuser): """ - FeatureAligner is a class for aligning features from a user dataset with a target dataset using a learnware model. + FeatureAlignReuser is a class for aligning features from a user dataset with a target dataset using a learnware model. It supports both classification and regression tasks and uses a feature alignment trainer for alignment. Attributes @@ -38,7 +36,7 @@ class FeatureAligner(BaseReuser): def __init__(self, learnware: Learnware = None, mode: str = None, cuda_idx=0, **align_arguments): """ - Initialize the FeatureAligner with a learnware model, mode, CUDA device index, and alignment arguments. + Initialize the FeatureAlignReuser with a learnware model, mode, CUDA device index, and alignment arguments. 
Parameters ---------- @@ -57,6 +55,7 @@ class FeatureAligner(BaseReuser): self.align_arguments = align_arguments self.cuda_idx = cuda_idx self.device = choose_device(cuda_idx=cuda_idx) + self.align_model = None def fit(self, user_rkme: RKMETableSpecification): """ @@ -68,26 +67,27 @@ class FeatureAligner(BaseReuser): The RKME specification from the user dataset. """ target_rkme = self.learnware.specification.get_stat_spec()["RKMETableSpecification"] - trainer = FeatureAlignmentTrainer( + trainer = FeatureAlignTrainer( target_rkme=target_rkme, user_rkme=user_rkme, cuda_idx=self.cuda_idx, **self.align_arguments ) self.align_model = trainer.model self.align_model.eval() - def predict(self, user_data: ndarray) -> ndarray: + def predict(self, user_data: np.ndarray) -> np.ndarray: """ Predict the output for user data using the aligned model and learnware model. Parameters ---------- - user_data : ndarray + user_data : np.ndarray Input data for making predictions. Returns ------- - ndarray + np.ndarray Predicted output from the learnware model after alignment. """ + assert self.align_model is not None, "FeatureAlignReuser must be fitted before making predictions." user_data = self._fill_data(user_data) transformed_user_data = ( self.align_model(torch.tensor(user_data, device=self.device).float()).detach().cpu().numpy() @@ -126,9 +126,9 @@ class FeatureAligner(BaseReuser): return X -class FeatureAlignmentModel(nn.Module): +class FeatureAlignModel(nn.Module): """ - FeatureAlignmentModel is a neural network module designed for feature alignment tasks. + FeatureAlignModel is a neural network module designed for feature alignment tasks. It consists of multiple fully connected (dense) layers, optional dropout and batch normalization layers, and supports different activation functions. """ @@ -143,7 +143,7 @@ class FeatureAlignmentModel(nn.Module): use_bn: bool = False, ): """ - Initialize the FeatureAlignmentModel. + Initialize the FeatureAlignModel. 
Parameters ---------- @@ -203,24 +203,22 @@ class FeatureAlignmentModel(nn.Module): x = fc(x) # Apply fully connected layer x = self.activation(x) # Apply activation function x = drop(x) # Apply dropout - return self.final_fc(x) # Return output from final fully connected layer + # Return output from final fully connected layer + return self.final_fc(x) -class FeatureAlignmentTrainer: + +class FeatureAlignTrainer: """ - FeatureAlignmentTrainer is a class designed to train a neural network for aligning features from a user dataset + FeatureAlignTrainer is a class designed to train a neural network for aligning features from a user dataset to a target dataset. It utilizes Maximum Mean Discrepancy (MMD) as the loss function for training. Attributes ---------- target_rkme : RKMETableSpecification - The RKME (Relative Knowledge Model Embeddings) specification of the target dataset. + The RKME specification of the target dataset. user_rkme : RKMETableSpecification The RKME specification of the user dataset. - extra_labeled_data : Any, optional - Additional labeled data for training, if available. - target_learnware : Learnware, optional - The learnware model used for the target dataset. num_epoch : int The number of training epochs. lr : float @@ -247,10 +245,8 @@ class FeatureAlignmentTrainer: def __init__( self, - target_rkme: RKMETableSpecification, # (X, weight) - user_rkme: RKMETableSpecification, # (X, weight) - extra_labeled_data: Any = None, - target_learnware: Learnware = None, + target_rkme: RKMETableSpecification, + user_rkme: RKMETableSpecification, num_epoch: int = 50, lr: float = 1e-3, gamma: float = 0.1, @@ -264,7 +260,7 @@ class FeatureAlignmentTrainer: cuda_idx: int = 0, ): """ - Initialize the FeatureAlignmentTrainer with the specified parameters. + Initialize the FeatureAlignTrainer with the specified parameters. 
""" self.target_rkme = target_rkme self.user_rkme = user_rkme @@ -281,10 +277,7 @@ class FeatureAlignmentTrainer: self.optimizer_type = optimizer_type self.const = const self.device = choose_device(cuda_idx=cuda_idx) - if extra_labeled_data is not None and target_learnware is not None: - self.train_with_labeled_data(extra_labeled_data[0], extra_labeled_data[1], target_learnware) - else: - self.train() + self.train() def gaussian_kernel(self, x1: torch.Tensor, x2: torch.Tensor): """ @@ -342,11 +335,9 @@ class FeatureAlignmentTrainer: input_dim = self.user_rkme.get_z().shape[1] output_dim = self.target_rkme.get_z().shape[1] - user_model = FeatureAlignmentModel( + user_model = FeatureAlignModel( input_dim, output_dim, args["hidden_dims"], args["activation"], args["dropout_ratio"], args["use_bn"] ) - - # device = torch.device("cuda" if torch.cuda.is_available() else "cpu") user_model.to(self.device) user_data_x = torch.tensor(self.user_rkme.get_z(), device=self.device).float() user_data_weight = torch.tensor(self.user_rkme.get_beta(), device=self.device).view(1, -1).double() diff --git a/learnware/reuse/job_selector.py b/learnware/reuse/job_selector.py index d9f9fbf..8077106 100644 --- a/learnware/reuse/job_selector.py +++ b/learnware/reuse/job_selector.py @@ -2,16 +2,14 @@ import torch import numpy as np from typing import List, Union -from qpsolvers import solve_qp from lightgbm import LGBMClassifier, early_stopping from sklearn.metrics import accuracy_score - from .base import BaseReuser from ..market.utils import parse_specification_type from ..learnware import Learnware from ..specification import RKMETableSpecification, RKMETextSpecification -from ..specification import generate_rkme_spec, rkme_solve_qp +from ..specification import generate_rkme_table_spec, rkme_solve_qp from ..logger import get_module_logger logger = get_module_logger("job_selector_reuse") @@ -39,7 +37,7 @@ class JobSelectorReuser(BaseReuser): Parameters ---------- - user_data : np.ndarray + 
user_data : Union[np.ndarray, List[str]] User's unlabeled raw data. Returns @@ -180,7 +178,7 @@ class JobSelectorReuser(BaseReuser): Inner product matrix calculated from task_rkme_list. """ task_num = len(task_rkme_list) - user_rkme_spec = generate_rkme_spec(X=user_data, reduce=False) + user_rkme_spec = generate_rkme_table_spec(X=user_data, reduce=False) K = task_rkme_matrix v = np.array([user_rkme_spec.inner_prod(task_rkme) for task_rkme in task_rkme_list]) diff --git a/learnware/specification/__init__.py b/learnware/specification/__init__.py index ce795e9..6defc3e 100644 --- a/learnware/specification/__init__.py +++ b/learnware/specification/__init__.py @@ -14,8 +14,8 @@ from ..utils import is_torch_avaliable if not is_torch_avaliable(verbose=False): generate_stat_spec = None - generate_rkme_spec = None + generate_rkme_table_spec = None generate_rkme_image_spec = None generate_rkme_text_spec = None else: - from .module import generate_stat_spec, generate_rkme_spec, generate_rkme_image_spec, generate_rkme_text_spec + from .module import generate_stat_spec, generate_rkme_table_spec, generate_rkme_image_spec, generate_rkme_text_spec diff --git a/learnware/specification/module.py b/learnware/specification/module.py index 7ae6ded..425d318 100644 --- a/learnware/specification/module.py +++ b/learnware/specification/module.py @@ -9,7 +9,7 @@ from .regular import RKMETableSpecification, RKMEImageSpecification, RKMETextSpe from ..config import C -def generate_rkme_spec( +def generate_rkme_table_spec( X: Union[np.ndarray, pd.DataFrame, torch.Tensor], gamma: float = 0.1, reduced_set_size: int = 100, @@ -197,13 +197,18 @@ def generate_rkme_text_spec( return rkme_text_spec -def generate_stat_spec(type="table", *args, **kwargs) -> BaseStatSpecification: +def generate_stat_spec( + type: str, X: Union[np.ndarray, pd.DataFrame, torch.Tensor, List[str]], *args, **kwargs +) -> BaseStatSpecification: """ Interface for users to generate statistical specification. 
Return a StatSpecification object, use .save() method to save as npy file. Parameters ---------- + type: str + Type of statistical specification. + Supported types: "table", "text", "image" X : np.ndarray Raw data in np.ndarray format. Size of array: (n*d) @@ -214,10 +219,10 @@ def generate_stat_spec(type="table", *args, **kwargs) -> BaseStatSpecification: A StatSpecification object """ if type == "table": - return generate_rkme_spec(*args, **kwargs) + return generate_rkme_table_spec(X=X, *args, **kwargs) elif type == "text": - return generate_rkme_text_spec(*args, **kwargs) + return generate_rkme_text_spec(X=X, *args, **kwargs) elif type == "image": - return generate_rkme_image_spec(*args, **kwargs) + return generate_rkme_image_spec(X=X, *args, **kwargs) else: raise TypeError(f"type {type} is not supported!") diff --git a/learnware/specification/regular/text/rkme.py b/learnware/specification/regular/text/rkme.py index 95dc7f5..a452810 100644 --- a/learnware/specification/regular/text/rkme.py +++ b/learnware/specification/regular/text/rkme.py @@ -2,6 +2,7 @@ import os import langdetect import numpy as np from sentence_transformers import SentenceTransformer + from ..table import RKMETableSpecification from ....logger import get_module_logger diff --git a/learnware/specification/system/__init__.py b/learnware/specification/system/__init__.py index e57a155..a09ecca 100644 --- a/learnware/specification/system/__init__.py +++ b/learnware/specification/system/__init__.py @@ -1 +1 @@ -from .heter_table import HeteroMapTableSpecification +from .hetero_table import HeteroMapTableSpecification diff --git a/learnware/specification/system/heter_table.py b/learnware/specification/system/hetero_table.py similarity index 98% rename from learnware/specification/system/heter_table.py rename to learnware/specification/system/hetero_table.py index 5329a06..918ee11 100644 --- a/learnware/specification/system/heter_table.py +++ b/learnware/specification/system/hetero_table.py @@ -1,12 
+1,11 @@ from __future__ import annotations -import codecs +import os import copy import json -import os - -import numpy as np import torch +import codecs +import numpy as np from ..regular import RKMETableSpecification from ..regular.table.rkme import choose_device, setup_seed, torch_rbf_kernel @@ -81,7 +80,6 @@ class HeteroMapTableSpecification(SystemStatSpecification): embedding_to_save["beta"] = embedding_to_save["beta"].detach().cpu().numpy() embedding_to_save["beta"] = embedding_to_save["beta"].tolist() embedding_to_save["device"] = "gpu" if embedding_to_save["cuda_idx"] != -1 else "cpu" - # embedding_to_save["type"] = self.type json.dump( embedding_to_save, codecs.open(save_path, "w", encoding="utf-8"), diff --git a/tests/test_hetero_market/test_hetero.py b/tests/test_hetero_market/test_hetero.py index c42cd97..3a9643f 100644 --- a/tests/test_hetero_market/test_hetero.py +++ b/tests/test_hetero_market/test_hetero.py @@ -14,7 +14,7 @@ from sklearn.metrics import mean_squared_error import learnware from learnware.market import instantiate_learnware_market, BaseUserInfo -from learnware.specification import RKMETableSpecification, generate_rkme_spec +from learnware.specification import RKMETableSpecification, generate_rkme_table_spec from learnware.reuse import HeteroMapTableReuser from example_learnwares.config import ( input_shape_list, @@ -82,7 +82,7 @@ class TestMarket(unittest.TestCase): joblib.dump(clf, os.path.join(dir_path, "ridge.pkl")) - spec = generate_rkme_spec(X=X, gamma=0.1, cuda_idx=0) + spec = generate_rkme_table_spec(X=X, gamma=0.1, cuda_idx=0) spec.save(os.path.join(dir_path, "stat.json")) init_file = os.path.join(dir_path, "__init__.py") @@ -355,7 +355,7 @@ class TestMarket(unittest.TestCase): X, y = make_regression(n_samples=5000, n_informative=10, n_features=15, noise=0.1, random_state=0) # generate rkme - user_spec = generate_rkme_spec(X=X, gamma=0.1, cuda_idx=0) + user_spec = generate_rkme_table_spec(X=X, gamma=0.1, cuda_idx=0) # generate 
specification semantic_spec = copy.deepcopy(user_semantic) diff --git a/tests/test_specification/test_rkme.py b/tests/test_specification/test_rkme.py index ba280b2..6b2a2fc 100644 --- a/tests/test_specification/test_rkme.py +++ b/tests/test_specification/test_rkme.py @@ -8,13 +8,13 @@ import tempfile import numpy as np from learnware.specification import RKMETableSpecification, RKMEImageSpecification, RKMETextSpecification -from learnware.specification import generate_rkme_image_spec, generate_rkme_spec, generate_rkme_text_spec +from learnware.specification import generate_rkme_image_spec, generate_rkme_table_spec, generate_rkme_text_spec class TestRKME(unittest.TestCase): def test_rkme(self): X = np.random.uniform(-10000, 10000, size=(5000, 200)) - rkme = generate_rkme_spec(X) + rkme = generate_rkme_table_spec(X) rkme.generate_stat_spec_from_data(X) with tempfile.TemporaryDirectory(prefix="learnware_") as tempdir: diff --git a/tests/test_workflow/test_workflow.py b/tests/test_workflow/test_workflow.py index 000aa15..5adafd2 100644 --- a/tests/test_workflow/test_workflow.py +++ b/tests/test_workflow/test_workflow.py @@ -12,7 +12,7 @@ from shutil import copyfile, rmtree import learnware from learnware.market import instantiate_learnware_market, BaseUserInfo -from learnware.specification import RKMETableSpecification, generate_rkme_spec +from learnware.specification import RKMETableSpecification, generate_rkme_table_spec from learnware.reuse import JobSelectorReuser, AveragingReuser, EnsemblePruningReuser, FeatureAugmentReuser curr_root = os.path.dirname(os.path.abspath(__file__)) @@ -57,7 +57,7 @@ class TestWorkflow(unittest.TestCase): joblib.dump(clf, os.path.join(dir_path, "svm.pkl")) - spec = generate_rkme_spec(X=data_X, gamma=0.1, cuda_idx=0) + spec = generate_rkme_table_spec(X=data_X, gamma=0.1, cuda_idx=0) spec.save(os.path.join(dir_path, "svm.json")) init_file = os.path.join(dir_path, "__init__.py") @@ -200,7 +200,7 @@ class TestWorkflow(unittest.TestCase): 
X, y = load_digits(return_X_y=True) train_X, data_X, train_y, data_y = train_test_split(X, y, test_size=0.3, shuffle=True) - stat_spec = generate_rkme_spec(X=data_X, gamma=0.1, cuda_idx=0) + stat_spec = generate_rkme_table_spec(X=data_X, gamma=0.1, cuda_idx=0) user_info = BaseUserInfo(semantic_spec=user_semantic, stat_info={"RKMETableSpecification": stat_spec}) _, _, _, mixture_learnware_list = easy_market.search_learnware(user_info) From 1dbc14c685503f1eb1a40694906329fc8b82ef57 Mon Sep 17 00:00:00 2001 From: Gene Date: Sun, 12 Nov 2023 15:26:52 +0800 Subject: [PATCH 44/90] [MNT] modify HeteroSearcher --- learnware/market/heterogeneous/searcher.py | 100 ++------------------- 1 file changed, 7 insertions(+), 93 deletions(-) diff --git a/learnware/market/heterogeneous/searcher.py b/learnware/market/heterogeneous/searcher.py index 4bf43fc..3605609 100644 --- a/learnware/market/heterogeneous/searcher.py +++ b/learnware/market/heterogeneous/searcher.py @@ -1,102 +1,15 @@ -from typing import Tuple, List, Union - -import numpy as np +from typing import Tuple, List from ...learnware import Learnware from ...logger import get_module_logger -from ...specification import HeteroMapTableSpecification -from ..base import BaseSearcher, BaseUserInfo +from ..base import BaseUserInfo from ..easy import EasySearcher from ..utils import parse_specification_type -from .organizer import HeteroMapTableOrganizer logger = get_module_logger("hetero_searcher") -class HeteroMapTableSearcher(EasySearcher): - def _convert_dist_to_score( - self, dist_list: List[float], dist_epsilon: float = 0.01, min_score: float = 0.92 - ) -> List[float]: - if len(dist_list) == 0: - return [] - - min_dist, max_dist = min(dist_list), max(dist_list) - if min_dist == max_dist: - return [1 for dist in dist_list] - else: - max_score = (max_dist - min_dist) / (max_dist - dist_epsilon) - - if min_dist < dist_epsilon: - dist_epsilon = min_dist - elif max_score < min_score: - dist_epsilon = max_dist - (max_dist - 
min_dist) / min_score - - return [(max_dist - dist) / (max_dist - dist_epsilon) for dist in dist_list] - - def _search_by_hetero_spec_single( - self, learnware_list: List[Learnware], user_hetero_spec: HeteroMapTableSpecification - ) -> Tuple[List[float], List[Learnware]]: - hetero_spec_list = [ - learnware.specification.get_stat_spec_by_name("HeteroMapTableSpecification") for learnware in learnware_list - ] - mmd_dist_list = [] - for idx, hetero_spec in enumerate(hetero_spec_list): - mmd_dist = hetero_spec.dist(user_hetero_spec) - mmd_dist_list.append(mmd_dist) - - sorted_idx_list = sorted(range(len(learnware_list)), key=lambda k: mmd_dist_list[k]) - sorted_dist_list = [mmd_dist_list[idx] for idx in sorted_idx_list] - sorted_learnware_list = [learnware_list[idx] for idx in sorted_idx_list] - - return sorted_dist_list, sorted_learnware_list - - def _filter_by_hetero_spec_single( - self, - sorted_score_list: List[float], - learnware_list: List[Learnware], - filter_score: float = 0.5, - min_num: int = 5, - ) -> Tuple[List[float], List[Learnware]]: - idx = min(min_num, len(learnware_list)) - while idx < len(learnware_list): - if sorted_score_list[idx] < filter_score: - break - idx += 1 - return sorted_score_list[:idx], learnware_list[:idx] - - def __call__( - self, - learnware_list: List[Learnware], - user_info: BaseUserInfo, - ) -> Tuple[List[float], List[Learnware], float, List[Learnware]]: - # TODO: use specially assigned search_gamma for calculating mmd dist - user_hetero_spec = self.learnware_oganizer.generate_hetero_map_spec(user_info) - logger.info(f"After semantic search, learnware_list length is {len(learnware_list)}") - - sorted_dist_list, single_learnware_list = self._search_by_hetero_spec_single(learnware_list, user_hetero_spec) - sorted_score_list = self._convert_dist_to_score(sorted_dist_list) - - logger.info(f"After search by hetero spec, learnware_list length is {len(single_learnware_list)}") - sorted_score_list, single_learnware_list = 
self._filter_by_hetero_spec_single( - sorted_score_list, single_learnware_list - ) - - logger.info(f"After filter by hetero spec, learnware_list length is {len(single_learnware_list)}") - return sorted_score_list, single_learnware_list, None, None - - def reset(self, organizer): - self.learnware_oganizer = organizer - - class HeteroSearcher(EasySearcher): - def __init__(self, organizer: HeteroMapTableOrganizer = None): - super(HeteroSearcher, self).__init__(organizer) - self.hetero_stat_searcher = HeteroMapTableSearcher(organizer) - - def reset(self, organizer): - super().reset(organizer) - self.hetero_stat_searcher.reset(organizer) - @staticmethod def check_user_info(user_info: BaseUserInfo): try: @@ -119,8 +32,9 @@ class HeteroSearcher(EasySearcher): return False return True + except Exception as e: - logger.info( + logger.warning( f"Invalid heterogeneous search information provided. Use homogeneous search instead. Error: {e}" ) return False @@ -136,8 +50,8 @@ class HeteroSearcher(EasySearcher): if parse_specification_type(stat_specs=user_info.stat_info) is not None: if self.check_user_info(user_info): - return self.hetero_stat_searcher(learnware_list, user_info) - else: - return self.stat_searcher(learnware_list, user_info, max_search_num, search_method) + user_hetero_spec = self.learnware_organizer.generate_hetero_map_spec(user_info) + user_info.update_stat_info(user_hetero_spec.type, user_hetero_spec) + return self.stat_searcher(learnware_list, user_info, max_search_num, search_method) else: return None, learnware_list, 0.0, None From 2bb0848da7c652bd96fffd0c17ff56e9825ea177 Mon Sep 17 00:00:00 2001 From: Gene Date: Sun, 12 Nov 2023 17:18:13 +0800 Subject: [PATCH 45/90] [ENH] add AlignLearnware --- learnware/reuse/align.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 learnware/reuse/align.py diff --git a/learnware/reuse/align.py b/learnware/reuse/align.py new file mode 100644 index 0000000..04d29d4 --- /dev/null +++ 
b/learnware/reuse/align.py @@ -0,0 +1,26 @@ +from ..learnware import Learnware + + +class AlignLearnware(Learnware): + """The aligned learnware class, providing the interfaces to align learnware and make predictions""" + + def __init__(self, learnware: Learnware): + """The initialization method for align learnware + + Parameters + ---------- + learnware : Learnware + The learnware list to reuse and make predictions + """ + super(AlignLearnware, self).__init__( + id=learnware.id, + model=learnware.get_model(), + specification=learnware.get_specification(), + learnware_dirpath=learnware.get_dirpath(), + ) + self.learnware = learnware + + def align(self): + """Align the learnware with specification or data""" + + raise NotImplementedError("The align method is not implemented!") From a527f23dbeb43632a4ca6289491dc89436200dba Mon Sep 17 00:00:00 2001 From: Gene Date: Sun, 12 Nov 2023 17:20:28 +0800 Subject: [PATCH 46/90] [MNT] modify details to adapt to hetero searcher --- learnware/market/anchor/user_info.py | 4 ++-- learnware/market/base.py | 12 ++++++++++++ learnware/market/easy/__init__.py | 2 +- learnware/market/utils.py | 8 +++++++- 4 files changed, 22 insertions(+), 4 deletions(-) diff --git a/learnware/market/anchor/user_info.py b/learnware/market/anchor/user_info.py index 7ae4737..0e1ddb2 100644 --- a/learnware/market/anchor/user_info.py +++ b/learnware/market/anchor/user_info.py @@ -29,13 +29,13 @@ class AnchoredUserInfo(BaseUserInfo): self.anchor_learnware_ids += learnware_ids def update_stat_info(self, name: str, item: Any): - """Update stat_info based on anchor learnwares + """Update stat_info by market or user with anchor learnwares Parameters ---------- name : str Name of stat_info item : Any - Statistical information calculated on anchor learnwares + Statistical information calculated by market or user with anchor learnwares """ self.stat_info[name] = item diff --git a/learnware/market/base.py b/learnware/market/base.py index d061d7f..5b8a38a 100644 --- 
a/learnware/market/base.py +++ b/learnware/market/base.py @@ -42,6 +42,18 @@ class BaseUserInfo: def get_stat_info(self, name: str): return self.stat_info.get(name, None) + def update_stat_info(self, name: str, item: Any): + """Update stat_info by market + + Parameters + ---------- + name : str + Name of stat_info + item : Any + Statistical information calculated by market + """ + self.stat_info[name] = item + class LearnwareMarket: """Base interface for market, it provide the interface of search/add/detele/update learnwares""" diff --git a/learnware/market/easy/__init__.py b/learnware/market/easy/__init__.py index 988a5c6..63907a9 100644 --- a/learnware/market/easy/__init__.py +++ b/learnware/market/easy/__init__.py @@ -11,5 +11,5 @@ if not is_torch_avaliable(verbose=False): EasyStatChecker = None logger.warning("EasySeacher and EasyChecker are skipped because 'torch' is not installed!") else: - from .searcher import EasySearcher + from .searcher import EasySearcher, EasyStatSearcher, EasyFuzzSemanticSearcher, EasyExactSemanticSearcher from .checker import EasySemanticChecker, EasyStatChecker diff --git a/learnware/market/utils.py b/learnware/market/utils.py index 76d41b9..078d473 100644 --- a/learnware/market/utils.py +++ b/learnware/market/utils.py @@ -2,7 +2,13 @@ from ..specification import Specification def parse_specification_type( - stat_specs: dict, spec_list=["RKMETableSpecification", "RKMETextSpecification", "RKMEImageSpecification"] + stat_specs: dict, + spec_list=[ + "HeteroMapTableSpecification", + "RKMETableSpecification", + "RKMETextSpecification", + "RKMEImageSpecification", + ], ): for spec in spec_list: if spec in stat_specs: From c3dbc1c7121794010a1db6b348e2e705cffb5a94 Mon Sep 17 00:00:00 2001 From: Gene Date: Sun, 12 Nov 2023 17:21:10 +0800 Subject: [PATCH 47/90] [MNT] modify details to adapt to AlignLearnware --- learnware/reuse/__init__.py | 11 ++-- learnware/reuse/averaging.py | 4 +- learnware/reuse/ensemble_pruning.py | 2 +- 
learnware/reuse/feature_augment.py | 44 +++++++++------- learnware/reuse/hetero/__init__.py | 2 + .../feature_align.py | 26 ++++------ .../__init__.py => hetero/hetero_map.py} | 52 +++++++++---------- 7 files changed, 72 insertions(+), 69 deletions(-) create mode 100644 learnware/reuse/hetero/__init__.py rename learnware/reuse/{hetero_reuser => hetero}/feature_align.py (91%) rename learnware/reuse/{hetero_reuser/__init__.py => hetero/hetero_map.py} (58%) diff --git a/learnware/reuse/__init__.py b/learnware/reuse/__init__.py index 93a4153..ad47c32 100644 --- a/learnware/reuse/__init__.py +++ b/learnware/reuse/__init__.py @@ -1,3 +1,6 @@ +from .base import BaseReuser +from .align import AlignLearnware + from ..logger import get_module_logger from ..utils import is_torch_avaliable from .utils import is_geatpy_avaliable, is_lightgbm_avaliable @@ -13,15 +16,15 @@ else: if not is_torch_avaliable(verbose=False): AveragingReuser = None FeatureAugmentReuser = None - HeteroMapTableReuser = None - FeatureAlignReuser = None + HeteroMapAlignLearnware = None + FeatureAlignLearnware = None logger.warning( - "[AveragingReuser, FeatureAugmentReuser, HeteroMapTableReuser, FeatureAlignReuser] is skipped due to 'torch' is not installed!" + "[AveragingReuser, FeatureAugmentReuser, HeteroMapAlignLearnware, FeatureAlignLearnware] is skipped due to 'torch' is not installed!" 
) else: from .averaging import AveragingReuser from .feature_augment import FeatureAugmentReuser - from .hetero_reuser import HeteroMapTableReuser, FeatureAlignReuser + from .hetero import HeteroMapAlignLearnware, FeatureAlignLearnware if not is_lightgbm_avaliable(verbose=False) or not is_torch_avaliable(verbose=False): JobSelectorReuser = None diff --git a/learnware/reuse/averaging.py b/learnware/reuse/averaging.py index c12083d..abc572b 100644 --- a/learnware/reuse/averaging.py +++ b/learnware/reuse/averaging.py @@ -1,6 +1,6 @@ import torch import numpy as np -from typing import List +from typing import List, Union from scipy.special import softmax @@ -20,7 +20,7 @@ class AveragingReuser(BaseReuser): Parameters ---------- learnware_list : List[Learnware] - The learnware list + The list contains learnwares. mode : str - "mean": average the output of all learnwares for regression task (learnware output is a real number) - "vote_by_label": vote by labels for classification task, learnware output belongs to the set {0, 1, ..., class_num} diff --git a/learnware/reuse/ensemble_pruning.py b/learnware/reuse/ensemble_pruning.py index bb7db40..f937dbd 100644 --- a/learnware/reuse/ensemble_pruning.py +++ b/learnware/reuse/ensemble_pruning.py @@ -24,7 +24,7 @@ class EnsemblePruningReuser(BaseReuser): Parameters ---------- learnware_list : List[Learnware] - The learnware list + The list contains learnwares mode : str - "regression" for regression task (learnware output is a real number) - "classification" for classification task (learnware output is a logitis vector or belongs to the set {0, 1, ..., class_num}) diff --git a/learnware/reuse/feature_augment.py b/learnware/reuse/feature_augment.py index 653bfa7..ea3d27d 100644 --- a/learnware/reuse/feature_augment.py +++ b/learnware/reuse/feature_augment.py @@ -1,6 +1,6 @@ import torch import numpy as np -from typing import Union +from typing import List from sklearn.linear_model import RidgeCV, LogisticRegressionCV from .base 
import BaseReuser @@ -16,21 +16,21 @@ class FeatureAugmentReuser(BaseReuser): - "classification": Uses LogisticRegressionCV for classification tasks. """ - def __init__(self, learnware: Union[Learnware, BaseReuser] = None, mode: str = None): + def __init__(self, learnware_list: List[Learnware] = None, mode: str = None): """ Initialize the FeatureAugmentReuser with a learnware model and a mode. Parameters ---------- - learnware : Union[Learnware, BaseReuser] - A learnware model used for initial predictions. + learnware : List[Learnware] + The list contains learnwares. mode : str The mode of operation, either "regression" or "classification". """ - self.learnware = learnware + super(FeatureAugmentReuser, self).__init__(learnware_list) assert mode in ["classification", "regression"], "Mode must be either 'classification' or 'regression'" self.mode = mode - self.output_aligner = None + self.augment_reuser = None def predict(self, user_data: np.ndarray) -> np.ndarray: """ @@ -46,11 +46,11 @@ class FeatureAugmentReuser(BaseReuser): np.ndarray Predicted output from the output aligner model. """ - assert self.output_aligner is not None, "FeatureAugmentReuser is not trained by labeled data yet." + assert self.augment_reuser is not None, "FeatureAugmentReuser is not trained by labeled data yet." 
user_data = self._fill_data(user_data) user_data_aug = self._get_augment_data(user_data) - y_pred_aug = self.output_aligner.predict(user_data_aug) + y_pred_aug = self.augment_reuser.predict(user_data_aug) return y_pred_aug @@ -72,10 +72,10 @@ class FeatureAugmentReuser(BaseReuser): alpha_list = [0.01, 0.1, 1.0, 10, 100] ridge_cv = RidgeCV(alphas=alpha_list, store_cv_values=True) ridge_cv.fit(x_train_aug, y_train) - self.output_aligner = ridge_cv + self.augment_reuser = ridge_cv else: - self.output_aligner = LogisticRegressionCV(cv=5, max_iter=1000, random_state=0, multi_class="auto") - self.output_aligner.fit(x_train_aug, y_train) + self.augment_reuser = LogisticRegressionCV(cv=5, max_iter=1000, random_state=0, multi_class="auto") + self.augment_reuser.fit(x_train_aug, y_train) def _fill_data(self, X: np.ndarray) -> np.ndarray: """ @@ -125,12 +125,16 @@ class FeatureAugmentReuser(BaseReuser): TypeError If the type of model output not in [np.ndarray, torch.Tensor]. """ - y_pred = self.learnware.predict(X) - if isinstance(y_pred, torch.Tensor): - y_pred = y_pred.detach().cpu().numpy() - if not isinstance(y_pred, np.ndarray): - raise TypeError(f"Model output must be np.ndarray or torch.Tensor") - if len(y_pred.shape) == 1: - y_pred = y_pred.reshape(-1, 1) - - return np.concatenate((X, y_pred), axis=1) + y_preds = [] + for learnware in self.learnware_list: + y_pred = learnware.predict(X) + if isinstance(y_pred, torch.Tensor): + y_pred = y_pred.detach().cpu().numpy() + if not isinstance(y_pred, np.ndarray): + raise TypeError(f"Model output must be np.ndarray or torch.Tensor") + if len(y_pred.shape) == 1: + y_pred = y_pred.reshape(-1, 1) + y_preds.append(y_pred) + y_preds = np.concatenate(y_preds, axis=1) + + return np.concatenate((X, y_preds), axis=1) diff --git a/learnware/reuse/hetero/__init__.py b/learnware/reuse/hetero/__init__.py new file mode 100644 index 0000000..6f8ad23 --- /dev/null +++ b/learnware/reuse/hetero/__init__.py @@ -0,0 +1,2 @@ +from .feature_align 
import FeatureAlignLearnware +from .hetero_map import HeteroMapAlignLearnware diff --git a/learnware/reuse/hetero_reuser/feature_align.py b/learnware/reuse/hetero/feature_align.py similarity index 91% rename from learnware/reuse/hetero_reuser/feature_align.py rename to learnware/reuse/hetero/feature_align.py index 7bcd1f9..71e3d29 100644 --- a/learnware/reuse/hetero_reuser/feature_align.py +++ b/learnware/reuse/hetero/feature_align.py @@ -6,26 +6,24 @@ from typing import List from tqdm import trange import torch.nn.functional as F -from ..base import BaseReuser +from ..align import AlignLearnware from ...logger import get_module_logger from ...learnware import Learnware from ...specification import RKMETableSpecification from ...specification.regular.table.rkme import choose_device -logger = get_module_logger("hetero_feature_align") +logger = get_module_logger("feature_align") -class FeatureAlignReuser(BaseReuser): +class FeatureAlignLearnware(AlignLearnware): """ - FeatureAlignReuser is a class for aligning features from a user dataset with a target dataset using a learnware model. + FeatureAlignLearnware is a class for aligning features from a user dataset with a target dataset using a learnware model. It supports both classification and regression tasks and uses a feature alignment trainer for alignment. Attributes ---------- learnware : Learnware The learnware model used for final prediction. - mode : str - Operation mode, either "classification" or "regression". align_arguments : dict Additional arguments for the feature alignment trainer. cuda_idx : int @@ -34,32 +32,28 @@ class FeatureAlignReuser(BaseReuser): The device (CPU or CUDA) on which computations will be performed. 
""" - def __init__(self, learnware: Learnware = None, mode: str = None, cuda_idx=0, **align_arguments): + def __init__(self, learnware: Learnware = None, cuda_idx=0, **align_arguments): """ - Initialize the FeatureAlignReuser with a learnware model, mode, CUDA device index, and alignment arguments. + Initialize the FeatureAlignLearnware with a learnware model, mode, CUDA device index, and alignment arguments. Parameters ---------- learnware : Learnware A learnware model used for initial predictions. - mode : str - The mode of operation, either "regression" or "classification". cuda_idx : int The index of the CUDA device for computations. align_arguments : dict Additional arguments to be passed to the feature alignment trainer. """ - self.learnware = learnware - assert mode in ["classification", "regression"], "Mode must be either 'classification' or 'regression'" - self.mode = mode + super(FeatureAlignLearnware, self).__init__(learnware) self.align_arguments = align_arguments self.cuda_idx = cuda_idx self.device = choose_device(cuda_idx=cuda_idx) self.align_model = None - def fit(self, user_rkme: RKMETableSpecification): + def align(self, user_rkme: RKMETableSpecification): """ - Fit the align model using the RKME (Relative Knowledge Model Embeddings) specifications from the learnware model. + Train the align model using the RKME specifications from the user and the learnware. Parameters ---------- @@ -87,7 +81,7 @@ class FeatureAlignReuser(BaseReuser): np.ndarray Predicted output from the learnware model after alignment. """ - assert self.align_model is not None, "FeatureAlignReuser must be fitted before making predictions." + assert self.align_model is not None, "FeatureAlignLearnware must be aligned before making predictions." 
user_data = self._fill_data(user_data) transformed_user_data = ( self.align_model(torch.tensor(user_data, device=self.device).float()).detach().cpu().numpy() diff --git a/learnware/reuse/hetero_reuser/__init__.py b/learnware/reuse/hetero/hetero_map.py similarity index 58% rename from learnware/reuse/hetero_reuser/__init__.py rename to learnware/reuse/hetero/hetero_map.py index c794524..76f9ce0 100644 --- a/learnware/reuse/hetero_reuser/__init__.py +++ b/learnware/reuse/hetero/hetero_map.py @@ -1,12 +1,18 @@ +import numpy as np + +from ..align import AlignLearnware from ...learnware import Learnware -from ..base import BaseReuser -from .feature_align import FeatureAlignReuser +from ...logger import get_module_logger +from .feature_align import FeatureAlignLearnware from ..feature_augment import FeatureAugmentReuser +from ...specification import RKMETableSpecification + +logger = get_module_logger("hetero_map_align") -class HeteroMapTableReuser(BaseReuser): +class HeteroMapAlignLearnware(AlignLearnware): """ - HeteroMapTableReuser is a class designed for reusing learnware models with feature alignment and augmentation. + HeteroMapAlignLearnware is a class designed for reusing learnware models with feature alignment and augmentation. It can handle both classification and regression tasks and supports fine-tuning on additional training data. Attributes @@ -23,7 +29,7 @@ class HeteroMapTableReuser(BaseReuser): def __init__(self, learnware: Learnware = None, mode: str = None, cuda_idx=0, **align_arguments): """ - Initialize the HeteroMapTableReuser with a learnware model, mode, CUDA device index, and alignment arguments. + Initialize the HeteroMapAlignLearnware with a learnware model, mode, CUDA device index, and alignment arguments. Parameters ---------- @@ -36,43 +42,37 @@ class HeteroMapTableReuser(BaseReuser): align_arguments : dict Additional arguments to be passed to the feature alignment process. 
""" - self.learnware = learnware + super(HeteroMapAlignLearnware, self).__init__(learnware) assert mode in ["classification", "regression"], "Mode must be either 'classification' or 'regression'" self.mode = mode self.cuda_idx = cuda_idx self.align_arguments = align_arguments self.reuser = None - self.feature_align_reuser = None - def fit(self, user_rkme): + def align(self, user_rkme: RKMETableSpecification, x_train: np.ndarray = None, y_train: np.ndarray = None): """ - Fit the feature aligner using the user RKME (Relative Knowledge Model Embeddings) specification. + Align the hetero learnware using the user RKME specification and labeled data. Parameters ---------- user_rkme : RKMETableSpecification The RKME specification from the user dataset. - """ - self.feature_align_reuser = FeatureAlignReuser( - learnware=self.learnware, mode=self.mode, cuda_idx=self.cuda_idx, **self.align_arguments - ) - self.feature_align_reuser.fit(user_rkme) - self.reuser = self.feature_align_reuser - - def finetune(self, x_train, y_train): - """ - Fine-tune the feature aligner using additional training data. - - Parameters - ---------- x_train : ndarray Training data features. y_train : ndarray Training data labels. """ - assert self.feature_align_reuser is not None, "HeteroMapTableReuser must be fitted before fine-tuning." 
- self.reuser = FeatureAugmentReuser(learnware=self.feature_align_reuser, mode=self.mode) - self.reuser.fit(x_train, y_train) + self.feature_align_learnware = FeatureAlignLearnware( + learnware=self.learnware, cuda_idx=self.cuda_idx, **self.align_arguments + ) + self.feature_align_learnware.align(user_rkme) + + if x_train is None or y_train is None: + logger.warning("Hetero learnware may not perform well as labeled data alignment is not provided!") + self.reuser = self.feature_align_learnware + else: + self.reuser = FeatureAugmentReuser(learnware_list=[self.feature_align_learnware], mode=self.mode) + self.reuser.fit(x_train, y_train) def predict(self, user_data): """ @@ -88,5 +88,5 @@ class HeteroMapTableReuser(BaseReuser): ndarray Predicted output from the model. """ - assert self.reuser is not None, "HeteroMapTableReuser must be fitted before making predictions." + assert self.reuser is not None, "HeteroMapAlignLearnware must be aligned before making predictions." return self.reuser.predict(user_data) From 81358f604317767df4cab9fe3d9f719aa81b6c0b Mon Sep 17 00:00:00 2001 From: Gene Date: Sun, 12 Nov 2023 17:21:30 +0800 Subject: [PATCH 48/90] [MNT] modify details in tests --- tests/test_hetero_market/test_hetero.py | 39 +++++++++++++++++++------ tests/test_workflow/test_workflow.py | 2 +- 2 files changed, 31 insertions(+), 10 deletions(-) diff --git a/tests/test_hetero_market/test_hetero.py b/tests/test_hetero_market/test_hetero.py index 3a9643f..e4a3d04 100644 --- a/tests/test_hetero_market/test_hetero.py +++ b/tests/test_hetero_market/test_hetero.py @@ -15,7 +15,7 @@ from sklearn.metrics import mean_squared_error import learnware from learnware.market import instantiate_learnware_market, BaseUserInfo from learnware.specification import RKMETableSpecification, generate_rkme_table_spec -from learnware.reuse import HeteroMapTableReuser +from learnware.reuse import HeteroMapAlignLearnware, AveragingReuser, EnsemblePruningReuser from example_learnwares.config import 
( input_shape_list, input_description_list, @@ -265,6 +265,9 @@ class TestMarket(unittest.TestCase): print(f"search result of user{idx}:") for score, learnware in zip(sorted_score_list, single_learnware_list): print(f"score: {score}, learnware_id: {learnware.id}") + print( + f"mixture_score: {mixture_score}, mixture_learnware_ids: {[item.id for item in mixture_learnware_list]}" + ) # empty value of key "Task" in semantic_spec, use homo search and print invalid semantic_spec print(">> test for key 'Task' has empty 'Values':") @@ -374,14 +377,32 @@ class TestMarket(unittest.TestCase): # print search results for score, learnware in zip(sorted_score_list, single_learnware_list): print(f"score: {score}, learnware_id: {learnware.id}") - - # model reuse - reuser = HeteroMapTableReuser(single_learnware_list[0], mode="regression") - reuser.fit(user_spec) - reuser.finetune(X[:100], y[:100]) - y_pred = reuser.predict(X) - rmse = mean_squared_error(y, y_pred, squared=False) - print(f"rmse finetune: {rmse}") + print(f"mixture_score: {mixture_score}, mixture_learnware_ids: {[item.id for item in mixture_learnware_list]}") + + # single model reuse + hetero_learnware = HeteroMapAlignLearnware(single_learnware_list[0], mode="regression") + hetero_learnware.align(user_spec, X[:100], y[:100]) + single_predict_y = hetero_learnware.predict(X) + + # multi model reuse + hetero_learnware_list = [] + for learnware in mixture_learnware_list: + hetero_learnware = HeteroMapAlignLearnware(learnware, mode="regression") + hetero_learnware.align(user_spec, X[:100], y[:100]) + hetero_learnware_list.append(hetero_learnware) + + # Use averaging ensemble reuser to reuse the searched learnwares to make prediction + reuse_ensemble = AveragingReuser(learnware_list=hetero_learnware_list, mode="mean") + ensemble_predict_y = reuse_ensemble.predict(user_data=X) + + # Use ensemble pruning reuser to reuse the searched learnwares to make prediction + reuse_ensemble = 
EnsemblePruningReuser(learnware_list=hetero_learnware_list, mode="regression") + reuse_ensemble.fit(X[:100], y[:100]) + ensemble_pruning_predict_y = reuse_ensemble.predict(user_data=X) + + print("Single model RMSE by finetune:", mean_squared_error(y, single_predict_y, squared=False)) + print("Averaging Reuser RMSE:", mean_squared_error(y, ensemble_predict_y, squared=False)) + print("Ensemble Pruning Reuser RMSE:", mean_squared_error(y, ensemble_pruning_predict_y, squared=False)) def suite(): diff --git a/tests/test_workflow/test_workflow.py b/tests/test_workflow/test_workflow.py index 5adafd2..2e0b636 100644 --- a/tests/test_workflow/test_workflow.py +++ b/tests/test_workflow/test_workflow.py @@ -220,7 +220,7 @@ class TestWorkflow(unittest.TestCase): ensemble_pruning_predict_y = reuse_ensemble.predict(user_data=data_X) # Use feature augment reuser to reuse the searched learnwares to make prediction - reuse_feature_augment = FeatureAugmentReuser(learnware=reuse_ensemble, mode="classification") + reuse_feature_augment = FeatureAugmentReuser(learnware_list=mixture_learnware_list, mode="classification") reuse_feature_augment.fit(train_X[-200:], train_y[-200:]) feature_augment_predict_y = reuse_feature_augment.predict(user_data=data_X) From 8d59a4c4846dcea098b233a66f254f53f5e67f28 Mon Sep 17 00:00:00 2001 From: Gene Date: Sun, 12 Nov 2023 18:38:48 +0800 Subject: [PATCH 49/90] [MNT] modify details in hetero organizer --- .../heterogeneous/organizer/__init__.py | 21 +++++-------- .../__init__.py | 31 ++++++------------- .../feature_extractor.py | 28 +++++------------ .../{hetero_mapping => hetero_map}/trainer.py | 4 +-- 4 files changed, 26 insertions(+), 58 deletions(-) rename learnware/market/heterogeneous/organizer/{hetero_mapping => hetero_map}/__init__.py (96%) rename learnware/market/heterogeneous/organizer/{hetero_mapping => hetero_map}/feature_extractor.py (91%) rename learnware/market/heterogeneous/organizer/{hetero_mapping => hetero_map}/trainer.py (98%) diff 
--git a/learnware/market/heterogeneous/organizer/__init__.py b/learnware/market/heterogeneous/organizer/__init__.py index 51cd12a..5f80856 100644 --- a/learnware/market/heterogeneous/organizer/__init__.py +++ b/learnware/market/heterogeneous/organizer/__init__.py @@ -1,18 +1,11 @@ -from __future__ import annotations - -import copy -import multiprocessing import os -import tempfile +import copy import zipfile +import pandas as pd from collections import defaultdict from shutil import copyfile, rmtree from typing import List, Tuple -import pandas as pd -from torch import nn -import torch.multiprocessing as mp - from ....learnware import Learnware, get_learnware_from_dirpath from ....logger import get_module_logger from ....specification.system import HeteroMapTableSpecification @@ -20,9 +13,9 @@ from ...base import BaseChecker, BaseUserInfo from ...easy import EasyOrganizer from ...easy.database_ops import DatabaseOperations from ....config import C as conf -from .hetero_mapping import HeteroMapping, Trainer +from .hetero_map import HeteroMap, Trainer -logger = get_module_logger("hetero_market") +logger = get_module_logger("hetero_map_table_organizer") class HeteroMapTableOrganizer(EasyOrganizer): @@ -67,7 +60,7 @@ class HeteroMapTableOrganizer(EasyOrganizer): if os.path.exists(self.market_mapping_path): logger.info(f"Reload market mapping from checkpoint {self.market_mapping_path}") - self.market_mapping = HeteroMapping.load(checkpoint=self.market_store_path) + self.market_mapping = HeteroMap.load(checkpoint=self.market_store_path) if not rebuild: if os.path.exists(self.hetero_mappings_path): for hetero_json_path in os.listdir(self.hetero_mappings_path): @@ -83,7 +76,7 @@ class HeteroMapTableOrganizer(EasyOrganizer): self._update_learnware_by_ids(self.learnware_list.keys()) else: logger.warning(f"No market mapping to reload!!") - self.market_mapping = HeteroMapping() + self.market_mapping = HeteroMap() # rmtree(self.hetero_mappings_path) def reset(self, 
market_id=None, auto_update=False, auto_update_limit=None, **kwargs): @@ -167,7 +160,7 @@ class HeteroMapTableOrganizer(EasyOrganizer): @staticmethod def train(learnware_list: List[Learnware], save_dir: str, **kwargs): allset = HeteroMapTableOrganizer._learnwares_to_dataframes(learnware_list) - market_mapping = HeteroMapping(**kwargs) + market_mapping = HeteroMap(**kwargs) market_mapping_trainer = Trainer( model=market_mapping, train_set_list=allset, diff --git a/learnware/market/heterogeneous/organizer/hetero_mapping/__init__.py b/learnware/market/heterogeneous/organizer/hetero_map/__init__.py similarity index 96% rename from learnware/market/heterogeneous/organizer/hetero_mapping/__init__.py rename to learnware/market/heterogeneous/organizer/hetero_map/__init__.py index f37e3e7..46635be 100644 --- a/learnware/market/heterogeneous/organizer/hetero_mapping/__init__.py +++ b/learnware/market/heterogeneous/organizer/hetero_map/__init__.py @@ -1,8 +1,7 @@ import os -from typing import List, Optional - import numpy as np import pandas as pd +from typing import List, Optional import torch import torch.nn.functional as F from torch import Tensor, nn @@ -12,7 +11,7 @@ from .feature_extractor import * from .trainer import Trainer, TransTabCollatorForCL -class HeteroMapping(nn.Module): +class HeteroMap(nn.Module): def __init__( self, feature_tokenizer=None, @@ -30,8 +29,8 @@ class HeteroMapping(nn.Module): device="cuda:0", checkpoint=None, **kwargs, - ) -> None: - super(HeteroMapping, self).__init__() + ): + super(HeteroMap, self).__init__() self.model_args = { "num_partition": num_partition, @@ -90,19 +89,13 @@ class HeteroMapping(nn.Module): ---------- ckpt_dir: str the directory path to load. 
- - Returns - ------- - None - """ # load model weight state dict market_model_path = os.path.join(checkpoint, "model.bin") model_info = torch.load(market_model_path, map_location="cpu") - model = HeteroMapping(**model_info["model_args"]) + model = HeteroMap(**model_info["model_args"]) model.load_state_dict(model_info["model_state_dict"], strict=False) return model - # self.feature_tokenizer.load(checkpoint) def save(self, ckpt_dir): """Save the model state_dict and feature_tokenizer configuration @@ -112,11 +105,6 @@ class HeteroMapping(nn.Module): ---------- ckpt_dir: str the directory path to save. - - Returns - ------- - None - """ # save model weight state dict model_info = { @@ -174,7 +162,7 @@ class HeteroMapping(nn.Module): Parameters ---------- x: pd.DataFrame or dict - pd.DataFrame: a batch of raw tabular samples; dict: the output of TransTabFeatureExtractor. + pd.DataFrame: a batch of raw tabular samples; dict: the output of feature_tokenizer Returns ------- @@ -190,7 +178,7 @@ class HeteroMapping(nn.Module): elif isinstance(x, torch.Tensor): inputs = self.feature_tokenizer.forward(cols, x) else: - raise ValueError(f"TransTabOutputFeatureExtractor takes inputs with dict or pd.DataFrame, find {type(x)}.") + raise ValueError(f"feature_tokenizer takes inputs with dict or pd.DataFrame, find {type(x)}.") outputs = self.feature_processor(**inputs) # outputs is dict, "embedding" and "mask" outputs = self.cls_token(**outputs) # add the cls embedding @@ -284,7 +272,7 @@ class TransformerLayer(nn.Module): device=None, dtype=None, use_layer_norm=True, - ) -> None: + ): factory_kwargs = {"device": device, "dtype": dtype} super().__init__() self.self_attn = nn.MultiheadAttention(d_model, nhead, batch_first=batch_first, **factory_kwargs) @@ -314,7 +302,6 @@ class TransformerLayer(nn.Module): # self-attention block def _sa_block(self, x: Tensor, attn_mask: Optional[Tensor], key_padding_mask: Optional[Tensor]) -> Tensor: - src = x key_padding_mask = 
~key_padding_mask.bool() x = self.self_attn( x, @@ -339,7 +326,7 @@ class TransformerLayer(nn.Module): super().__setstate__(state) def forward(self, src, src_mask=None, src_key_padding_mask=None, is_causal=None, **kwargs) -> Tensor: - r"""Pass the input through the encoder layer. + """Pass the input through the encoder layer. Args: src: the sequence to the encoder layer (required). diff --git a/learnware/market/heterogeneous/organizer/hetero_mapping/feature_extractor.py b/learnware/market/heterogeneous/organizer/hetero_map/feature_extractor.py similarity index 91% rename from learnware/market/heterogeneous/organizer/hetero_mapping/feature_extractor.py rename to learnware/market/heterogeneous/organizer/hetero_map/feature_extractor.py index 66eea79..e47d702 100644 --- a/learnware/market/heterogeneous/organizer/hetero_mapping/feature_extractor.py +++ b/learnware/market/heterogeneous/organizer/hetero_map/feature_extractor.py @@ -1,6 +1,5 @@ -import json -import math import os +import math from typing import Dict import numpy as np @@ -22,7 +21,7 @@ class WordEmbedding(nn.Module): padding_idx=0, hidden_dropout_prob=0, layer_norm_eps=1e-5, - ) -> None: + ): super().__init__() self.word_embeddings = nn.Embedding(vocab_size, hidden_dim, padding_idx) nn_init.kaiming_normal_(self.word_embeddings.weight) @@ -41,7 +40,7 @@ class NumEmbedding(nn.Module): Encode tokens drawn from column names and the corresponding numerical features. """ - def __init__(self, hidden_dim) -> None: + def __init__(self, hidden_dim): super().__init__() self.norm = nn.LayerNorm(hidden_dim) self.num_bias = nn.Parameter(Tensor(1, 1, hidden_dim)) # add bias @@ -58,7 +57,7 @@ class NumEmbedding(nn.Module): class FeatureTokenizer: - r""" + """ Process input dataframe to input indices towards encoder, usually used to build dataloader for paralleling loading. 
""" @@ -67,7 +66,7 @@ class FeatureTokenizer: self, disable_tokenizer_parallel=True, **kwargs, - ) -> None: + ): """args: disable_tokenizer_parallel: true if use extractor for collator function in torch.DataLoader """ @@ -122,7 +121,6 @@ class FeatureTokenizer: return encoded_inputs - # ------------------------ New function ------------------------ def forward(self, cols, x) -> Dict: """ Parameters @@ -157,19 +155,9 @@ class FeatureTokenizer: return encoded_inputs - # def save(self, path): - # """save the feature extractor configuration to local dir.""" - # self.tokenizer.save_pretrained(os.path.join(path, conf.market_tokenizer_path)) - - # def load(self, path): - # """load the feature extractor configuration from local dir.""" - # tokenizer_path = os.path.join(path, conf.market_tokenizer_path) - # if os.path.exists(tokenizer_path): - # self.tokenizer = BertTokenizerFast.from_pretrained(os.path.join(path, conf.market_tokenizer_path)) - class FeatureProcessor(nn.Module): - r""" + """ Process inputs from feature extractor to map them to embeddings. 
""" @@ -180,7 +168,7 @@ class FeatureProcessor(nn.Module): hidden_dropout_prob=0, pad_token_id=0, device="cuda:0", - ) -> None: + ): super().__init__() self.word_embedding = WordEmbedding( vocab_size=vocab_size, @@ -229,7 +217,7 @@ class FeatureProcessor(nn.Module): class CLSToken(nn.Module): """add a learnable cls token embedding at the end of each sequence.""" - def __init__(self, hidden_dim) -> None: + def __init__(self, hidden_dim): super().__init__() self.weight = nn.Parameter(Tensor(hidden_dim)) nn_init.uniform_(self.weight, a=-1 / math.sqrt(hidden_dim), b=1 / math.sqrt(hidden_dim)) diff --git a/learnware/market/heterogeneous/organizer/hetero_mapping/trainer.py b/learnware/market/heterogeneous/organizer/hetero_map/trainer.py similarity index 98% rename from learnware/market/heterogeneous/organizer/hetero_mapping/trainer.py rename to learnware/market/heterogeneous/organizer/hetero_map/trainer.py index e9083a0..8208d00 100644 --- a/learnware/market/heterogeneous/organizer/hetero_mapping/trainer.py +++ b/learnware/market/heterogeneous/organizer/hetero_map/trainer.py @@ -174,7 +174,7 @@ class TrainDataset(Dataset): class TransTabCollatorForCL: - """support positive pair sampling for contrastive learning of transtab model.""" + """support positive pair sampling for contrastive learning.""" def __init__( self, @@ -182,7 +182,7 @@ class TransTabCollatorForCL: overlap_ratio=0.5, num_partition=3, **kwargs, - ) -> None: + ): self.feature_tokenizer = feature_tokenizer or FeatureTokenizer(disable_tokenizer_parallel=True) assert num_partition > 0, f"number of contrastive subsets must be greater than 0, got {num_partition}" assert isinstance(num_partition, int), f"number of constrative subsets must be int, got {type(num_partition)}" From 3191fa3a6a11791b0c4bf6b36c0e7192a7cb7e5a Mon Sep 17 00:00:00 2001 From: Gene Date: Sun, 12 Nov 2023 18:57:22 +0800 Subject: [PATCH 50/90] [MNT] remove extra pass --- learnware/client/package_utils.py | 1 - learnware/config.py | 2 - 
learnware/market/easy/database_ops.py | 50 +++++-------------- learnware/market/easy/organizer.py | 3 +- learnware/market/easy/searcher.py | 4 -- .../heterogeneous/organizer/__init__.py | 1 - 6 files changed, 13 insertions(+), 48 deletions(-) diff --git a/learnware/client/package_utils.py b/learnware/client/package_utils.py index cb99c85..492cf3d 100644 --- a/learnware/client/package_utils.py +++ b/learnware/client/package_utils.py @@ -109,7 +109,6 @@ def filter_nonexist_conda_packages(packages: list) -> Tuple[List[str], List[str] last_bracket = stdout.rfind("\n{") if last_bracket != -1: stdout = stdout[last_bracket:] - pass print(stdout) output = json.loads(stdout).get("bad_deps", []) diff --git a/learnware/config.py b/learnware/config.py index 30b1365..45c04b1 100644 --- a/learnware/config.py +++ b/learnware/config.py @@ -13,8 +13,6 @@ class Config: if os.path.exists(config_file): with open(config_file, "r") as f: self.__dict__["_config"].update(json.load(f)) - pass - pass def __getitem__(self, key): return self.__dict__["_config"][key] diff --git a/learnware/market/easy/database_ops.py b/learnware/market/easy/database_ops.py index e128653..ed4a319 100644 --- a/learnware/market/easy/database_ops.py +++ b/learnware/market/easy/database_ops.py @@ -19,8 +19,6 @@ class Learnware(DeclarativeBase): folder_path = Column(Text, nullable=False) use_flag = Column(Text, nullable=False) - pass - class DatabaseOperations(object): def __init__(self, url: str, database_name: str): @@ -28,13 +26,10 @@ class DatabaseOperations(object): url = os.path.join(url, f"{database_name}.db") else: url = f"{url}/{database_name}" - pass self.url = url self.create_database_if_not_exists(url) - pass - def create_database_if_not_exists(self, url): database_exists = True @@ -44,12 +39,10 @@ class DatabaseOperations(object): path = url[start + 4 :] if os.path.exists(path): database_exists = True - pass else: database_exists = False os.makedirs(os.path.dirname(path), exist_ok=True) - pass - pass + 
elif self.url.startswith("postgresql"): # it is postgresql dbname_start = url.rfind("/") @@ -63,37 +56,27 @@ class DatabaseOperations(object): for row in result.fetchall(): db_list.add(row[0].lower()) - pass if dbname.lower() not in db_list: database_exists = False conn.execution_options(isolation_level="AUTOCOMMIT").execute( text("CREATE DATABASE {0};".format(dbname)) ) - pass else: database_exists = True - pass - pass engine.dispose() - pass else: raise Exception(f"Unsupported database url: {self.url}") - pass self.engine = create_engine(url, future=True) if not database_exists: DeclarativeBase.metadata.create_all(self.engine) - pass - pass def clear_learnware_table(self): with self.engine.connect() as conn: conn.execute(text("DELETE FROM tb_learnware;")) conn.commit() - pass - pass def add_learnware(self, id: str, semantic_spec: dict, zip_path, folder_path, use_flag: str): with self.engine.connect() as conn: @@ -114,15 +97,11 @@ class DatabaseOperations(object): ), ) conn.commit() - pass - pass def delete_learnware(self, id: str): with self.engine.connect() as conn: conn.execute(text("DELETE FROM tb_learnware WHERE id=:id;"), dict(id=id)) conn.commit() - pass - pass def update_learnware_semantic_specification(self, id: str, semantic_spec: dict): with self.engine.connect() as conn: @@ -132,8 +111,6 @@ class DatabaseOperations(object): dict(id=id, semantic_spec=semantic_spec_str), ) conn.commit() - pass - pass def update_learnware_use_flag(self, id: str, use_flag: str): with self.engine.connect() as conn: @@ -142,8 +119,6 @@ class DatabaseOperations(object): dict(id=id, use_flag=use_flag), ) conn.commit() - pass - pass def get_learnware_semantic_specification(self, id: str): with self.engine.connect() as conn: @@ -153,8 +128,6 @@ class DatabaseOperations(object): return None else: return json.loads(row[0]) - pass - pass def get_learnware_use_flag(self, id: str): with self.engine.connect() as conn: @@ -164,12 +137,13 @@ class DatabaseOperations(object): return None 
else: return int(row[0]) - pass - pass def get_learnware_info(self, id: str): with self.engine.connect() as conn: - r = conn.execute(text("SELECT semantic_spec, zip_path, folder_path, use_flag FROM tb_learnware WHERE id=:id;"), dict(id=id)) + r = conn.execute( + text("SELECT semantic_spec, zip_path, folder_path, use_flag FROM tb_learnware WHERE id=:id;"), + dict(id=id), + ) row = r.fetchone() if row is None: return None @@ -178,9 +152,12 @@ class DatabaseOperations(object): zip_path = row[1] folder_path = row[2] use_flag = int(row[3]) - return {'semantic_spec': semantic_spec, 'zip_path': zip_path, 'folder_path': folder_path, 'use_flag': use_flag} - pass - pass + return { + "semantic_spec": semantic_spec, + "zip_path": zip_path, + "folder_path": folder_path, + "use_flag": use_flag, + } def load_market(self): with self.engine.connect() as conn: @@ -205,8 +182,5 @@ class DatabaseOperations(object): folder_list[id] = folder_path use_flags[id] = int(use_flag) max_count = max(max_count, int(id)) - pass - return learnware_list, zip_list, folder_list, use_flags, max_count + 1 - pass - pass + return learnware_list, zip_list, folder_list, use_flags, max_count + 1 diff --git a/learnware/market/easy/organizer.py b/learnware/market/easy/organizer.py index 27b974d..0c25b17 100644 --- a/learnware/market/easy/organizer.py +++ b/learnware/market/easy/organizer.py @@ -145,7 +145,7 @@ class EasyOrganizer(BaseOrganizer): zip_dir = self.learnware_zip_list[id] if os.path.exists(zip_dir): os.remove(zip_dir) - pass + folder_dir = self.learnware_folder_list[id] rmtree(folder_dir, ignore_errors=True) self.learnware_list.pop(id) @@ -390,7 +390,6 @@ class EasyOrganizer(BaseOrganizer): id=learnware_id, semantic_spec=semantic_spec, learnware_dirpath=target_folder_dir ) self.use_flags[learnware_id] = self.dbops.get_learnware_use_flag(learnware_id) - pass def get_learnware_info_from_storage(self, learnware_id: str) -> Dict: """return learnware zip path and semantic_specification from storage diff 
--git a/learnware/market/easy/searcher.py b/learnware/market/easy/searcher.py index 5340c0e..8de00af 100644 --- a/learnware/market/easy/searcher.py +++ b/learnware/market/easy/searcher.py @@ -38,7 +38,6 @@ class EasyExactSemanticSearcher(BaseSearcher): v1 = v1.lower() if v1 not in name2 and v1 not in description2: return False - pass else: if len(v2) == 0: # user input contains some key that is not in database @@ -54,9 +53,6 @@ class EasyExactSemanticSearcher(BaseSearcher): elif semantic_spec1[key]["Type"] == "Tag": if not (set(v1) & set(v2)): return False - pass - pass - pass return True diff --git a/learnware/market/heterogeneous/organizer/__init__.py b/learnware/market/heterogeneous/organizer/__init__.py index 5f80856..963fd9b 100644 --- a/learnware/market/heterogeneous/organizer/__init__.py +++ b/learnware/market/heterogeneous/organizer/__init__.py @@ -43,7 +43,6 @@ class HeteroMapTableOrganizer(EasyOrganizer): rmtree(self.learnware_pool_path) except Exception as err: logger.warning(f"Clear current database failed due to {err}!!") - pass os.makedirs(self.learnware_pool_path, exist_ok=True) os.makedirs(self.learnware_zip_pool_path, exist_ok=True) From a0d1e42bf3cb3e6f9791f0f3dbc7d287282bca43 Mon Sep 17 00:00:00 2001 From: Gene Date: Mon, 13 Nov 2023 01:59:11 +0800 Subject: [PATCH 51/90] [MNT] add a logger in EasyOrganizer --- learnware/market/easy/organizer.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/learnware/market/easy/organizer.py b/learnware/market/easy/organizer.py index 0c25b17..412664b 100644 --- a/learnware/market/easy/organizer.py +++ b/learnware/market/easy/organizer.py @@ -41,8 +41,8 @@ class EasyOrganizer(BaseOrganizer): try: self.dbops.clear_learnware_table() rmtree(self.learnware_pool_path) - except: - pass + except Exception as err: + logger.error(f"Clear current database failed due to {err}!!") os.makedirs(self.learnware_pool_path, exist_ok=True) os.makedirs(self.learnware_zip_pool_path, exist_ok=True) @@ -97,6 
+97,7 @@ class EasyOrganizer(BaseOrganizer): id=learnware_id, semantic_spec=semantic_spec, learnware_dirpath=target_folder_dir ) except: + logger.warning("New learnware is not properly added!") try: os.remove(target_zip_dir) rmtree(target_folder_dir) From 5949373109d861dc2f0521cd0a714218722e2121 Mon Sep 17 00:00:00 2001 From: Gene Date: Mon, 13 Nov 2023 02:00:22 +0800 Subject: [PATCH 52/90] [MNT | ENH] refactor and add delete, update and reload in hetero organizer --- .../heterogeneous/organizer/__init__.py | 264 ++++++++---------- 1 file changed, 110 insertions(+), 154 deletions(-) diff --git a/learnware/market/heterogeneous/organizer/__init__.py b/learnware/market/heterogeneous/organizer/__init__.py index 963fd9b..a1fc9bb 100644 --- a/learnware/market/heterogeneous/organizer/__init__.py +++ b/learnware/market/heterogeneous/organizer/__init__.py @@ -1,201 +1,153 @@ import os -import copy -import zipfile import pandas as pd from collections import defaultdict -from shutil import copyfile, rmtree -from typing import List, Tuple +from typing import List, Tuple, Union -from ....learnware import Learnware, get_learnware_from_dirpath +from ....learnware import Learnware from ....logger import get_module_logger -from ....specification.system import HeteroMapTableSpecification +from ....specification import RKMETableSpecification, HeteroMapTableSpecification from ...base import BaseChecker, BaseUserInfo from ...easy import EasyOrganizer -from ...easy.database_ops import DatabaseOperations -from ....config import C as conf from .hetero_map import HeteroMap, Trainer logger = get_module_logger("hetero_map_table_organizer") class HeteroMapTableOrganizer(EasyOrganizer): - def reload_market(self, rebuild=False, auto_update_limit=100): - self.market_store_path = os.path.join(conf.root_path, self.market_id) - self.market_mapping_path = os.path.join(self.market_store_path, "model.bin") - self.learnware_pool_path = os.path.join(self.market_store_path, "learnware_pool") - 
self.learnware_zip_pool_path = os.path.join(self.market_store_path, "zips") - self.learnware_folder_pool_path = os.path.join(self.market_store_path, "unzipped_learnwares") - self.hetero_mappings_path = os.path.join(self.market_store_path, "hetero_mappings") - self.learnware_list = {} # id:learnware - self.learnware_zip_list = {} - self.learnware_folder_list = {} - self.count = 0 - self.training_count = 1 - self.last_training_count = 0 - self.dbops = DatabaseOperations(conf.database_url, "market_" + self.market_id) - self.auto_update = False + def reload_market(self, rebuild=False, auto_update=False, auto_update_limit=100): + super().reload_market(rebuild=rebuild) + self.auto_update = auto_update self.auto_update_limit = auto_update_limit + self.count_down = auto_update_limit - if rebuild: - logger.warning("Warning! You are trying to clear current database!") - try: - self.dbops.clear_learnware_table() - rmtree(self.learnware_pool_path) - except Exception as err: - logger.warning(f"Clear current database failed due to {err}!!") - - os.makedirs(self.learnware_pool_path, exist_ok=True) - os.makedirs(self.learnware_zip_pool_path, exist_ok=True) - os.makedirs(self.learnware_folder_pool_path, exist_ok=True) - os.makedirs(self.hetero_mappings_path, exist_ok=True) - - ( - self.learnware_list, - self.learnware_zip_list, - self.learnware_folder_list, - self.use_flags, - self.count, - ) = self.dbops.load_market() + hetero_folder_path = os.path.join(self.market_store_path, "hetero") + os.makedirs(hetero_folder_path, exist_ok=True) + self.market_mapping_path = os.path.join(hetero_folder_path, "model.bin") + self.hetero_specs_path = os.path.join(hetero_folder_path, "hetero_specifications") if os.path.exists(self.market_mapping_path): logger.info(f"Reload market mapping from checkpoint {self.market_mapping_path}") self.market_mapping = HeteroMap.load(checkpoint=self.market_store_path) if not rebuild: - if os.path.exists(self.hetero_mappings_path): - for hetero_json_path in 
os.listdir(self.hetero_mappings_path): - idx = hetero_json_path.split(".")[0] - hetero_spec = HeteroMapTableSpecification() - hetero_spec.load(os.path.join(self.hetero_mappings_path, f"{idx}.json")) + if os.path.exists(self.hetero_specs_path): + for hetero_json_path in os.listdir(self.hetero_specs_path): try: - self.learnware_list[idx].update_stat_spec("HeteroMapTableSpecification", hetero_spec) + idx = hetero_json_path.split(".")[0] + hetero_spec = HeteroMapTableSpecification() + hetero_spec.load(os.path.join(self.hetero_specs_path, f"{idx}.json")) + self.learnware_list[idx].update_stat_spec(hetero_spec.type, hetero_spec) except: - logger.warning(f"Learnware ID {idx} NOT Found!") + logger.warning(f"Learnware {idx} hetero spec loaded failed!") else: logger.info("No HeteroMapTableSpecification to reload. Use loaded market mapping to regenerate.") - self._update_learnware_by_ids(self.learnware_list.keys()) + os.makedirs(self.hetero_specs_path, exist_ok=True) + self._update_learnware_by_ids(self.get_learnware_ids(check_status=BaseChecker.USABLE_LEARWARE)) else: - logger.warning(f"No market mapping to reload!!") + logger.warning(f"No market mapping to reload!") self.market_mapping = HeteroMap() - # rmtree(self.hetero_mappings_path) - def reset(self, market_id=None, auto_update=False, auto_update_limit=None, **kwargs): - self.auto_update = auto_update + def reset(self, market_id=None, auto_update=False, auto_update_limit=100, **training_args): self.market_id = market_id - self.training_args = kwargs - if auto_update_limit is not None: - self.auto_update_limit = auto_update_limit + self.auto_update = auto_update + self.auto_update_limit = auto_update_limit + self.training_args = training_args def add_learnware( self, zip_path: str, semantic_spec: dict, check_status: int, learnware_id: str = None ) -> Tuple[str, int]: - if check_status == BaseChecker.INVALID_LEARNWARE: - logger.warning("Learnware is invalid!") - return None, BaseChecker.INVALID_LEARNWARE - - semantic_spec 
= copy.deepcopy(semantic_spec) - logger.info("Get new learnware from %s" % (zip_path)) - - learnware_id = "%08d" % (self.count) if learnware_id is None else learnware_id - target_zip_dir = os.path.join(self.learnware_zip_pool_path, "%s.zip" % (learnware_id)) - target_folder_dir = os.path.join(self.learnware_folder_pool_path, learnware_id) - copyfile(zip_path, target_zip_dir) + learnware_id, learnwere_status = super().add_learnware(zip_path, semantic_spec, check_status, learnware_id) + + if learnwere_status == BaseChecker.USABLE_LEARWARE and len(self._get_hetero_learnware_ids(learnware_id)): + self._update_learnware_by_ids(learnware_id) + + if self.auto_update: + self.count_down -= 1 + if self.count_down == 0: + training_learnware_ids = self._get_hetero_learnware_ids( + self.get_learnware_ids(check_status=BaseChecker.USABLE_LEARWARE) + ) + training_learnwares = self.get_learnware_by_ids(training_learnware_ids) + logger.info(f"Verified leanwares for training: {training_learnware_ids}") + updated_market_mapping = self.train( + learnware_list=training_learnwares, save_dir=self.market_store_path, **self.training_args + ) + logger.info( + f"Market mapping train completed. 
Now update HeteroMapTableSpecification for {training_learnware_ids}" + ) + self.market_mapping = updated_market_mapping + self._update_learnware_by_ids(training_learnware_ids) + else: + self.count_down = self.auto_update_limit - with zipfile.ZipFile(target_zip_dir, "r") as z_file: - z_file.extractall(target_folder_dir) - logger.info("Learnware move to %s, and unzip to %s" % (target_zip_dir, target_folder_dir)) + return learnware_id, learnwere_status - try: - new_learnware = get_learnware_from_dirpath( - id=learnware_id, semantic_spec=semantic_spec, learnware_dirpath=target_folder_dir - ) - except: - logger.info("New Learnware Not Properly Added!!!") + def delete_learnware(self, id: str) -> bool: + flag = super().delete_learnware(id) + if flag: + hetero_spec_path = os.path.join(self.hetero_specs_path, f"{id}.json") try: - os.remove(target_zip_dir) - rmtree(target_folder_dir) - except: + os.remove(hetero_spec_path) + except FileNotFoundError: pass - return None, BaseChecker.INVALID_LEARNWARE - - if new_learnware is None: - return None, BaseChecker.INVALID_LEARNWARE - - learnwere_status = check_status if check_status is not None else BaseChecker.NONUSABLE_LEARNWARE - - self.dbops.add_learnware( - id=learnware_id, - semantic_spec=semantic_spec, - zip_path=target_zip_dir, - folder_path=target_folder_dir, - use_flag=learnwere_status, - ) - - self.learnware_list[learnware_id] = new_learnware - self.learnware_zip_list[learnware_id] = target_zip_dir - self.learnware_folder_list[learnware_id] = target_folder_dir - self.use_flags[learnware_id] = learnwere_status - self._update_learnware_by_ids([learnware_id]) - self.count += 1 - self.training_count += [learnware_id] == self._get_table_type_learnware_ids([learnware_id]) - - if self.auto_update and self.training_count - self.last_training_count == self.auto_update_limit + 1: - training_learnware_ids = self._get_table_type_learnware_ids(self.get_learnware_ids()) - training_learnwares = 
self.get_learnware_by_ids(training_learnware_ids) - logger.warning(f"Leanwares for training: {training_learnware_ids}") - - updated_market_mapping = self.train( - learnware_list=training_learnwares, save_dir=self.market_store_path, **self.training_args - ) - - logger.warning( - f"Market mapping train completed. Now update HeteroMapTableSpecification for {training_learnware_ids}" - ) - self.market_mapping = updated_market_mapping - self._update_learnware_by_ids(training_learnware_ids) - self.last_training_count = len(training_learnware_ids) - - return learnware_id, learnwere_status - - @staticmethod - def train(learnware_list: List[Learnware], save_dir: str, **kwargs): - allset = HeteroMapTableOrganizer._learnwares_to_dataframes(learnware_list) - market_mapping = HeteroMap(**kwargs) - market_mapping_trainer = Trainer( - model=market_mapping, - train_set_list=allset, - collate_fn=market_mapping.collate_fn, - **kwargs, - ) + return flag - market_mapping_trainer.train() - market_mapping_trainer.save_model(output_dir=save_dir) + def update_learnware(self, id: str, zip_path: str = None, semantic_spec: dict = None, check_status: int = None): + final_status = super().update_learnware(id, zip_path, semantic_spec, check_status) + if final_status == BaseChecker.USABLE_LEARWARE and len(self._get_hetero_learnware_ids(id)): + self._update_learnware_by_ids(id) + return final_status - return market_mapping + def reload_learnware(self, learnware_id: str): + super().reload_learnware(learnware_id) + try: + hetero_spec_path = os.path.join(self.hetero_specs_path, f"{learnware_id}.json") + if os.path.exists(hetero_spec_path): + hetero_spec = HeteroMapTableSpecification() + hetero_spec.load(hetero_spec_path) + self.learnware_list[learnware_id].update_stat_spec(hetero_spec.type, hetero_spec) + except: + logger.warning(f"Learnware {learnware_id} hetero spec loaded failed!") - def _update_learnware_by_ids(self, ids: List[str]): - ids = self._get_table_type_learnware_ids(ids) - for id in 
ids: + def _update_learnware_by_ids(self, ids: Union[str, List[str]]): + ids = self._get_hetero_learnware_ids(ids) + for idx in ids: try: - spec = self.learnware_list[id].get_specification() + spec = self.learnware_list[idx].get_specification() semantic_spec, stat_spec = spec.get_semantic_spec(), spec.get_stat_spec()["RKMETableSpecification"] - features = semantic_spec["Input"]["Description"].values() - hetero_spec = self.market_mapping.hetero_mapping(stat_spec, features) - self.learnware_list[id].update_stat_spec("HeteroMapTableSpecification", hetero_spec) + features = semantic_spec["Input"]["Description"] + save_path = os.path.join(self.hetero_specs_path, f"{idx}.json") - save_path = os.path.join(self.hetero_mappings_path, f"{id}.json") + hetero_spec = self.market_mapping.hetero_mapping(stat_spec, features) + self.learnware_list[idx].update_stat_spec(hetero_spec.type, hetero_spec) hetero_spec.save(save_path) + except Exception as err: - logger.warning(f"Learnware {id} generate HeteroMapTableSpecification failed! Due to {err}") + logger.warning(f"Learnware {idx} generate HeteroMapTableSpecification failed! 
Due to {err}") + + def _get_hetero_learnware_ids(self, ids: Union[str, List[str]]) -> List[str]: + if isinstance(ids, str): + ids = [ids] + + ret = [] + for idx in ids: + try: + spec = self.learnware_list[idx].get_specification() + semantic_spec, rkme = spec.get_semantic_spec(), spec.get_stat_spec().get("RKMETableSpecification", None) + if isinstance(rkme, RKMETableSpecification) and isinstance(semantic_spec["Input"], dict): + ret.append(idx) + except: + continue + return ret def generate_hetero_map_spec(self, user_info: BaseUserInfo) -> HeteroMapTableSpecification: user_stat_spec = user_info.stat_info["RKMETableSpecification"] user_features = user_info.get_semantic_spec()["Input"]["Description"].values() - user_hetero_spec = self.market_mapping.hetero_mapping(user_stat_spec, user_features) return user_hetero_spec @staticmethod - def _learnwares_to_dataframes(learnware_list: List[Learnware]) -> List[pd.DataFrame]: + def train(learnware_list: List[Learnware], save_dir: str, **kwargs) -> HeteroMap: + # Convert learnware to dataframe learnware_df_dict = defaultdict(list) for learnware in learnware_list: spec = learnware.get_specification() @@ -203,13 +155,17 @@ class HeteroMapTableOrganizer(EasyOrganizer): features = spec.get_semantic_spec()["Input"]["Description"] learnware_df = pd.DataFrame(data=stat_spec.get_z(), columns=features.values()) learnware_df_dict[tuple(sorted(features))].append(learnware_df) + allset = [pd.concat(dfs) for dfs in learnware_df_dict.values()] - return [pd.concat(dfs) for dfs in learnware_df_dict.values()] + # Train market mapping + market_mapping = HeteroMap(**kwargs) + market_mapping_trainer = Trainer( + model=market_mapping, + train_set_list=allset, + collate_fn=market_mapping.collate_fn, + **kwargs, + ) + market_mapping_trainer.train() + market_mapping_trainer.save_model(output_dir=save_dir) - def _get_table_type_learnware_ids(self, ids: List[str]) -> List[str]: - ret = [] - for id in ids: - semantic_spec = 
self.learnware_list[id].get_specification().get_semantic_spec() - if semantic_spec["Data"]["Values"][0] == "Table": - ret.append(id) - return ret + return market_mapping From 9df7c489a538821ce3e6aa09117667c71c03ef74 Mon Sep 17 00:00:00 2001 From: liuht Date: Mon, 13 Nov 2023 15:57:37 +0800 Subject: [PATCH 53/90] [FIX] fix train countdown update --- learnware/market/heterogeneous/organizer/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/learnware/market/heterogeneous/organizer/__init__.py b/learnware/market/heterogeneous/organizer/__init__.py index a1fc9bb..4a44112 100644 --- a/learnware/market/heterogeneous/organizer/__init__.py +++ b/learnware/market/heterogeneous/organizer/__init__.py @@ -76,7 +76,7 @@ class HeteroMapTableOrganizer(EasyOrganizer): ) self.market_mapping = updated_market_mapping self._update_learnware_by_ids(training_learnware_ids) - else: + self.count_down = self.auto_update_limit return learnware_id, learnwere_status From 4c591c3ee79f147452d31250d5d3f303c18e999d Mon Sep 17 00:00:00 2001 From: liuht Date: Mon, 13 Nov 2023 18:04:40 +0800 Subject: [PATCH 54/90] [FIX | MNT] fix model save_dir bugs --- .../heterogeneous/organizer/__init__.py | 9 ++++--- .../organizer/hetero_map/__init__.py | 25 ++++++++++--------- .../organizer/hetero_map/trainer.py | 17 ------------- 3 files changed, 18 insertions(+), 33 deletions(-) diff --git a/learnware/market/heterogeneous/organizer/__init__.py b/learnware/market/heterogeneous/organizer/__init__.py index 4a44112..5a0418e 100644 --- a/learnware/market/heterogeneous/organizer/__init__.py +++ b/learnware/market/heterogeneous/organizer/__init__.py @@ -24,10 +24,11 @@ class HeteroMapTableOrganizer(EasyOrganizer): os.makedirs(hetero_folder_path, exist_ok=True) self.market_mapping_path = os.path.join(hetero_folder_path, "model.bin") self.hetero_specs_path = os.path.join(hetero_folder_path, "hetero_specifications") + os.makedirs(self.hetero_specs_path, exist_ok=True) if 
os.path.exists(self.market_mapping_path): logger.info(f"Reload market mapping from checkpoint {self.market_mapping_path}") - self.market_mapping = HeteroMap.load(checkpoint=self.market_store_path) + self.market_mapping = HeteroMap.load(checkpoint=self.market_mapping_path) if not rebuild: if os.path.exists(self.hetero_specs_path): for hetero_json_path in os.listdir(self.hetero_specs_path): @@ -40,7 +41,6 @@ class HeteroMapTableOrganizer(EasyOrganizer): logger.warning(f"Learnware {idx} hetero spec loaded failed!") else: logger.info("No HeteroMapTableSpecification to reload. Use loaded market mapping to regenerate.") - os.makedirs(self.hetero_specs_path, exist_ok=True) self._update_learnware_by_ids(self.get_learnware_ids(check_status=BaseChecker.USABLE_LEARWARE)) else: logger.warning(f"No market mapping to reload!") @@ -50,6 +50,7 @@ class HeteroMapTableOrganizer(EasyOrganizer): self.market_id = market_id self.auto_update = auto_update self.auto_update_limit = auto_update_limit + self.count_down = auto_update_limit self.training_args = training_args def add_learnware( @@ -69,7 +70,7 @@ class HeteroMapTableOrganizer(EasyOrganizer): training_learnwares = self.get_learnware_by_ids(training_learnware_ids) logger.info(f"Verified leanwares for training: {training_learnware_ids}") updated_market_mapping = self.train( - learnware_list=training_learnwares, save_dir=self.market_store_path, **self.training_args + learnware_list=training_learnwares, save_dir=self.market_mapping_path, **self.training_args ) logger.info( f"Market mapping train completed. 
Now update HeteroMapTableSpecification for {training_learnware_ids}" @@ -141,7 +142,7 @@ class HeteroMapTableOrganizer(EasyOrganizer): def generate_hetero_map_spec(self, user_info: BaseUserInfo) -> HeteroMapTableSpecification: user_stat_spec = user_info.stat_info["RKMETableSpecification"] - user_features = user_info.get_semantic_spec()["Input"]["Description"].values() + user_features = user_info.get_semantic_spec()["Input"]["Description"] user_hetero_spec = self.market_mapping.hetero_mapping(user_stat_spec, user_features) return user_hetero_spec diff --git a/learnware/market/heterogeneous/organizer/hetero_map/__init__.py b/learnware/market/heterogeneous/organizer/hetero_map/__init__.py index 46635be..2316c88 100644 --- a/learnware/market/heterogeneous/organizer/hetero_map/__init__.py +++ b/learnware/market/heterogeneous/organizer/hetero_map/__init__.py @@ -83,36 +83,34 @@ class HeteroMap(nn.Module): @staticmethod def load(checkpoint=None): """Load the model state_dict and feature_tokenizer configuration - from the ``ckpt_dir``. + from the ``checkpoint``. Parameters ---------- - ckpt_dir: str + checkpoint: str the directory path to load. """ # load model weight state dict - market_model_path = os.path.join(checkpoint, "model.bin") - model_info = torch.load(market_model_path, map_location="cpu") + model_info = torch.load(checkpoint, map_location="cpu") model = HeteroMap(**model_info["model_args"]) model.load_state_dict(model_info["model_state_dict"], strict=False) return model - def save(self, ckpt_dir): + def save(self, checkpoint): """Save the model state_dict and feature_tokenizer configuration - to the ``ckpt_dir``. + to the ``checkpoint``. Parameters ---------- - ckpt_dir: str + checkpoint: str the directory path to save. 
""" # save model weight state dict model_info = { "model_state_dict": self.state_dict(), - "model_args": self.model_args, - # "feature_tokenizer": self.feature_tokenizer, + "model_args": self.model_args } - torch.save(model_info, os.path.join(ckpt_dir, "model.bin")) + torch.save(model_info, checkpoint) def forward(self, x, y=None): # do positive sampling @@ -134,9 +132,12 @@ class HeteroMap(nn.Module): loss = self._self_supervised_contrastive_loss(feat_x_multiview) return loss - def hetero_mapping(self, rkme_spec: RKMETableSpecification, cols: List[str]) -> HeteroMapTableSpecification: + # def hetero_mapping(self, rkme_spec: RKMETableSpecification, features: dict) -> HeteroMapTableSpecification: + def hetero_mapping(self, rkme_spec: RKMETableSpecification, features: dict) -> HeteroMapTableSpecification: hetero_spec = HeteroMapTableSpecification() - hetero_input_df = pd.DataFrame(data=rkme_spec.get_z(), columns=cols) + data = rkme_spec.get_z() + cols = [features.get(str(i), "") for i in range(data.shape[1])] + hetero_input_df = pd.DataFrame(data=data, columns=cols) hetero_embedding = self._extract_batch_features(hetero_input_df) hetero_spec.generate_stat_spec_from_system(hetero_embedding, rkme_spec) return hetero_spec diff --git a/learnware/market/heterogeneous/organizer/hetero_map/trainer.py b/learnware/market/heterogeneous/organizer/hetero_map/trainer.py index 8208d00..f192b78 100644 --- a/learnware/market/heterogeneous/organizer/hetero_map/trainer.py +++ b/learnware/market/heterogeneous/organizer/hetero_map/trainer.py @@ -88,25 +88,8 @@ class Trainer: return final_train_loss def save_model(self, output_dir=None): - if output_dir is None: - logger.info("no path assigned for save mode, default saved to ./ckpt/model.pt !") - output_dir = self.output_dir - logger.info(f"saving model checkpoint to {output_dir}") self.model.save(output_dir) - # self.collate_fn.save(output_dir) - - if self.args is not None: - train_args = {} - for k, v in self.args.items(): - if 
isinstance(v, int) or isinstance(v, str) or isinstance(v, float): - train_args[k] = v - with open( - os.path.join(output_dir, "training_args.json"), - "w", - encoding="utf-8", - ) as f: - f.write(json.dumps(train_args)) def _create_optimizer(self): if self.optimizer is None: From 36e6e04451567a6ba455cd3e8934850b6b7ff136 Mon Sep 17 00:00:00 2001 From: Peng Tan Date: Mon, 13 Nov 2023 20:02:20 +0800 Subject: [PATCH 55/90] [DOC] add ref for transtab --- .../organizer/hetero_map/__init__.py | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/learnware/market/heterogeneous/organizer/hetero_map/__init__.py b/learnware/market/heterogeneous/organizer/hetero_map/__init__.py index 2316c88..9b22789 100644 --- a/learnware/market/heterogeneous/organizer/hetero_map/__init__.py +++ b/learnware/market/heterogeneous/organizer/hetero_map/__init__.py @@ -12,6 +12,37 @@ from .trainer import Trainer, TransTabCollatorForCL class HeteroMap(nn.Module): + """ + This class is based on 'TransTab' project as described in the paper + "TransTab: A flexible transferable tabular learning framework". The original project is available at + https://github.com/RyanWangZf/transtab and is licensed under the BSD 2-Clause License. + + Modifications: + - Simplified the original code to focus primarily on methods related to numerical features. + - Retained only the unsupervised training method. + - While the original paper and the TransTab framework utilized the module for final predictions, this version + is modified for feature extraction purposes only. + + The class implements a neural network module for processing tabular data, specifically tuned for numerical features. + + Args: + feature_tokenizer (FeatureTokenizer, optional): Tokenizer for feature representation. + hidden_dim (int, optional): Dimension of hidden layer. + num_layer (int, optional): Number of layers in the transformer encoder. + num_attention_head (int, optional): Number of attention heads in the transformer. 
+ hidden_dropout_prob (float, optional): Dropout probability for hidden layers. + ffn_dim (int, optional): Dimension of feedforward network. + projection_dim (int, optional): Dimension for projection head. + overlap_ratio (float, optional): Overlap ratio for tokenization. + num_partition (int, optional): Number of partitions for collation. + temperature (float, optional): Temperature parameter for contrastive learning. + base_temperature (float, optional): Base temperature parameter. + activation (str, optional): Activation function for transformer layers. + device (str, optional): Device to run the model on. + checkpoint (str, optional): Path to a pre-trained model checkpoint. + **kwargs: Additional keyword arguments. + """ + def __init__( self, feature_tokenizer=None, From beff99a1cb849f75bf468aee307dd2b53e3641f4 Mon Sep 17 00:00:00 2001 From: bxdd Date: Mon, 13 Nov 2023 20:20:30 +0800 Subject: [PATCH 56/90] [MNT] modifty market base class --- learnware/market/base.py | 41 +++++++++++++------ .../heterogeneous/organizer/__init__.py | 14 ++++--- 2 files changed, 36 insertions(+), 19 deletions(-) diff --git a/learnware/market/base.py b/learnware/market/base.py index 3ed15b8..eb72814 100644 --- a/learnware/market/base.py +++ b/learnware/market/base.py @@ -68,8 +68,7 @@ class LearnwareMarket: ): self.market_id = market_id self.learnware_organizer = BaseOrganizer() if organizer is None else organizer - self.learnware_organizer.reset(market_id=market_id) - self.learnware_organizer.reload_market(rebuild=rebuild) + self.learnware_organizer.reset(market_id=market_id, reload_kwargs={"rebuild": rebuild}) self.learnware_searcher = BaseSearcher() if searcher is None else searcher self.learnware_searcher.reset(organizer=self.learnware_organizer) @@ -77,9 +76,20 @@ class LearnwareMarket: self.learnware_checker = {"BaseChecker": BaseChecker()} else: self.learnware_checker = {checker.__class__.__name__: checker for checker in checker_list} - for name, checker in 
self.learnware_checker.items(): + for checker in self.learnware_checker.values(): checker.reset(organizer=self.learnware_organizer) + def reset(self, organizer_kwargs=None, searcher_kwargs=None, checker_kwargs=None, **kwargs): + organizer_kwargs = {} if organizer_kwargs is None else organizer_kwargs + searcher_kwargs = {} if searcher_kwargs is None else searcher_kwargs + checker_kwargs = {} if checker_kwargs is None else checker_kwargs + self.learnware_organizer.reset(**organizer_kwargs) + self.learnware_searcher.reset(**searcher_kwargs) + for checker in self.learnware_checker.values(): + checker.reset(**checker_kwargs) + for _k, _v in kwargs.items(): + setattr(self, _k, _v) + def reload_market(self, **kwargs) -> bool: self.learnware_organizer.reload_market(**kwargs) @@ -254,11 +264,14 @@ class LearnwareMarket: class BaseOrganizer: - def __init__(self, market_id=None): - self.reset(market_id=market_id) + def __init__(self, market_id=None, **kwargs): + self.reset(market_id=market_id, **kwargs) - def reset(self, market_id=None, **kwargs): - self.market_id = market_id + def reset(self, market_id: str = None, reload_kwargs: dict = None): + if market_id is not None: + self.market_id = market_id + if reload_kwargs is not None: + self.reload_market(**reload_kwargs) def reload_market(self, rebuild=False, **kwargs) -> bool: """Reload the learnware organizer when server restared. 
@@ -428,11 +441,12 @@ class BaseOrganizer: class BaseSearcher: - def __init__(self, organizer: BaseOrganizer = None): - self.learnware_organizer = organizer + def __init__(self, organizer: BaseOrganizer = None, **kwargs): + self.reset(organizer=organizer, **kwargs) - def reset(self, organizer): - self.learnware_organizer = organizer + def reset(self, organizer: BaseOrganizer = None, **kwargs): + if organizer is not None: + self.learnware_organizer = organizer def __call__(self, user_info: BaseUserInfo, check_status: int = None): """Search learnwares based on user_info from learnwares with check_status @@ -456,8 +470,9 @@ class BaseChecker: def __init__(self, organizer: BaseOrganizer = None): self.learnware_organizer = organizer - def reset(self, organizer): - self.learnware_organizer = organizer + def reset(self, organizer=None): + if organizer is not None: + self.learnware_organizer = organizer def __call__(self, learnware: Learnware) -> Tuple[int, str]: """Check the utility of a learnware diff --git a/learnware/market/heterogeneous/organizer/__init__.py b/learnware/market/heterogeneous/organizer/__init__.py index 4a44112..2f03fba 100644 --- a/learnware/market/heterogeneous/organizer/__init__.py +++ b/learnware/market/heterogeneous/organizer/__init__.py @@ -15,7 +15,7 @@ logger = get_module_logger("hetero_map_table_organizer") class HeteroMapTableOrganizer(EasyOrganizer): def reload_market(self, rebuild=False, auto_update=False, auto_update_limit=100): - super().reload_market(rebuild=rebuild) + super(HeteroMapTableOrganizer, self).reload_market(rebuild=rebuild) self.auto_update = auto_update self.auto_update_limit = auto_update_limit self.count_down = auto_update_limit @@ -55,7 +55,9 @@ class HeteroMapTableOrganizer(EasyOrganizer): def add_learnware( self, zip_path: str, semantic_spec: dict, check_status: int, learnware_id: str = None ) -> Tuple[str, int]: - learnware_id, learnwere_status = super().add_learnware(zip_path, semantic_spec, check_status, learnware_id) 
+ learnware_id, learnwere_status = super(HeteroMapTableOrganizer, self).add_learnware( + zip_path, semantic_spec, check_status, learnware_id + ) if learnwere_status == BaseChecker.USABLE_LEARWARE and len(self._get_hetero_learnware_ids(learnware_id)): self._update_learnware_by_ids(learnware_id) @@ -76,13 +78,13 @@ class HeteroMapTableOrganizer(EasyOrganizer): ) self.market_mapping = updated_market_mapping self._update_learnware_by_ids(training_learnware_ids) - + self.count_down = self.auto_update_limit return learnware_id, learnwere_status def delete_learnware(self, id: str) -> bool: - flag = super().delete_learnware(id) + flag = super(HeteroMapTableOrganizer, self).delete_learnware(id) if flag: hetero_spec_path = os.path.join(self.hetero_specs_path, f"{id}.json") try: @@ -92,13 +94,13 @@ class HeteroMapTableOrganizer(EasyOrganizer): return flag def update_learnware(self, id: str, zip_path: str = None, semantic_spec: dict = None, check_status: int = None): - final_status = super().update_learnware(id, zip_path, semantic_spec, check_status) + final_status = super(HeteroMapTableOrganizer, self).update_learnware(id, zip_path, semantic_spec, check_status) if final_status == BaseChecker.USABLE_LEARWARE and len(self._get_hetero_learnware_ids(id)): self._update_learnware_by_ids(id) return final_status def reload_learnware(self, learnware_id: str): - super().reload_learnware(learnware_id) + super(HeteroMapTableOrganizer, self).reload_learnware(learnware_id) try: hetero_spec_path = os.path.join(self.hetero_specs_path, f"{learnware_id}.json") if os.path.exists(hetero_spec_path): From 579daf993c2fb262a67ee722df22801cee3fb06a Mon Sep 17 00:00:00 2001 From: bxdd Date: Mon, 13 Nov 2023 20:28:45 +0800 Subject: [PATCH 57/90] [MNT] make organizer and searcher necessary in market --- learnware/market/base.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/learnware/market/base.py b/learnware/market/base.py index eb72814..d761f28 100644 --- 
a/learnware/market/base.py +++ b/learnware/market/base.py @@ -60,22 +60,20 @@ class LearnwareMarket: def __init__( self, - market_id: str = "default", - organizer: BaseOrganizer = None, - searcher: BaseSearcher = None, + market_id: str, + organizer: BaseOrganizer, + searcher: BaseSearcher, checker_list: List[BaseChecker] = None, rebuild=False, ): self.market_id = market_id - self.learnware_organizer = BaseOrganizer() if organizer is None else organizer + self.learnware_organizer = organizer self.learnware_organizer.reset(market_id=market_id, reload_kwargs={"rebuild": rebuild}) - self.learnware_searcher = BaseSearcher() if searcher is None else searcher + self.learnware_searcher = searcher self.learnware_searcher.reset(organizer=self.learnware_organizer) + checker_list = [] if checker_list is None else checker_list + self.learnware_checker = {checker.__class__.__name__: checker for checker in checker_list} - if checker_list is None: - self.learnware_checker = {"BaseChecker": BaseChecker()} - else: - self.learnware_checker = {checker.__class__.__name__: checker for checker in checker_list} for checker in self.learnware_checker.values(): checker.reset(organizer=self.learnware_organizer) From eced92b6190180da9e0f9b807eead44d08ad6178 Mon Sep 17 00:00:00 2001 From: Peng Tan Date: Mon, 13 Nov 2023 20:58:56 +0800 Subject: [PATCH 58/90] [DOC] add readme for transtab --- .../heterogeneous/organizer/hetero_map/README.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 learnware/market/heterogeneous/organizer/hetero_map/README.md diff --git a/learnware/market/heterogeneous/organizer/hetero_map/README.md b/learnware/market/heterogeneous/organizer/hetero_map/README.md new file mode 100644 index 0000000..fdfbfc6 --- /dev/null +++ b/learnware/market/heterogeneous/organizer/hetero_map/README.md @@ -0,0 +1,15 @@ +# README + +## Overview + +This package contains code modified from the paper "TransTab: A Flexible Transferable Tabular Learning Framework." 
The original project, available at [TransTab GitHub Repository](https://github.com/RyanWangZf/transtab), is under the BSD 2-Clause license. The code here has been modified to focus specifically on numerical features, retaining only methods relevant to these features. The training approach is limited to unsupervised training, differing from the original paper's usage of TransTab for final predictions. Instead, this code is utilized primarily for feature extraction. + +## Contents + +## Handling heterogeneous learnwares + +The code is used for finding a unified specification space for learnwares generated from table data with heterogeneous feature spaces and assigning new specifications. When the market receives some leanrwares, it utilize existing learnware specifications to train an engine. This engine integrates the specifications from various spaces into a unified "specification world", assigning new market-specific specifications to the learnware. As more learnwares are uploaded, the engine continuously updates, refining the specification world and updating the specifications of the learnware. + +## License + +This part, based on the TransTab framework, adheres to the BSD 2-Clause license. \ No newline at end of file From d86296b51a80514129cdae36a13e0f8c58c93f1c Mon Sep 17 00:00:00 2001 From: Peng Tan Date: Mon, 13 Nov 2023 21:08:22 +0800 Subject: [PATCH 59/90] [MNT] modify the doc --- .../market/heterogeneous/organizer/hetero_map/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/learnware/market/heterogeneous/organizer/hetero_map/README.md b/learnware/market/heterogeneous/organizer/hetero_map/README.md index fdfbfc6..429a0c0 100644 --- a/learnware/market/heterogeneous/organizer/hetero_map/README.md +++ b/learnware/market/heterogeneous/organizer/hetero_map/README.md @@ -2,14 +2,14 @@ ## Overview -This package contains code modified from the paper "TransTab: A Flexible Transferable Tabular Learning Framework." 
The original project, available at [TransTab GitHub Repository](https://github.com/RyanWangZf/transtab), is under the BSD 2-Clause license. The code here has been modified to focus specifically on numerical features, retaining only methods relevant to these features. The training approach is limited to unsupervised training, differing from the original paper's usage of TransTab for final predictions. Instead, this code is utilized primarily for feature extraction. +This package contains code modified from the paper "TransTab: A Flexible Transferable Tabular Learning Framework." The original project, available at [TransTab GitHub Repository](https://github.com/RyanWangZf/transtab), is under the BSD 2-Clause license. The code here has been modified to focus specifically on numerical features, retaining only methods relevant to these features. The training approach is limited to unsupervised training. Differing from the original paper's usage of TransTab for final predictions, this code is utilized for feature extraction. ## Contents ## Handling heterogeneous learnwares -The code is used for finding a unified specification space for learnwares generated from table data with heterogeneous feature spaces and assigning new specifications. When the market receives some leanrwares, it utilize existing learnware specifications to train an engine. This engine integrates the specifications from various spaces into a unified "specification world", assigning new market-specific specifications to the learnware. As more learnwares are uploaded, the engine continuously updates, refining the specification world and updating the specifications of the learnware. +The code is used for finding a unified specification space for learnwares generated from table data with heterogeneous feature spaces and assigning new specifications accordingly. When the market receives some leanrwares, it utilize existing learnware specifications to train an engine. 
This engine integrates the specifications from various spaces into a unified "specification world", assigning new market-specific specifications to the learnware. As more learnwares are uploaded, the engine continuously updates, refining the specification world and updating the specifications of the learnware. ## License -This part, based on the TransTab framework, adheres to the BSD 2-Clause license. \ No newline at end of file +The hetero_map package, based on the TransTab project, adheres to the BSD 2-Clause license. \ No newline at end of file From 2c918d2af35e5bb1e13ba52d576305dd54a28ba7 Mon Sep 17 00:00:00 2001 From: liuht Date: Mon, 13 Nov 2023 22:14:24 +0800 Subject: [PATCH 60/90] [MNT] hetero_map doc contents --- learnware/market/heterogeneous/organizer/hetero_map/README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/learnware/market/heterogeneous/organizer/hetero_map/README.md b/learnware/market/heterogeneous/organizer/hetero_map/README.md index 429a0c0..28d2a02 100644 --- a/learnware/market/heterogeneous/organizer/hetero_map/README.md +++ b/learnware/market/heterogeneous/organizer/hetero_map/README.md @@ -6,6 +6,10 @@ This package contains code modified from the paper "TransTab: A Flexible Transfe ## Contents +- `__init__.py`: The `__init__.py` file defines the HeteroMap class, which forms the main network structure of the market engine. It includes methods for handling heterogeneous tabular data, focusing on mapping data from diverse feature spaces into a unified "specification world". +- `trainer.py`: The `trainer.py` file focuses on the unsupervised training process of the market engine. It defines the `TransTabCollatorForCL` class, which builds positive and negative samples from tabular vertical partitions for unsupervised learning. +- `feature_extractor.py`: This file encompasses NLP-related functionalities crucial for processing and understanding table data features. 
It includes classes like `WordEmbedding`, which encodes tokens from column names into word embeddings, and `FeatureTokenizer`, which prepares tables for processing by the market engine. + ## Handling heterogeneous learnwares The code is used for finding a unified specification space for learnwares generated from table data with heterogeneous feature spaces and assigning new specifications accordingly. When the market receives some leanrwares, it utilize existing learnware specifications to train an engine. This engine integrates the specifications from various spaces into a unified "specification world", assigning new market-specific specifications to the learnware. As more learnwares are uploaded, the engine continuously updates, refining the specification world and updating the specifications of the learnware. From eca3c0924986c49ecc5831e03da1765286f5b3dc Mon Sep 17 00:00:00 2001 From: liuht Date: Mon, 13 Nov 2023 22:15:12 +0800 Subject: [PATCH 61/90] [FIX] modify hetero_map doc contents --- learnware/market/heterogeneous/organizer/hetero_map/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/learnware/market/heterogeneous/organizer/hetero_map/README.md b/learnware/market/heterogeneous/organizer/hetero_map/README.md index 28d2a02..d2ab834 100644 --- a/learnware/market/heterogeneous/organizer/hetero_map/README.md +++ b/learnware/market/heterogeneous/organizer/hetero_map/README.md @@ -6,7 +6,7 @@ This package contains code modified from the paper "TransTab: A Flexible Transfe ## Contents -- `__init__.py`: The `__init__.py` file defines the HeteroMap class, which forms the main network structure of the market engine. It includes methods for handling heterogeneous tabular data, focusing on mapping data from diverse feature spaces into a unified "specification world". +- `__init__.py`: The `__init__.py` file defines the `HeteroMap` class, which forms the main network structure of the market engine. 
It includes methods for handling heterogeneous tabular data, focusing on mapping data from diverse feature spaces into a unified "specification world". - `trainer.py`: The `trainer.py` file focuses on the unsupervised training process of the market engine. It defines the `TransTabCollatorForCL` class, which builds positive and negative samples from tabular vertical partitions for unsupervised learning. - `feature_extractor.py`: This file encompasses NLP-related functionalities crucial for processing and understanding table data features. It includes classes like `WordEmbedding`, which encodes tokens from column names into word embeddings, and `FeatureTokenizer`, which prepares tables for processing by the market engine. From f3f2f2f70748b3cf10377113e24d6d1c3b65a2d8 Mon Sep 17 00:00:00 2001 From: Peng Tan Date: Mon, 13 Nov 2023 22:26:51 +0800 Subject: [PATCH 62/90] [DOC] modify details for hetero readme --- learnware/market/heterogeneous/organizer/hetero_map/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/learnware/market/heterogeneous/organizer/hetero_map/README.md b/learnware/market/heterogeneous/organizer/hetero_map/README.md index d2ab834..73c8687 100644 --- a/learnware/market/heterogeneous/organizer/hetero_map/README.md +++ b/learnware/market/heterogeneous/organizer/hetero_map/README.md @@ -7,8 +7,8 @@ This package contains code modified from the paper "TransTab: A Flexible Transfe ## Contents - `__init__.py`: The `__init__.py` file defines the `HeteroMap` class, which forms the main network structure of the market engine. It includes methods for handling heterogeneous tabular data, focusing on mapping data from diverse feature spaces into a unified "specification world". -- `trainer.py`: The `trainer.py` file focuses on the unsupervised training process of the market engine. It defines the `TransTabCollatorForCL` class, which builds positive and negative samples from tabular vertical partitions for unsupervised learning. 
-- `feature_extractor.py`: This file encompasses NLP-related functionalities crucial for processing and understanding table data features. It includes classes like `WordEmbedding`, which encodes tokens from column names into word embeddings, and `FeatureTokenizer`, which prepares tables for processing by the market engine. +- `trainer.py`: The `trainer.py` file focuses on the unsupervised training process of the market engine. The `TransTabCollatorForCL` class is used for generating positive and negative samples from tabular vertical partitions for unsupervised learning. +- `feature_extractor.py`: This file is utilized for the purpose of tokenizing feature descriptions and transforming them into word embeddings. ## Handling heterogeneous learnwares From 3e815f24242436ffad3b66bfb84f3aad0c48c9e7 Mon Sep 17 00:00:00 2001 From: bxdd Date: Mon, 13 Nov 2023 22:48:29 +0800 Subject: [PATCH 63/90] [MNT] update reset method in LearnwareMarket --- learnware/market/base.py | 40 +++++++++++++++++++++++++++----------- learnware/market/module.py | 13 ++++++++++++- 2 files changed, 41 insertions(+), 12 deletions(-) diff --git a/learnware/market/base.py b/learnware/market/base.py index d761f28..3e5554c 100644 --- a/learnware/market/base.py +++ b/learnware/market/base.py @@ -65,10 +65,14 @@ class LearnwareMarket: searcher: BaseSearcher, checker_list: List[BaseChecker] = None, rebuild=False, + organizer_kwargs=None, + searcher_kwargs=None, + checker_kwargs=None, + **kwargs, ): self.market_id = market_id self.learnware_organizer = organizer - self.learnware_organizer.reset(market_id=market_id, reload_kwargs={"rebuild": rebuild}) + self.learnware_organizer.reset(market_id=market_id) self.learnware_searcher = searcher self.learnware_searcher.reset(organizer=self.learnware_organizer) checker_list = [] if checker_list is None else checker_list @@ -77,14 +81,28 @@ class LearnwareMarket: for checker in self.learnware_checker.values(): checker.reset(organizer=self.learnware_organizer) + 
self.reset( + organizer_kwargs={"rebuild": rebuild, **organizer_kwargs}, + searcher_kwargs=searcher_kwargs, + checker_kwargs=checker_kwargs, + **kwargs, + ) + def reset(self, organizer_kwargs=None, searcher_kwargs=None, checker_kwargs=None, **kwargs): - organizer_kwargs = {} if organizer_kwargs is None else organizer_kwargs - searcher_kwargs = {} if searcher_kwargs is None else searcher_kwargs - checker_kwargs = {} if checker_kwargs is None else checker_kwargs - self.learnware_organizer.reset(**organizer_kwargs) - self.learnware_searcher.reset(**searcher_kwargs) - for checker in self.learnware_checker.values(): - checker.reset(**checker_kwargs) + if organizer_kwargs is not None: + self.learnware_organizer.reset(**organizer_kwargs) + + if searcher_kwargs is not None: + self.learnware_searcher.reset(**searcher_kwargs) + + if checker_kwargs is not None: + if len(set(checker_kwargs) & set(self.learnware_checker)): + for name, checker in self.learnware_checker.items(): + checker.reset(**checker_kwargs.get(name, {})) + else: + for checker in self.learnware_checker.values(): + checker.reset(**checker_kwargs) + for _k, _v in kwargs.items(): setattr(self, _k, _v) @@ -265,11 +283,11 @@ class BaseOrganizer: def __init__(self, market_id=None, **kwargs): self.reset(market_id=market_id, **kwargs) - def reset(self, market_id: str = None, reload_kwargs: dict = None): + def reset(self, market_id: str = None, rebuild=None, **kwargs): if market_id is not None: self.market_id = market_id - if reload_kwargs is not None: - self.reload_market(**reload_kwargs) + if rebuild is not None: + self.reload_market(rebuild=rebuild, **kwargs) def reload_market(self, rebuild=False, **kwargs) -> bool: """Reload the learnware organizer when server restared. 
diff --git a/learnware/market/module.py b/learnware/market/module.py index d1cd304..410347d 100644 --- a/learnware/market/module.py +++ b/learnware/market/module.py @@ -19,12 +19,23 @@ def get_market_config(): return market_config -def instantiate_learnware_market(market_id="default", name="easy", **kwargs): +def instantiate_learnware_market( + market_id="default", + name="easy", + rebuild=False, + organizer_kwargs=None, + searcher_kwargs=None, + checker_kwargs=None, + **kwargs +): market_config = get_market_config() return LearnwareMarket( market_id=market_id, organizer=market_config[name]["organizer"], searcher=market_config[name]["searcher"], checker_list=market_config[name]["checker_list"], + organizer_kwargs=organizer_kwargs, + searcher_kwargs=searcher_kwargs, + checker_kwargs=checker_kwargs, **kwargs ) From 4314194ccfb6fb248c34153427cfb66888d3b005 Mon Sep 17 00:00:00 2001 From: liuht Date: Tue, 14 Nov 2023 12:12:03 +0800 Subject: [PATCH 64/90] [FIX] fix details in user_info --- tests/test_hetero_market/test_hetero.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/test_hetero_market/test_hetero.py b/tests/test_hetero_market/test_hetero.py index e4a3d04..aa3c7a0 100644 --- a/tests/test_hetero_market/test_hetero.py +++ b/tests/test_hetero_market/test_hetero.py @@ -251,7 +251,7 @@ class TestMarket(unittest.TestCase): semantic_spec["Input"]["Dimension"] = user_dim # keep only the first user_dim descriptions semantic_spec["Input"]["Description"] = { - key: semantic_spec["Input"]["Description"][str(key)] for key in range(user_dim) + str(key): semantic_spec["Input"]["Description"][str(key)] for key in range(user_dim) } user_info = BaseUserInfo(semantic_spec=semantic_spec, stat_info={"RKMETableSpecification": user_spec}) @@ -303,7 +303,7 @@ class TestMarket(unittest.TestCase): semantic_spec["Input"] = copy.deepcopy(input_description_list[idx % 2]) semantic_spec["Input"]["Dimension"] = user_dim - 2 
semantic_spec["Input"]["Description"] = { - key: semantic_spec["Input"]["Description"][str(key)] for key in range(user_dim) + "key": semantic_spec["Input"]["Description"][str(key)] for key in range(user_dim) } user_info = BaseUserInfo(semantic_spec=semantic_spec, stat_info={"RKMETableSpecification": user_spec}) @@ -407,13 +407,13 @@ class TestMarket(unittest.TestCase): def suite(): _suite = unittest.TestSuite() - _suite.addTest(TestMarket("test_prepare_learnware_randomly")) - _suite.addTest(TestMarket("test_generated_learnwares")) - _suite.addTest(TestMarket("test_upload_delete_learnware")) - _suite.addTest(TestMarket("test_train_market_model")) - _suite.addTest(TestMarket("test_search_semantics")) + # _suite.addTest(TestMarket("test_prepare_learnware_randomly")) + # _suite.addTest(TestMarket("test_generated_learnwares")) + # _suite.addTest(TestMarket("test_upload_delete_learnware")) + # _suite.addTest(TestMarket("test_train_market_model")) + # _suite.addTest(TestMarket("test_search_semantics")) _suite.addTest(TestMarket("test_stat_search")) - _suite.addTest(TestMarket("test_model_reuse")) + # _suite.addTest(TestMarket("test_model_reuse")) return _suite From 924655671065f73fee0e0682fde746de37890b61 Mon Sep 17 00:00:00 2001 From: liuht Date: Tue, 14 Nov 2023 12:15:07 +0800 Subject: [PATCH 65/90] [MNT | FIX] add cache-dir in feature_tokenizer --- learnware/market/heterogeneous/organizer/__init__.py | 4 +++- .../market/heterogeneous/organizer/hetero_map/__init__.py | 5 +++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/learnware/market/heterogeneous/organizer/__init__.py b/learnware/market/heterogeneous/organizer/__init__.py index 5a0418e..f5dfd4e 100644 --- a/learnware/market/heterogeneous/organizer/__init__.py +++ b/learnware/market/heterogeneous/organizer/__init__.py @@ -24,6 +24,7 @@ class HeteroMapTableOrganizer(EasyOrganizer): os.makedirs(hetero_folder_path, exist_ok=True) self.market_mapping_path = os.path.join(hetero_folder_path, "model.bin") 
self.hetero_specs_path = os.path.join(hetero_folder_path, "hetero_specifications") + self.training_args = {"cache_dir": hetero_folder_path} os.makedirs(self.hetero_specs_path, exist_ok=True) if os.path.exists(self.market_mapping_path): @@ -51,7 +52,8 @@ class HeteroMapTableOrganizer(EasyOrganizer): self.auto_update = auto_update self.auto_update_limit = auto_update_limit self.count_down = auto_update_limit - self.training_args = training_args + if hasattr(self, 'training_args'): + self.training_args.update(training_args) def add_learnware( self, zip_path: str, semantic_spec: dict, check_status: int, learnware_id: str = None diff --git a/learnware/market/heterogeneous/organizer/hetero_map/__init__.py b/learnware/market/heterogeneous/organizer/hetero_map/__init__.py index 9b22789..3c63580 100644 --- a/learnware/market/heterogeneous/organizer/hetero_map/__init__.py +++ b/learnware/market/heterogeneous/organizer/hetero_map/__init__.py @@ -59,6 +59,7 @@ class HeteroMap(nn.Module): activation="relu", device="cuda:0", checkpoint=None, + cache_dir=None, **kwargs, ): super(HeteroMap, self).__init__() @@ -73,11 +74,12 @@ class HeteroMap(nn.Module): "ffn_dim": ffn_dim, "projection_dim": projection_dim, "activation": activation, + "cache_dir": cache_dir } self.model_args.update(kwargs) if feature_tokenizer is None: - feature_tokenizer = FeatureTokenizer(**kwargs) + feature_tokenizer = FeatureTokenizer(cache_dir=cache_dir, **kwargs) self.feature_tokenizer = feature_tokenizer @@ -163,7 +165,6 @@ class HeteroMap(nn.Module): loss = self._self_supervised_contrastive_loss(feat_x_multiview) return loss - # def hetero_mapping(self, rkme_spec: RKMETableSpecification, features: dict) -> HeteroMapTableSpecification: def hetero_mapping(self, rkme_spec: RKMETableSpecification, features: dict) -> HeteroMapTableSpecification: hetero_spec = HeteroMapTableSpecification() data = rkme_spec.get_z() From 614f8edcf5b4ecb1aed803187227e33dc2367ce5 Mon Sep 17 00:00:00 2001 From: liuht Date: Tue, 14 
Nov 2023 13:57:10 +0800 Subject: [PATCH 66/90] [MNT] further add cache_dir --- learnware/market/heterogeneous/organizer/__init__.py | 2 +- .../organizer/hetero_map/feature_extractor.py | 7 +++---- tests/test_hetero_market/test_hetero.py | 4 ++-- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/learnware/market/heterogeneous/organizer/__init__.py b/learnware/market/heterogeneous/organizer/__init__.py index f5dfd4e..76879bb 100644 --- a/learnware/market/heterogeneous/organizer/__init__.py +++ b/learnware/market/heterogeneous/organizer/__init__.py @@ -45,7 +45,7 @@ class HeteroMapTableOrganizer(EasyOrganizer): self._update_learnware_by_ids(self.get_learnware_ids(check_status=BaseChecker.USABLE_LEARWARE)) else: logger.warning(f"No market mapping to reload!") - self.market_mapping = HeteroMap() + self.market_mapping = HeteroMap(cache_dir=hetero_folder_path) def reset(self, market_id=None, auto_update=False, auto_update_limit=100, **training_args): self.market_id = market_id diff --git a/learnware/market/heterogeneous/organizer/hetero_map/feature_extractor.py b/learnware/market/heterogeneous/organizer/hetero_map/feature_extractor.py index e47d702..89105c0 100644 --- a/learnware/market/heterogeneous/organizer/hetero_map/feature_extractor.py +++ b/learnware/market/heterogeneous/organizer/hetero_map/feature_extractor.py @@ -65,12 +65,13 @@ class FeatureTokenizer: def __init__( self, disable_tokenizer_parallel=True, + cache_dir=None, **kwargs, ): """args: disable_tokenizer_parallel: true if use extractor for collator function in torch.DataLoader """ - self.tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased") + self.tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased", cache_dir=cache_dir) self.tokenizer.__dict__["model_max_length"] = 512 if disable_tokenizer_parallel: # disable tokenizer parallel os.environ["TOKENIZERS_PARALLELISM"] = "false" @@ -96,9 +97,7 @@ class FeatureTokenizer: """ encoded_inputs = { "x_num": None, - 
"num_col_input_ids": None, - "x_cat_input_ids": None, - "x_bin_input_ids": None, + "num_col_input_ids": None } num_cols = x.columns.tolist() if not shuffle else np.random.shuffle(x.columns.tolist()) x_num = x[num_cols].fillna(0) diff --git a/tests/test_hetero_market/test_hetero.py b/tests/test_hetero_market/test_hetero.py index aa3c7a0..3a55ca7 100644 --- a/tests/test_hetero_market/test_hetero.py +++ b/tests/test_hetero_market/test_hetero.py @@ -164,7 +164,7 @@ class TestMarket(unittest.TestCase): hetero_market = self._init_learnware_market() self.test_prepare_learnware_randomly(learnware_num) self.learnware_num = learnware_num - hetero_market.learnware_organizer.reset(auto_update=True, auto_update_limit=learnware_num) + hetero_market.learnware_organizer.reset(auto_update=False, auto_update_limit=learnware_num) print("Total Item:", len(hetero_market)) assert len(hetero_market) == 0, f"The market should be empty!" @@ -303,7 +303,7 @@ class TestMarket(unittest.TestCase): semantic_spec["Input"] = copy.deepcopy(input_description_list[idx % 2]) semantic_spec["Input"]["Dimension"] = user_dim - 2 semantic_spec["Input"]["Description"] = { - "key": semantic_spec["Input"]["Description"][str(key)] for key in range(user_dim) + str(key): semantic_spec["Input"]["Description"][str(key)] for key in range(user_dim) } user_info = BaseUserInfo(semantic_spec=semantic_spec, stat_info={"RKMETableSpecification": user_spec}) From 04fb184056e90a7167e84c4fff288f24c4f5e27e Mon Sep 17 00:00:00 2001 From: bxdd Date: Tue, 14 Nov 2023 14:12:41 +0800 Subject: [PATCH 67/90] [MNT] refactor instantiate market method --- learnware/market/base.py | 52 +++++++-------- .../heterogeneous/organizer/__init__.py | 10 ++- learnware/market/module.py | 63 +++++++++++-------- 3 files changed, 63 insertions(+), 62 deletions(-) diff --git a/learnware/market/base.py b/learnware/market/base.py index 3e5554c..ddf3d9a 100644 --- a/learnware/market/base.py +++ b/learnware/market/base.py @@ -60,33 +60,22 @@ class 
LearnwareMarket: def __init__( self, - market_id: str, organizer: BaseOrganizer, searcher: BaseSearcher, checker_list: List[BaseChecker] = None, - rebuild=False, - organizer_kwargs=None, - searcher_kwargs=None, - checker_kwargs=None, **kwargs, ): - self.market_id = market_id self.learnware_organizer = organizer - self.learnware_organizer.reset(market_id=market_id) self.learnware_searcher = searcher - self.learnware_searcher.reset(organizer=self.learnware_organizer) checker_list = [] if checker_list is None else checker_list self.learnware_checker = {checker.__class__.__name__: checker for checker in checker_list} for checker in self.learnware_checker.values(): checker.reset(organizer=self.learnware_organizer) - self.reset( - organizer_kwargs={"rebuild": rebuild, **organizer_kwargs}, - searcher_kwargs=searcher_kwargs, - checker_kwargs=checker_kwargs, - **kwargs, - ) + @property + def market_id(self): + return self.learnware_organizer.market_id def reset(self, organizer_kwargs=None, searcher_kwargs=None, checker_kwargs=None, **kwargs): if organizer_kwargs is not None: @@ -280,14 +269,12 @@ class LearnwareMarket: class BaseOrganizer: - def __init__(self, market_id=None, **kwargs): + def __init__(self, market_id, **kwargs): self.reset(market_id=market_id, **kwargs) - def reset(self, market_id: str = None, rebuild=None, **kwargs): - if market_id is not None: - self.market_id = market_id - if rebuild is not None: - self.reload_market(rebuild=rebuild, **kwargs) + def reset(self, market_id, rebuild=False, **kwargs): + self.market_id = market_id + self.reload_market(rebuild=rebuild, **kwargs) def reload_market(self, rebuild=False, **kwargs) -> bool: """Reload the learnware organizer when server restared. 
@@ -457,12 +444,11 @@ class BaseOrganizer: class BaseSearcher: - def __init__(self, organizer: BaseOrganizer = None, **kwargs): + def __init__(self, organizer: BaseOrganizer, **kwargs): self.reset(organizer=organizer, **kwargs) - def reset(self, organizer: BaseOrganizer = None, **kwargs): - if organizer is not None: - self.learnware_organizer = organizer + def reset(self, organizer: BaseOrganizer, **kwargs): + self.learnware_organizer = organizer def __call__(self, user_info: BaseUserInfo, check_status: int = None): """Search learnwares based on user_info from learnwares with check_status @@ -483,12 +469,8 @@ class BaseChecker: NONUSABLE_LEARNWARE = 0 USABLE_LEARWARE = 1 - def __init__(self, organizer: BaseOrganizer = None): - self.learnware_organizer = organizer - - def reset(self, organizer=None): - if organizer is not None: - self.learnware_organizer = organizer + def reset(self, **kwargs): + pass def __call__(self, learnware: Learnware) -> Tuple[int, str]: """Check the utility of a learnware @@ -511,3 +493,13 @@ class BaseChecker: """ raise NotImplementedError("'__call__' method is not implemented in BaseChecker") + + +class OrganizerRelatedChecker(BaseChecker): + """Here this is the interface for checker who is related to the organizer""" + + def __init__(self, organizer: BaseOrganizer, **kwargs): + self.reset(organizer=organizer, **kwargs) + + def reset(self, organizer: BaseOrganizer, **kwargs): + self.learnware_organizer = organizer diff --git a/learnware/market/heterogeneous/organizer/__init__.py b/learnware/market/heterogeneous/organizer/__init__.py index 2f03fba..afe8fb9 100644 --- a/learnware/market/heterogeneous/organizer/__init__.py +++ b/learnware/market/heterogeneous/organizer/__init__.py @@ -14,11 +14,8 @@ logger = get_module_logger("hetero_map_table_organizer") class HeteroMapTableOrganizer(EasyOrganizer): - def reload_market(self, rebuild=False, auto_update=False, auto_update_limit=100): + def reload_market(self, rebuild=False): 
super(HeteroMapTableOrganizer, self).reload_market(rebuild=rebuild) - self.auto_update = auto_update - self.auto_update_limit = auto_update_limit - self.count_down = auto_update_limit hetero_folder_path = os.path.join(self.market_store_path, "hetero") os.makedirs(hetero_folder_path, exist_ok=True) @@ -46,10 +43,11 @@ class HeteroMapTableOrganizer(EasyOrganizer): logger.warning(f"No market mapping to reload!") self.market_mapping = HeteroMap() - def reset(self, market_id=None, auto_update=False, auto_update_limit=100, **training_args): - self.market_id = market_id + def reset(self, market_id=None, rebuild=None, auto_update=False, auto_update_limit=100, **training_args): + super(HeteroMapTableOrganizer, self).reset(market_id, rebuild) self.auto_update = auto_update self.auto_update_limit = auto_update_limit + self.count_down = auto_update_limit self.training_args = training_args def add_learnware( diff --git a/learnware/market/module.py b/learnware/market/module.py index 410347d..97f49be 100644 --- a/learnware/market/module.py +++ b/learnware/market/module.py @@ -3,39 +3,50 @@ from .easy import EasyOrganizer, EasySearcher, EasySemanticChecker, EasyStatChec from .heterogeneous import HeteroMapTableOrganizer, HeteroSearcher -def get_market_config(): - market_config = { - "easy": { - "organizer": EasyOrganizer(), - "searcher": EasySearcher(), - "checker_list": [EasySemanticChecker(), EasyStatChecker()], - }, - "hetero": { - "organizer": HeteroMapTableOrganizer(), - "searcher": HeteroSearcher(), - "checker_list": [EasySemanticChecker(), EasyStatChecker()], - }, - } - return market_config +def get_market_component(name, market_id, rebuild, organizer_kwargs=None, searcher_kwargs=None, checker_kwargs=None): + organizer_kwargs = {} if organizer_kwargs is None else organizer_kwargs + searcher_kwargs = {} if searcher_kwargs is None else searcher_kwargs + checker_kwargs = {} if checker_kwargs is None else checker_kwargs + + if name == "easy": + easy_organizer = 
EasyOrganizer(market_id=market_id, rebuild=rebuild) + easy_searcher = EasySearcher(organizer=easy_organizer) + easy_checker_list = [EasySemanticChecker(), EasyStatChecker()] + market_component = { + "organizer": easy_organizer, + "searcher": easy_searcher, + "checker_list": easy_checker_list, + } + elif name == "hetero": + hetero_organizer = HeteroMapTableOrganizer(market_id=market_id, rebuild=rebuild, **organizer_kwargs) + hetero_searcher = HeteroSearcher(organizer=hetero_organizer) + hetero_checker_list = [EasySemanticChecker(), EasyStatChecker()] + + market_component = { + "organizer": hetero_organizer, + "searcher": hetero_searcher, + "checker_list": hetero_checker_list, + } + else: + raise ValueError(f"name {name} is not supported for market") + + return market_component def instantiate_learnware_market( market_id="default", name="easy", rebuild=False, - organizer_kwargs=None, - searcher_kwargs=None, - checker_kwargs=None, - **kwargs + organizer_kwargs: dict = None, + searcher_kwargs: dict = None, + checker_kwargs: dict = None, + **kwargs, ): - market_config = get_market_config() + market_componets = get_market_component(name, market_id, rebuild, organizer_kwargs, searcher_kwargs, checker_kwargs) return LearnwareMarket( market_id=market_id, - organizer=market_config[name]["organizer"], - searcher=market_config[name]["searcher"], - checker_list=market_config[name]["checker_list"], - organizer_kwargs=organizer_kwargs, - searcher_kwargs=searcher_kwargs, - checker_kwargs=checker_kwargs, - **kwargs + organizer=market_componets["organizer"], + searcher=market_componets["searcher"], + checker_list=market_componets["checker_list"], + **kwargs, ) From 7840b96905b8077d2093d93c27295071437031a7 Mon Sep 17 00:00:00 2001 From: bxdd Date: Tue, 14 Nov 2023 14:14:20 +0800 Subject: [PATCH 68/90] [MNT] make hetero organizer fitting the market base class --- learnware/market/heterogeneous/organizer/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/learnware/market/heterogeneous/organizer/__init__.py b/learnware/market/heterogeneous/organizer/__init__.py index afe8fb9..fda5347 100644 --- a/learnware/market/heterogeneous/organizer/__init__.py +++ b/learnware/market/heterogeneous/organizer/__init__.py @@ -43,7 +43,7 @@ class HeteroMapTableOrganizer(EasyOrganizer): logger.warning(f"No market mapping to reload!") self.market_mapping = HeteroMap() - def reset(self, market_id=None, rebuild=None, auto_update=False, auto_update_limit=100, **training_args): + def reset(self, market_id, rebuild=False, auto_update=False, auto_update_limit=100, **training_args): super(HeteroMapTableOrganizer, self).reset(market_id, rebuild) self.auto_update = auto_update self.auto_update_limit = auto_update_limit From 4d417669c510e525c8f4ef902070d35392b672ca Mon Sep 17 00:00:00 2001 From: bxdd Date: Tue, 14 Nov 2023 14:35:05 +0800 Subject: [PATCH 69/90] [FIX | MNT] fix bugs in easy seacher, update test to fit with current interface --- learnware/market/easy/searcher.py | 4 ++-- tests/test_hetero_market/test_hetero.py | 23 +++++++++++++---------- 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/learnware/market/easy/searcher.py b/learnware/market/easy/searcher.py index 8de00af..fc04297 100644 --- a/learnware/market/easy/searcher.py +++ b/learnware/market/easy/searcher.py @@ -591,10 +591,10 @@ class EasyStatSearcher(BaseSearcher): class EasySearcher(BaseSearcher): - def __init__(self, organizer: EasyOrganizer = None): - super(EasySearcher, self).__init__(organizer) + def __init__(self, organizer: EasyOrganizer): self.semantic_searcher = EasyFuzzSemanticSearcher(organizer) self.stat_searcher = EasyStatSearcher(organizer) + super(EasySearcher, self).__init__(organizer) def reset(self, organizer): self.learnware_organizer = organizer diff --git a/tests/test_hetero_market/test_hetero.py b/tests/test_hetero_market/test_hetero.py index 3a55ca7..58c285e 100644 --- a/tests/test_hetero_market/test_hetero.py +++ 
b/tests/test_hetero_market/test_hetero.py @@ -57,9 +57,11 @@ class TestMarket(unittest.TestCase): np.random.seed(2023) learnware.init() - def _init_learnware_market(self): + def _init_learnware_market(self, organizer_kwargs=None): """initialize learnware market""" - hetero_market = instantiate_learnware_market(market_id="hetero_toy", name="hetero", rebuild=True) + hetero_market = instantiate_learnware_market( + market_id="hetero_toy", name="hetero", rebuild=True, organizer_kwargs=organizer_kwargs + ) return hetero_market def test_prepare_learnware_randomly(self, learnware_num=5): @@ -161,10 +163,11 @@ class TestMarket(unittest.TestCase): return hetero_market def test_train_market_model(self, learnware_num=5): - hetero_market = self._init_learnware_market() + hetero_market = self._init_learnware_market( + organizer_kwargs={"auto_update": False, "auto_update_limit": learnware_num} + ) self.test_prepare_learnware_randomly(learnware_num) self.learnware_num = learnware_num - hetero_market.learnware_organizer.reset(auto_update=False, auto_update_limit=learnware_num) print("Total Item:", len(hetero_market)) assert len(hetero_market) == 0, f"The market should be empty!" 
@@ -407,13 +410,13 @@ class TestMarket(unittest.TestCase): def suite(): _suite = unittest.TestSuite() - # _suite.addTest(TestMarket("test_prepare_learnware_randomly")) - # _suite.addTest(TestMarket("test_generated_learnwares")) - # _suite.addTest(TestMarket("test_upload_delete_learnware")) - # _suite.addTest(TestMarket("test_train_market_model")) - # _suite.addTest(TestMarket("test_search_semantics")) + _suite.addTest(TestMarket("test_prepare_learnware_randomly")) + _suite.addTest(TestMarket("test_generated_learnwares")) + _suite.addTest(TestMarket("test_upload_delete_learnware")) + _suite.addTest(TestMarket("test_train_market_model")) + _suite.addTest(TestMarket("test_search_semantics")) _suite.addTest(TestMarket("test_stat_search")) - # _suite.addTest(TestMarket("test_model_reuse")) + _suite.addTest(TestMarket("test_model_reuse")) return _suite From edde6807326b1e1f8145c0e9463969c583179bf8 Mon Sep 17 00:00:00 2001 From: bxdd Date: Tue, 14 Nov 2023 14:49:05 +0800 Subject: [PATCH 70/90] [FIX] fix init market module, and update hetero organizer reload method --- learnware/market/heterogeneous/organizer/__init__.py | 5 +++-- learnware/market/module.py | 1 - 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/learnware/market/heterogeneous/organizer/__init__.py b/learnware/market/heterogeneous/organizer/__init__.py index 78c738c..e773812 100644 --- a/learnware/market/heterogeneous/organizer/__init__.py +++ b/learnware/market/heterogeneous/organizer/__init__.py @@ -21,7 +21,7 @@ class HeteroMapTableOrganizer(EasyOrganizer): os.makedirs(hetero_folder_path, exist_ok=True) self.market_mapping_path = os.path.join(hetero_folder_path, "model.bin") self.hetero_specs_path = os.path.join(hetero_folder_path, "hetero_specifications") - self.training_args = {"cache_dir": hetero_folder_path} + self.training_args.update({"cache_dir": hetero_folder_path}) os.makedirs(self.hetero_specs_path, exist_ok=True) if os.path.exists(self.market_mapping_path): @@ -45,12 +45,13 @@ 
class HeteroMapTableOrganizer(EasyOrganizer): self.market_mapping = HeteroMap(cache_dir=hetero_folder_path) def reset(self, market_id, rebuild=False, auto_update=False, auto_update_limit=100, **training_args): - super(HeteroMapTableOrganizer, self).reset(market_id, rebuild) self.auto_update = auto_update self.auto_update_limit = auto_update_limit self.count_down = auto_update_limit self.training_args = training_args + super(HeteroMapTableOrganizer, self).reset(market_id, rebuild) + def add_learnware( self, zip_path: str, semantic_spec: dict, check_status: int, learnware_id: str = None ) -> Tuple[str, int]: diff --git a/learnware/market/module.py b/learnware/market/module.py index 97f49be..70fd340 100644 --- a/learnware/market/module.py +++ b/learnware/market/module.py @@ -44,7 +44,6 @@ def instantiate_learnware_market( ): market_componets = get_market_component(name, market_id, rebuild, organizer_kwargs, searcher_kwargs, checker_kwargs) return LearnwareMarket( - market_id=market_id, organizer=market_componets["organizer"], searcher=market_componets["searcher"], checker_list=market_componets["checker_list"], From 81afc16efc299f80d279fd4e4b26f56388a51b70 Mon Sep 17 00:00:00 2001 From: bxdd Date: Tue, 14 Nov 2023 14:56:46 +0800 Subject: [PATCH 71/90] [DOC] update docstring for unify them --- .../organizer/hetero_map/feature_extractor.py | 40 ++++++--------- .../organizer/hetero_map/trainer.py | 50 +++++++++---------- 2 files changed, 40 insertions(+), 50 deletions(-) diff --git a/learnware/market/heterogeneous/organizer/hetero_map/feature_extractor.py b/learnware/market/heterogeneous/organizer/hetero_map/feature_extractor.py index 89105c0..10d390a 100644 --- a/learnware/market/heterogeneous/organizer/hetero_map/feature_extractor.py +++ b/learnware/market/heterogeneous/organizer/hetero_map/feature_extractor.py @@ -10,9 +10,7 @@ from transformers import BertTokenizerFast class WordEmbedding(nn.Module): - """ - Encode tokens drawn from column names - """ + """Encode 
tokens drawn from column names""" def __init__( self, @@ -36,9 +34,7 @@ class WordEmbedding(nn.Module): class NumEmbedding(nn.Module): - """ - Encode tokens drawn from column names and the corresponding numerical features. - """ + """Encode tokens drawn from column names and the corresponding numerical features.""" def __init__(self, hidden_dim): super().__init__() @@ -47,9 +43,13 @@ class NumEmbedding(nn.Module): nn_init.uniform_(self.num_bias, a=-1 / math.sqrt(hidden_dim), b=1 / math.sqrt(hidden_dim)) def forward(self, col_emb, x_ts) -> Tensor: - """args: - col_emb: numerical column embedding, (# numerical columns, emb_dim) - x_ts: numerical features, (bs, emb_dim) + """ + Parameters + ---------- + col_emb : Any + numerical column embedding, (# numerical columns, emb_dim) + x_ts : Any + numerical features, (bs, emb_dim) """ col_emb = col_emb.unsqueeze(0).expand((x_ts.shape[0], -1, -1)) feat_emb = col_emb * x_ts.unsqueeze(-1).float() + self.num_bias @@ -57,10 +57,7 @@ class NumEmbedding(nn.Module): class FeatureTokenizer: - """ - Process input dataframe to input indices towards encoder, - usually used to build dataloader for paralleling loading. 
- """ + """Process input dataframe to input indices towards encoder, usually used to build dataloader for paralleling loading.""" def __init__( self, @@ -68,8 +65,11 @@ class FeatureTokenizer: cache_dir=None, **kwargs, ): - """args: - disable_tokenizer_parallel: true if use extractor for collator function in torch.DataLoader + """ + Parameters + ---------- + disable_tokenizer_parallel : bool, optional + true if use extractor for collator function in torch.DataLoader """ self.tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased", cache_dir=cache_dir) self.tokenizer.__dict__["model_max_length"] = 512 @@ -95,10 +95,7 @@ class FeatureTokenizer: 'num_col_input_ids': tensor contains numerical column tokenized ids, } """ - encoded_inputs = { - "x_num": None, - "num_col_input_ids": None - } + encoded_inputs = {"x_num": None, "num_col_input_ids": None} num_cols = x.columns.tolist() if not shuffle else np.random.shuffle(x.columns.tolist()) x_num = x[num_cols].fillna(0) @@ -194,11 +191,6 @@ class FeatureProcessor(nn.Module): num_att_mask=None, **kwargs, ) -> Tensor: - """args: - x: pd.DataFrame with column names and features. - shuffle: if shuffle column order during the training. - num_mask: indicate the NaN place of numerical features, 0: NaN 1: normal. - """ x_num = x_num.to(self.device) num_col_emb = self.word_embedding(num_col_input_ids.to(self.device)) diff --git a/learnware/market/heterogeneous/organizer/hetero_map/trainer.py b/learnware/market/heterogeneous/organizer/hetero_map/trainer.py index f192b78..c4a85e6 100644 --- a/learnware/market/heterogeneous/organizer/hetero_map/trainer.py +++ b/learnware/market/heterogeneous/organizer/hetero_map/trainer.py @@ -30,11 +30,6 @@ class Trainer: eval_batch_size=256, **kwargs, ): - """args: - train_set_list: a list of training sets [(x_1,y_1),(x_2,y_2),...] - patience: the max number of early stop patience - eval_less_is_better: if the set eval_metric is the less the better. For val_loss, it should be set True. 
- """ self.model = model if isinstance(train_set_list, tuple): train_set_list = [train_set_list] @@ -129,9 +124,7 @@ class Trainer: return trainloader def _get_parameter_names(self, model, forbidden_layer_types): - """ - Returns the names of the model parameters that are not inside a forbidden layer. - """ + """Returns the names of the model parameters that are not inside a forbidden layer.""" result = [] for name, child in model.named_children(): result += [ @@ -174,9 +167,7 @@ class TransTabCollatorForCL: self.num_partition = num_partition def __call__(self, data): - """ - Take a list of subsets (views) from the original tests. - """ + """Take a list of subsets (views) from the original tests.""" # 1. build positive pairs # 2. encode each pair using feature extractor df_x = pd.concat([row for row in data]) @@ -192,15 +183,19 @@ class TransTabCollatorForCL: return res def _build_positive_pairs(self, x, n): - """ - Builds positive pairs of sub-dataframes from the input dataframe x. - - Args: - x (pandas.DataFrame): Input dataframe. - n (int): Number of sub-dataframes to split x into. - - Returns: - list: List of sub-dataframes, each containing a positive pair of columns from x. + """Builds positive pairs of sub-dataframes from the input dataframe x. + + Parameters + ---------- + x : pandas.DataFrame + Input dataframe. + n : int + Number of sub-dataframes to split x into. + + Returns + ------- + List + List of sub-dataframes, each containing a positive pair of columns from x. """ x_cols = x.columns.tolist() sub_col_list = np.array_split(np.array(x_cols), n) @@ -217,14 +212,17 @@ class TransTabCollatorForCL: return sub_x_list def _build_positive_pairs_single_view(self, x): - """ - Builds positive pairs for a single view of data by corrupting half of the columns and shuffling the corrupted columns. + """Builds positive pairs for a single view of data by corrupting half of the columns and shuffling the corrupted columns. - Args: - x (pandas.DataFrame): The input data. 
+ Parameters + ---------- + x : pandas.DataFrame + The input data. - Returns: - list: A list of two pandas DataFrames, where each DataFrame contains the original data with half of the columns corrupted and shuffled. + Returns + ------- + List + A list of two pandas DataFrames, where each DataFrame contains the original data with half of the columns corrupted and shuffled. """ x_cols = x.columns.tolist() sub_x_list = [x] From ead0dbe2a818bcfa83d299fd7ca77ed5ea1099cd Mon Sep 17 00:00:00 2001 From: bxdd Date: Tue, 14 Nov 2023 15:10:44 +0800 Subject: [PATCH 72/90] [DOC] update docstring for hetero map --- .../organizer/hetero_map/__init__.py | 80 +++++++++++-------- 1 file changed, 48 insertions(+), 32 deletions(-) diff --git a/learnware/market/heterogeneous/organizer/hetero_map/__init__.py b/learnware/market/heterogeneous/organizer/hetero_map/__init__.py index 3c63580..265a0ee 100644 --- a/learnware/market/heterogeneous/organizer/hetero_map/__init__.py +++ b/learnware/market/heterogeneous/organizer/hetero_map/__init__.py @@ -24,23 +24,6 @@ class HeteroMap(nn.Module): is modified for feature extraction purposes only. The class implements a neural network module for processing tabular data, specifically tuned for numerical features. - - Args: - feature_tokenizer (FeatureTokenizer, optional): Tokenizer for feature representation. - hidden_dim (int, optional): Dimension of hidden layer. - num_layer (int, optional): Number of layers in the transformer encoder. - num_attention_head (int, optional): Number of attention heads in the transformer. - hidden_dropout_prob (float, optional): Dropout probability for hidden layers. - ffn_dim (int, optional): Dimension of feedforward network. - projection_dim (int, optional): Dimension for projection head. - overlap_ratio (float, optional): Overlap ratio for tokenization. - num_partition (int, optional): Number of partitions for collation. - temperature (float, optional): Temperature parameter for contrastive learning. 
- base_temperature (float, optional): Base temperature parameter. - activation (str, optional): Activation function for transformer layers. - device (str, optional): Device to run the model on. - checkpoint (str, optional): Path to a pre-trained model checkpoint. - **kwargs: Additional keyword arguments. """ def __init__( @@ -58,10 +41,41 @@ class HeteroMap(nn.Module): base_temperature=10, activation="relu", device="cuda:0", - checkpoint=None, cache_dir=None, **kwargs, ): + """ + Parameters + ---------- + feature_tokenizer : FeatureTokenizer, optional + Tokenizer for feature representation, by default None + hidden_dim : int, optional + Dimension of hidden layer, by default 128 + num_layer : int, optional + Number of layers in the transformer encoder, by default 2 + num_attention_head : int, optional + Number of attention heads in the transformer, by default 8 + hidden_dropout_prob : int, optional + Dropout probability for hidden layers, by default 0 + ffn_dim : int, optional + Dimension of feedforward network, by default 256 + projection_dim : int, optional + Dimension for projection head, by default 128 + overlap_ratio : float, optional + Overlap ratio for tokenizatio, by default 0.5 + num_partition : int, optional + Number of partitions for collatio, by default 3 + temperature : int, optional + Temperature parameter for contrastive learnin, by default 10 + base_temperature : int, optional + Base temperature paramete, by default 10 + activation : str, optional + Activation function for transformer layer, by default "relu" + device : str, optional + Device to run the model on, by default "cuda:0" + cache_dir : str, optional + The cache directory, by default None + """ super(HeteroMap, self).__init__() self.model_args = { @@ -74,7 +88,7 @@ class HeteroMap(nn.Module): "ffn_dim": ffn_dim, "projection_dim": projection_dim, "activation": activation, - "cache_dir": cache_dir + "cache_dir": cache_dir, } self.model_args.update(kwargs) @@ -139,10 +153,7 @@ class 
HeteroMap(nn.Module): the directory path to save. """ # save model weight state dict - model_info = { - "model_state_dict": self.state_dict(), - "model_args": self.model_args - } + model_info = {"model_state_dict": self.state_dict(), "model_args": self.model_args} torch.save(model_info, checkpoint) def forward(self, x, y=None): @@ -361,14 +372,16 @@ class TransformerLayer(nn.Module): def forward(self, src, src_mask=None, src_key_padding_mask=None, is_causal=None, **kwargs) -> Tensor: """Pass the input through the encoder layer. - Args: - src: the sequence to the encoder layer (required). - src_mask: the mask for the src sequence (optional). - src_key_padding_mask: the mask for the src keys per batch (optional). - - Shape: - see the docs in Transformer class. + Parameters + ---------- + src : Any + The sequence to the encoder layer. + src_mask : Any, optional + The mask for the src sequence, by default None + src_key_padding_mask : Any, optional + The mask for the src keys per batch, by default None """ + # see Fig. 
1 of https://arxiv.org/pdf/2002.04745v1.pdf x = src if self.use_layer_norm: @@ -427,8 +440,11 @@ class TransformerMultiLayer(nn.Module): self.transformer_encoder.append(stacked_transformer) def forward(self, embedding, attention_mask=None, **kwargs) -> Tensor: - """args: - embedding: bs, num_token, hidden_dim + """ + Parameters + ---------- + embedding : Any + bs, num_token, hidden_dim """ outputs = embedding for i, mod in enumerate(self.transformer_encoder): From 5c695ef45dd238aa9d819a4ce1d1dcf60bfb82fa Mon Sep 17 00:00:00 2001 From: Gene Date: Tue, 14 Nov 2023 15:23:58 +0800 Subject: [PATCH 73/90] [MNT] add cache dir for FeatureTokenizer --- learnware/config.py | 3 +++ learnware/market/heterogeneous/organizer/__init__.py | 4 ++-- .../heterogeneous/organizer/hetero_map/__init__.py | 9 ++------- .../organizer/hetero_map/feature_extractor.py | 10 +++++----- 4 files changed, 12 insertions(+), 14 deletions(-) diff --git a/learnware/config.py b/learnware/config.py index 45c04b1..84c839b 100644 --- a/learnware/config.py +++ b/learnware/config.py @@ -63,11 +63,13 @@ LEARNWARE_FOLDER_POOL_PATH = os.path.join(LEARNWARE_POOL_PATH, "learnwares") DATABASE_PATH = os.path.join(ROOT_DIRPATH, "database") STDOUT_PATH = os.path.join(ROOT_DIRPATH, "stdout") +CACHE_PATH = os.path.join(ROOT_DIRPATH, "cache") # TODO: Delete them later os.makedirs(ROOT_DIRPATH, exist_ok=True) os.makedirs(DATABASE_PATH, exist_ok=True) os.makedirs(STDOUT_PATH, exist_ok=True) +os.makedirs(CACHE_PATH, exist_ok=True) semantic_config = { "Data": { @@ -123,6 +125,7 @@ _DEFAULT_CONFIG = { "root_path": ROOT_DIRPATH, "package_path": PACKAGE_DIRPATH, "stdout_path": STDOUT_PATH, + "cache_path": CACHE_PATH, "logging_level": logging.INFO, "logging_outfile": None, "semantic_specs": semantic_config, diff --git a/learnware/market/heterogeneous/organizer/__init__.py b/learnware/market/heterogeneous/organizer/__init__.py index 78c738c..07604b8 100644 --- a/learnware/market/heterogeneous/organizer/__init__.py +++ 
b/learnware/market/heterogeneous/organizer/__init__.py @@ -21,7 +21,7 @@ class HeteroMapTableOrganizer(EasyOrganizer): os.makedirs(hetero_folder_path, exist_ok=True) self.market_mapping_path = os.path.join(hetero_folder_path, "model.bin") self.hetero_specs_path = os.path.join(hetero_folder_path, "hetero_specifications") - self.training_args = {"cache_dir": hetero_folder_path} + self.training_args = {} os.makedirs(self.hetero_specs_path, exist_ok=True) if os.path.exists(self.market_mapping_path): @@ -42,7 +42,7 @@ class HeteroMapTableOrganizer(EasyOrganizer): self._update_learnware_by_ids(self.get_learnware_ids(check_status=BaseChecker.USABLE_LEARWARE)) else: logger.warning(f"No market mapping to reload!") - self.market_mapping = HeteroMap(cache_dir=hetero_folder_path) + self.market_mapping = HeteroMap() def reset(self, market_id, rebuild=False, auto_update=False, auto_update_limit=100, **training_args): super(HeteroMapTableOrganizer, self).reset(market_id, rebuild) diff --git a/learnware/market/heterogeneous/organizer/hetero_map/__init__.py b/learnware/market/heterogeneous/organizer/hetero_map/__init__.py index 3c63580..653514c 100644 --- a/learnware/market/heterogeneous/organizer/hetero_map/__init__.py +++ b/learnware/market/heterogeneous/organizer/hetero_map/__init__.py @@ -59,7 +59,6 @@ class HeteroMap(nn.Module): activation="relu", device="cuda:0", checkpoint=None, - cache_dir=None, **kwargs, ): super(HeteroMap, self).__init__() @@ -74,12 +73,11 @@ class HeteroMap(nn.Module): "ffn_dim": ffn_dim, "projection_dim": projection_dim, "activation": activation, - "cache_dir": cache_dir } self.model_args.update(kwargs) if feature_tokenizer is None: - feature_tokenizer = FeatureTokenizer(cache_dir=cache_dir, **kwargs) + feature_tokenizer = FeatureTokenizer(**kwargs) self.feature_tokenizer = feature_tokenizer @@ -139,10 +137,7 @@ class HeteroMap(nn.Module): the directory path to save. 
""" # save model weight state dict - model_info = { - "model_state_dict": self.state_dict(), - "model_args": self.model_args - } + model_info = {"model_state_dict": self.state_dict(), "model_args": self.model_args} torch.save(model_info, checkpoint) def forward(self, x, y=None): diff --git a/learnware/market/heterogeneous/organizer/hetero_map/feature_extractor.py b/learnware/market/heterogeneous/organizer/hetero_map/feature_extractor.py index 89105c0..4c7bcef 100644 --- a/learnware/market/heterogeneous/organizer/hetero_map/feature_extractor.py +++ b/learnware/market/heterogeneous/organizer/hetero_map/feature_extractor.py @@ -8,6 +8,8 @@ import torch.nn.init as nn_init from torch import Tensor, nn from transformers import BertTokenizerFast +from .....config import C as conf + class WordEmbedding(nn.Module): """ @@ -65,12 +67,13 @@ class FeatureTokenizer: def __init__( self, disable_tokenizer_parallel=True, - cache_dir=None, **kwargs, ): """args: disable_tokenizer_parallel: true if use extractor for collator function in torch.DataLoader """ + cache_dir = conf["cache_path"] + os.makedirs(cache_dir, exist_ok=True) self.tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased", cache_dir=cache_dir) self.tokenizer.__dict__["model_max_length"] = 512 if disable_tokenizer_parallel: # disable tokenizer parallel @@ -95,10 +98,7 @@ class FeatureTokenizer: 'num_col_input_ids': tensor contains numerical column tokenized ids, } """ - encoded_inputs = { - "x_num": None, - "num_col_input_ids": None - } + encoded_inputs = {"x_num": None, "num_col_input_ids": None} num_cols = x.columns.tolist() if not shuffle else np.random.shuffle(x.columns.tolist()) x_num = x[num_cols].fillna(0) From 9407a76bc9b881cbd182dacd71d15bed2b1bcc78 Mon Sep 17 00:00:00 2001 From: bxdd Date: Tue, 14 Nov 2023 16:47:31 +0800 Subject: [PATCH 74/90] [MNT] add gpu utils --- .../organizer/hetero_map/__init__.py | 2 +- learnware/reuse/hetero/feature_align.py | 3 +- 
learnware/specification/regular/image/rkme.py | 3 +- learnware/specification/regular/table/rkme.py | 41 +---------------- .../specification/system/hetero_table.py | 5 +- learnware/utils/__init__.py | 1 + learnware/utils/gpu.py | 46 +++++++++++++++++++ 7 files changed, 56 insertions(+), 45 deletions(-) create mode 100644 learnware/utils/gpu.py diff --git a/learnware/market/heterogeneous/organizer/hetero_map/__init__.py b/learnware/market/heterogeneous/organizer/hetero_map/__init__.py index 265a0ee..6155732 100644 --- a/learnware/market/heterogeneous/organizer/hetero_map/__init__.py +++ b/learnware/market/heterogeneous/organizer/hetero_map/__init__.py @@ -40,7 +40,7 @@ class HeteroMap(nn.Module): temperature=10, base_temperature=10, activation="relu", - device="cuda:0", + device="cpu", cache_dir=None, **kwargs, ): diff --git a/learnware/reuse/hetero/feature_align.py b/learnware/reuse/hetero/feature_align.py index 71e3d29..beaf1ca 100644 --- a/learnware/reuse/hetero/feature_align.py +++ b/learnware/reuse/hetero/feature_align.py @@ -7,10 +7,11 @@ from tqdm import trange import torch.nn.functional as F from ..align import AlignLearnware +from ...utils import choose_device from ...logger import get_module_logger from ...learnware import Learnware from ...specification import RKMETableSpecification -from ...specification.regular.table.rkme import choose_device + logger = get_module_logger("feature_align") diff --git a/learnware/specification/regular/image/rkme.py b/learnware/specification/regular/image/rkme.py index c09be8b..50e367a 100644 --- a/learnware/specification/regular/image/rkme.py +++ b/learnware/specification/regular/image/rkme.py @@ -18,7 +18,8 @@ from tqdm import tqdm from . 
import cnn_gp from ..base import RegularStatSpecification -from ..table.rkme import rkme_solve_qp, choose_device, setup_seed +from ..table.rkme import rkme_solve_qp +from ....utils import choose_device, setup_seed class RKMEImageSpecification(RegularStatSpecification): diff --git a/learnware/specification/regular/table/rkme.py b/learnware/specification/regular/table/rkme.py index 996b9a9..8b97632 100644 --- a/learnware/specification/regular/table/rkme.py +++ b/learnware/specification/regular/table/rkme.py @@ -15,6 +15,7 @@ from sklearn.cluster import MiniBatchKMeans from ..base import RegularStatSpecification from ....logger import get_module_logger +from ....utils import setup_seed, choose_device logger = get_module_logger("rkme") @@ -461,46 +462,6 @@ class RKMEStatSpecification(RKMETableSpecification): super(RKMETableSpecification, self).__init__(type=RKMETableSpecification.__name__) -def setup_seed(seed): - """Fix a random seed for addressing reproducibility issues. - - Parameters - ---------- - seed : int - Random seed for torch, torch.cuda, numpy, random and cudnn libraries. - """ - torch.manual_seed(seed) - torch.cuda.manual_seed_all(seed) - np.random.seed(seed) - random.seed(seed) - torch.backends.cudnn.deterministic = True - - -def choose_device(cuda_idx=-1): - """Let users choose compuational device between CPU or GPU. - - Parameters - ---------- - cuda_idx : int, optional - GPU index, by default -1 which stands for using CPU instead. - - Returns - ------- - torch.device - A torch.device object - """ - cuda_idx = int(cuda_idx) - if cuda_idx == -1 or not torch.cuda.is_available(): - device = torch.device("cpu") - else: - device_count = torch.cuda.device_count() - if cuda_idx >= 0 and cuda_idx < device_count: - device = torch.device(f"cuda:{cuda_idx}") - else: - device = torch.device("cuda:0") - return device - - def torch_rbf_kernel(x1, x2, gamma) -> torch.Tensor: """Use pytorch to compute rbf_kernel function at faster speed. 
diff --git a/learnware/specification/system/hetero_table.py b/learnware/specification/system/hetero_table.py index 918ee11..4e89f2d 100644 --- a/learnware/specification/system/hetero_table.py +++ b/learnware/specification/system/hetero_table.py @@ -7,9 +7,10 @@ import torch import codecs import numpy as np -from ..regular import RKMETableSpecification -from ..regular.table.rkme import choose_device, setup_seed, torch_rbf_kernel from .base import SystemStatSpecification +from ..regular import RKMETableSpecification +from ..regular.table.rkme import torch_rbf_kernel +from ...utils import choose_device, setup_seed class HeteroMapTableSpecification(SystemStatSpecification): diff --git a/learnware/utils/__init__.py b/learnware/utils/__init__.py index 60f2b46..f37bc03 100644 --- a/learnware/utils/__init__.py +++ b/learnware/utils/__init__.py @@ -4,6 +4,7 @@ import zipfile from .import_utils import is_torch_avaliable from .module import get_module_by_module_path from .file import read_yaml_to_dict, save_dict_to_yaml +from .gpu import setup_seed, choose_device def zip_learnware_folder(path: str, output_name: str): diff --git a/learnware/utils/gpu.py b/learnware/utils/gpu.py new file mode 100644 index 0000000..95fbfe1 --- /dev/null +++ b/learnware/utils/gpu.py @@ -0,0 +1,46 @@ +import random +import numpy as np + + +def setup_seed(seed): + import torch + + """Fix a random seed for addressing reproducibility issues. + + Parameters + ---------- + seed : int + Random seed for torch, torch.cuda, numpy, random and cudnn libraries. + """ + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + np.random.seed(seed) + random.seed(seed) + torch.backends.cudnn.deterministic = True + + +def choose_device(cuda_idx=-1): + import torch + + """Let users choose compuational device between CPU or GPU. + + Parameters + ---------- + cuda_idx : int, optional + GPU index, by default -1 which stands for using CPU instead. 
+ + Returns + ------- + torch.device + A torch.device object + """ + cuda_idx = int(cuda_idx) + if cuda_idx == -1 or not torch.cuda.is_available(): + device = torch.device("cpu") + else: + device_count = torch.cuda.device_count() + if cuda_idx >= 0 and cuda_idx < device_count: + device = torch.device(f"cuda:{cuda_idx}") + else: + device = torch.device("cuda:0") + return device From ea1fd9ffe36dc2a2a8ed569adb773733a5e14830 Mon Sep 17 00:00:00 2001 From: bxdd Date: Tue, 14 Nov 2023 20:27:14 +0800 Subject: [PATCH 75/90] [MNT] update hetero organizer --- learnware/market/easy/organizer.py | 7 +------ learnware/market/heterogeneous/organizer/__init__.py | 12 +++++++----- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/learnware/market/easy/organizer.py b/learnware/market/easy/organizer.py index 412664b..e3587e8 100644 --- a/learnware/market/easy/organizer.py +++ b/learnware/market/easy/organizer.py @@ -374,13 +374,8 @@ class EasyOrganizer(BaseOrganizer): return [self.learnware_list[idx] for idx in learnware_ids] def reload_learnware(self, learnware_id: str): - current_learnware = self.learnware_list.get(learnware_id) - - if current_learnware is None: - # add learnware + if learnware_id not in self.learnware_list: self.count += 1 - else: - pass target_zip_dir = os.path.join(self.learnware_zip_pool_path, "%s.zip" % (learnware_id)) target_folder_dir = os.path.join(self.learnware_folder_pool_path, learnware_id) diff --git a/learnware/market/heterogeneous/organizer/__init__.py b/learnware/market/heterogeneous/organizer/__init__.py index 758570e..f397726 100644 --- a/learnware/market/heterogeneous/organizer/__init__.py +++ b/learnware/market/heterogeneous/organizer/__init__.py @@ -29,13 +29,15 @@ class HeteroMapTableOrganizer(EasyOrganizer): if not rebuild: if os.path.exists(self.hetero_specs_path): for hetero_json_path in os.listdir(self.hetero_specs_path): + if not hetero_json_path.endswith(".json"): + continue try: idx = hetero_json_path.split(".")[0] 
hetero_spec = HeteroMapTableSpecification() - hetero_spec.load(os.path.join(self.hetero_specs_path, f"{idx}.json")) + hetero_spec.load(os.path.join(self.hetero_specs_path, hetero_json_path)) self.learnware_list[idx].update_stat_spec(hetero_spec.type, hetero_spec) - except: - logger.warning(f"Learnware {idx} hetero spec loaded failed!") + except Exception as err: + logger.warning(f"Learnware in {hetero_json_path} hetero spec loaded failed! due to {err}.") else: logger.info("No HeteroMapTableSpecification to reload. Use loaded market mapping to regenerate.") self._update_learnware_by_ids(self.get_learnware_ids(check_status=BaseChecker.USABLE_LEARWARE)) @@ -136,8 +138,8 @@ class HeteroMapTableOrganizer(EasyOrganizer): semantic_spec, rkme = spec.get_semantic_spec(), spec.get_stat_spec().get("RKMETableSpecification", None) if isinstance(rkme, RKMETableSpecification) and isinstance(semantic_spec["Input"], dict): ret.append(idx) - except: - continue + except Exception: + pass return ret def generate_hetero_map_spec(self, user_info: BaseUserInfo) -> HeteroMapTableSpecification: From fe760d14287cf0d03567d396d4dd523f51e1cede Mon Sep 17 00:00:00 2001 From: bxdd Date: Tue, 14 Nov 2023 21:35:16 +0800 Subject: [PATCH 76/90] [MNT] del self.learnware in HeteroMapAlignLearnware and FeatureAlignLearnware --- .../market/heterogeneous/organizer/hetero_map/__init__.py | 7 +++---- .../organizer/hetero_map/feature_extractor.py | 2 +- learnware/reuse/align.py | 1 - learnware/reuse/hetero/feature_align.py | 5 ++--- learnware/reuse/hetero/hetero_map.py | 2 +- 5 files changed, 7 insertions(+), 10 deletions(-) diff --git a/learnware/market/heterogeneous/organizer/hetero_map/__init__.py b/learnware/market/heterogeneous/organizer/hetero_map/__init__.py index 2a2397c..97a92da 100644 --- a/learnware/market/heterogeneous/organizer/hetero_map/__init__.py +++ b/learnware/market/heterogeneous/organizer/hetero_map/__init__.py @@ -1,14 +1,13 @@ -import os import numpy as np import pandas as pd -from 
typing import List, Optional import torch import torch.nn.functional as F from torch import Tensor, nn +from typing import List, Optional from .....specification import HeteroMapTableSpecification, RKMETableSpecification -from .feature_extractor import * -from .trainer import Trainer, TransTabCollatorForCL +from .feature_extractor import FeatureTokenizer, FeatureProcessor, CLSToken +from .trainer import TransTabCollatorForCL, Trainer class HeteroMap(nn.Module): diff --git a/learnware/market/heterogeneous/organizer/hetero_map/feature_extractor.py b/learnware/market/heterogeneous/organizer/hetero_map/feature_extractor.py index ef27344..40a019c 100644 --- a/learnware/market/heterogeneous/organizer/hetero_map/feature_extractor.py +++ b/learnware/market/heterogeneous/organizer/hetero_map/feature_extractor.py @@ -72,7 +72,7 @@ class FeatureTokenizer: disable_tokenizer_parallel : bool, optional true if use extractor for collator function in torch.DataLoader """ - cache_dir = conf["cache_path"] + cache_dir = conf.cache_path os.makedirs(cache_dir, exist_ok=True) self.tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased", cache_dir=cache_dir) self.tokenizer.__dict__["model_max_length"] = 512 diff --git a/learnware/reuse/align.py b/learnware/reuse/align.py index 04d29d4..47a47d5 100644 --- a/learnware/reuse/align.py +++ b/learnware/reuse/align.py @@ -18,7 +18,6 @@ class AlignLearnware(Learnware): specification=learnware.get_specification(), learnware_dirpath=learnware.get_dirpath(), ) - self.learnware = learnware def align(self): """Align the learnware with specification or data""" diff --git a/learnware/reuse/hetero/feature_align.py b/learnware/reuse/hetero/feature_align.py index beaf1ca..b43ed57 100644 --- a/learnware/reuse/hetero/feature_align.py +++ b/learnware/reuse/hetero/feature_align.py @@ -12,7 +12,6 @@ from ...logger import get_module_logger from ...learnware import Learnware from ...specification import RKMETableSpecification - logger = 
get_module_logger("feature_align") @@ -61,7 +60,7 @@ class FeatureAlignLearnware(AlignLearnware): user_rkme : RKMETableSpecification The RKME specification from the user dataset. """ - target_rkme = self.learnware.specification.get_stat_spec()["RKMETableSpecification"] + target_rkme = self.specification.get_stat_spec()["RKMETableSpecification"] trainer = FeatureAlignTrainer( target_rkme=target_rkme, user_rkme=user_rkme, cuda_idx=self.cuda_idx, **self.align_arguments ) @@ -87,7 +86,7 @@ class FeatureAlignLearnware(AlignLearnware): transformed_user_data = ( self.align_model(torch.tensor(user_data, device=self.device).float()).detach().cpu().numpy() ) - y_pred = self.learnware.predict(transformed_user_data) + y_pred = super(FeatureAlignLearnware, self).predict(transformed_user_data) return y_pred def _fill_data(self, X: np.ndarray): diff --git a/learnware/reuse/hetero/hetero_map.py b/learnware/reuse/hetero/hetero_map.py index 76f9ce0..c41095a 100644 --- a/learnware/reuse/hetero/hetero_map.py +++ b/learnware/reuse/hetero/hetero_map.py @@ -63,7 +63,7 @@ class HeteroMapAlignLearnware(AlignLearnware): Training data labels. 
""" self.feature_align_learnware = FeatureAlignLearnware( - learnware=self.learnware, cuda_idx=self.cuda_idx, **self.align_arguments + learnware=self, cuda_idx=self.cuda_idx, **self.align_arguments ) self.feature_align_learnware.align(user_rkme) From b5d442d243cbe4aeda0e07c79a4ade6f51daec27 Mon Sep 17 00:00:00 2001 From: liuht Date: Tue, 14 Nov 2023 22:30:03 +0800 Subject: [PATCH 77/90] [DOC] add docs for hetero_organizer, searcher --- learnware/market/easy/organizer.py | 6 +- .../heterogeneous/organizer/__init__.py | 135 ++++++- .../organizer/hetero_map/__init__.py | 337 +++++++++++++----- .../organizer/hetero_map/feature_extractor.py | 231 +++++++++--- .../organizer/hetero_map/trainer.py | 207 +++++++++-- learnware/market/heterogeneous/searcher.py | 35 +- learnware/reuse/job_selector.py | 2 +- 7 files changed, 773 insertions(+), 180 deletions(-) diff --git a/learnware/market/easy/organizer.py b/learnware/market/easy/organizer.py index 412664b..3165fd6 100644 --- a/learnware/market/easy/organizer.py +++ b/learnware/market/easy/organizer.py @@ -17,12 +17,12 @@ logger = get_module_logger("easy_organizer") class EasyOrganizer(BaseOrganizer): def reload_market(self, rebuild=False) -> bool: - """Reload the learnware organizer when server restared. + """Reload the learnware organizer when server restarted. Returns ------- bool - A flag indicating whether the market is reload successfully. + A flag indicating whether the market is reloaded successfully. 
""" self.market_store_path = os.path.join(conf.market_root_path, self.market_id) self.learnware_pool_path = os.path.join(self.market_store_path, "learnware_pool") @@ -234,7 +234,7 @@ class EasyOrganizer(BaseOrganizer): ---------- ids : Union[str, List[str]] Give a id or a list of ids - str: id of targer learware + str: id of target learware List[str]: A list of ids of target learnwares Returns diff --git a/learnware/market/heterogeneous/organizer/__init__.py b/learnware/market/heterogeneous/organizer/__init__.py index 758570e..16d66fb 100644 --- a/learnware/market/heterogeneous/organizer/__init__.py +++ b/learnware/market/heterogeneous/organizer/__init__.py @@ -14,7 +14,14 @@ logger = get_module_logger("hetero_map_table_organizer") class HeteroMapTableOrganizer(EasyOrganizer): - def reload_market(self, rebuild=False): + def reload_market(self, rebuild=False) -> bool: + """Reload the heterogeneous learnware organizer when server restarted. + + Returns + ------- + bool + A flag indicating whether the heterogeneous market is reloaded successfully. + """ super(HeteroMapTableOrganizer, self).reload_market(rebuild=rebuild) hetero_folder_path = os.path.join(self.market_store_path, "hetero") @@ -44,6 +51,19 @@ class HeteroMapTableOrganizer(EasyOrganizer): self.market_mapping = HeteroMap() def reset(self, market_id, rebuild=False, auto_update=False, auto_update_limit=100, **training_args): + """Reset the heterogeneous market with specified settings. 
+ + Parameters + ---------- + market_id : str + the heterogeneous market's id + rebuild : bool, optional + A flag indicating whether to reload market, by default False + auto_update : bool, optional + A flag indicating whether to enable automatic updating of market mapping, by default False + auto_update_limit : int, optional + The threshold for the number of learnwares required to trigger an automatic market mapping update, by default 100 + """ self.auto_update = auto_update self.auto_update_limit = auto_update_limit self.count_down = auto_update_limit @@ -54,6 +74,26 @@ class HeteroMapTableOrganizer(EasyOrganizer): def add_learnware( self, zip_path: str, semantic_spec: dict, check_status: int, learnware_id: str = None ) -> Tuple[str, int]: + """Add a learnware into the heterogeneous learnware market. + Initiates an update of the market mapping if `auto_update` is True and the number of learnwares supporting training reaches `auto_update_limit`. + + Parameters + ---------- + zip_path : str + Filepath for learnware model, a zipped file. + semantic_spec : dict + semantic_spec for new learnware, in dictionary format. + check_status : int + A flag indicating whether the learnware is usable. + learnware_id : str, optional + A id in database for learnware + + Returns + ------- + Tuple[str, int] + - str indicating model_id + - int indicating the final learnware check_status + """ learnware_id, learnwere_status = super(HeteroMapTableOrganizer, self).add_learnware( zip_path, semantic_spec, check_status, learnware_id ) @@ -83,6 +123,20 @@ class HeteroMapTableOrganizer(EasyOrganizer): return learnware_id, learnwere_status def delete_learnware(self, id: str) -> bool: + """Delete learnware from heterogeneous learnware market. + If a corresponding HeteroMapTableSpecification exists, it is also removed. + + Parameters + ---------- + id : str + Learnware to be deleted + + Returns + ------- + bool + True for successful operation. + False for id not found. 
+ """ flag = super(HeteroMapTableOrganizer, self).delete_learnware(id) if flag: hetero_spec_path = os.path.join(self.hetero_specs_path, f"{id}.json") @@ -92,13 +146,40 @@ class HeteroMapTableOrganizer(EasyOrganizer): pass return flag - def update_learnware(self, id: str, zip_path: str = None, semantic_spec: dict = None, check_status: int = None): + def update_learnware(self, id: str, zip_path: str = None, semantic_spec: dict = None, check_status: int = None) -> bool: + """Update learnware with zip_path, semantic_specification and check_status. + If the learnware supports heterogeneous market training, its HeteroMapTableSpecification is also updated. + + Parameters + ---------- + id : str + Learnware id + zip_path : str, optional + Filepath for learnware model, a zipped file. + semantic_spec : dict, optional + semantic_spec for new learnware, in dictionary format. + check_status : int, optional + A flag indicating whether the learnware is usable. + + Returns + ------- + int + The final learnware check_status. + """ final_status = super(HeteroMapTableOrganizer, self).update_learnware(id, zip_path, semantic_spec, check_status) if final_status == BaseChecker.USABLE_LEARWARE and len(self._get_hetero_learnware_ids(id)): self._update_learnware_by_ids(id) return final_status def reload_learnware(self, learnware_id: str): + """Reload learnware into heterogeneous learnware market. + If a corresponding HeteroMapTableSpecification exists, it is also reloaded. + + Parameters + ---------- + learnware_id : str + Learnware to be reloaded + """ super(HeteroMapTableOrganizer, self).reload_learnware(learnware_id) try: hetero_spec_path = os.path.join(self.hetero_specs_path, f"{learnware_id}.json") @@ -110,6 +191,15 @@ class HeteroMapTableOrganizer(EasyOrganizer): logger.warning(f"Learnware {learnware_id} hetero spec loaded failed!") def _update_learnware_by_ids(self, ids: Union[str, List[str]]): + """Update learnware by ids, attempting to generate HeteroMapTableSpecification for them. 
+ + Parameters + ---------- + ids : Union[str, List[str]] + Give a id or a list of ids + str: id of target learware + List[str]: A list of ids of target learnwares + """ ids = self._get_hetero_learnware_ids(ids) for idx in ids: try: @@ -126,6 +216,20 @@ class HeteroMapTableOrganizer(EasyOrganizer): logger.warning(f"Learnware {idx} generate HeteroMapTableSpecification failed! Due to {err}") def _get_hetero_learnware_ids(self, ids: Union[str, List[str]]) -> List[str]: + """Get learnware ids that supports heterogeneous market training and search. + + Parameters + ---------- + ids : Union[str, List[str]] + Give a id or a list of ids + str: id of target learware + List[str]: A list of ids of target learnwares + + Returns + ------- + List[str] + Learnware ids + """ if isinstance(ids, str): ids = [ids] @@ -141,6 +245,18 @@ class HeteroMapTableOrganizer(EasyOrganizer): return ret def generate_hetero_map_spec(self, user_info: BaseUserInfo) -> HeteroMapTableSpecification: + """Generate HeteroMapTableSpecificaion based on user's input description and statistical information. + + Parameters + ---------- + user_info : BaseUserInfo + user_info contains semantic_spec and stat_info + + Returns + ------- + HeteroMapTableSpecification + The generated HeteroMapTableSpecification for user + """ user_stat_spec = user_info.stat_info["RKMETableSpecification"] user_features = user_info.get_semantic_spec()["Input"]["Description"] user_hetero_spec = self.market_mapping.hetero_mapping(user_stat_spec, user_features) @@ -148,7 +264,20 @@ class HeteroMapTableOrganizer(EasyOrganizer): @staticmethod def train(learnware_list: List[Learnware], save_dir: str, **kwargs) -> HeteroMap: - # Convert learnware to dataframe + """Build the market mapping model using learnwares that supports heterogeneous market training. 
+ + Parameters + ---------- + learnware_list : List[Learnware] + The learnware list to train the market mapping + save_dir : str + Filepath where the trained market mapping will be saved + + Returns + ------- + HeteroMap + The trained market mapping model + """ learnware_df_dict = defaultdict(list) for learnware in learnware_list: spec = learnware.get_specification() diff --git a/learnware/market/heterogeneous/organizer/hetero_map/__init__.py b/learnware/market/heterogeneous/organizer/hetero_map/__init__.py index 2f849a3..e16f46a 100644 --- a/learnware/market/heterogeneous/organizer/hetero_map/__init__.py +++ b/learnware/market/heterogeneous/organizer/hetero_map/__init__.py @@ -1,10 +1,11 @@ import os import numpy as np import pandas as pd -from typing import List, Optional +from typing import List, Optional, Union, Callable import torch import torch.nn.functional as F from torch import Tensor, nn +from loguru import logger from .....specification import HeteroMapTableSpecification, RKMETableSpecification from .feature_extractor import * @@ -28,22 +29,24 @@ class HeteroMap(nn.Module): def __init__( self, - feature_tokenizer=None, - hidden_dim=128, - num_layer=2, - num_attention_head=8, - hidden_dropout_prob=0, - ffn_dim=256, - projection_dim=128, - overlap_ratio=0.5, - num_partition=3, - temperature=10, - base_temperature=10, - activation="relu", - device="cuda:0", + feature_tokenizer: FeatureTokenizer = None, + hidden_dim: int = 128, + num_layer: int = 2, + num_attention_head: int = 8, + hidden_dropout_prob: float = 0, + ffn_dim: int = 256, + projection_dim: int = 128, + overlap_ratio: float = 0.5, + num_partition: int = 3, + temperature: int = 10, + base_temperature: int = 10, + activation: Union[str, Callable] = "relu", + device: Union[str, torch.device] = "cuda:0", **kwargs, ): """ + The initialization method for hetero map. 
+ Parameters ---------- feature_tokenizer : FeatureTokenizer, optional @@ -68,12 +71,12 @@ class HeteroMap(nn.Module): Temperature parameter for contrastive learnin, by default 10 base_temperature : int, optional Base temperature paramete, by default 10 - activation : str, optional + activation : Union[str, Callable], optional Activation function for transformer layer, by default "relu" - device : str, optional + device : Union[str, torch.device], optional Device to run the model on, by default "cuda:0" - cache_dir : str, optional - The cache directory, by default None + kwargs: + Additional arguments to be passed to the feature tokenizer """ super(HeteroMap, self).__init__() @@ -126,9 +129,8 @@ class HeteroMap(nn.Module): self.to(device) @staticmethod - def load(checkpoint=None): - """Load the model state_dict and feature_tokenizer configuration - from the ``checkpoint``. + def load(checkpoint: str = None): + """Load the model state_dict and architecture configuration from the specified checkpoint. Parameters ---------- @@ -141,9 +143,8 @@ class HeteroMap(nn.Module): model.load_state_dict(model_info["model_state_dict"], strict=False) return model - def save(self, checkpoint): - """Save the model state_dict and feature_tokenizer configuration - to the ``checkpoint``. + def save(self, checkpoint: str): + """Save the model state_dict and architecture configuration to the specified checkpoint. Parameters ---------- @@ -154,8 +155,19 @@ class HeteroMap(nn.Module): model_info = {"model_state_dict": self.state_dict(), "model_args": self.model_args} torch.save(model_info, checkpoint) - def forward(self, x, y=None): - # do positive sampling + def forward(self, x: dict): + """Processes the input data 'x', performs positive sampling, and computes contrastive loss. 
+ + Parameters + ---------- + x : dict + Pre-tokenized input tabular data in the form of a dictionary + + Returns + ------- + torch.Tensor + The self-supervised VPCL loss + """ feat_x_list = [] if isinstance(x, dict): # pretokenized inputs @@ -175,6 +187,20 @@ class HeteroMap(nn.Module): return loss def hetero_mapping(self, rkme_spec: RKMETableSpecification, features: dict) -> HeteroMapTableSpecification: + """Generate HeteroMapTableSpecification from given tabular data's statistical specification and descriptions of features. + + Parameters + ---------- + rkme_spec : RKMETableSpecification + The RKME specification from the tabular data + features : dict + A dictionary mapping each feature's numerical identifier to its semantic description. + + Returns + ------- + HeteroMapTableSpecification + The resulting HeteroMapTableSpecification + """ hetero_spec = HeteroMapTableSpecification() data = rkme_spec.get_z() cols = [features.get(str(i), "") for i in range(data.shape[1])] @@ -183,7 +209,22 @@ class HeteroMap(nn.Module): hetero_spec.generate_stat_spec_from_system(hetero_embedding, rkme_spec) return hetero_spec - def _build_positive_pairs(self, x, n): + def _build_positive_pairs(self, x: pd.DataFrame, n: int): + """ + Builds positive pairs by splitting the input DataFrame into 'n' parts with some overlap. + + Parameters + ---------- + x : pd.DataFrame + The input DataFrame to be split. + n : int + The number of partitions to divide the DataFrame into. + + Returns + ------- + List[pd.DataFrame] + A list of DataFrames, each representing a partition of the input DataFrame with some overlap. + """ x_cols = x.columns.tolist() sub_col_list = np.array_split(np.array(x_cols), n) len_cols = len(sub_col_list[0]) @@ -198,24 +239,21 @@ class HeteroMap(nn.Module): sub_x_list.append(sub_x) return sub_x_list - def _extract_features(self, x, cols=None): - """Make forward pass given the input feature ``x``. 
+ def _extract_features(self, x: Union[dict, pd.DataFrame], cols=None): + """Performs a forward pass with the given input feature `x`, and extracts features. Parameters ---------- - x: pd.DataFrame or dict - pd.DataFrame: a batch of raw tabular samples; dict: the output of feature_tokenizer + x: Union[dict, pd.DataFrame] + pd.DataFrame: A batch of raw tabular samples + dict: The output of feature_tokenizer Returns ------- output_features: numpy.ndarray - the [CLS] embedding at the end of transformer encoder. + The [CLS] embedding at the end of transformer encoder """ - if isinstance(x, dict): - # input is the pre-tokenized encoded inputs - inputs = x - elif isinstance(x, pd.DataFrame): - # input is dataframe + if isinstance(x, pd.DataFrame): inputs = self.feature_tokenizer(x) elif isinstance(x, torch.Tensor): inputs = self.feature_tokenizer.forward(cols, x) @@ -231,7 +269,21 @@ class HeteroMap(nn.Module): return output_features - def _extract_batch_features(self, x_test, eval_batch_size=256) -> np.ndarray: + def _extract_batch_features(self, x_test: pd.DataFrame, eval_batch_size=256) -> np.ndarray: + """Performs forward passes on a batch of input features `x_test`, extracting and returning features as an array. + + Parameters + ---------- + x_test : pd.DataFrame + A batch of raw tabular samples + eval_batch_size : int, optional + The size of each batch for processing, by default 256 + + Returns + ------- + np.ndarray + An array containing the extracted features from all batches + """ self.eval() output_feas_list = [] for i in range(0, len(x_test), eval_batch_size): @@ -243,18 +295,19 @@ class HeteroMap(nn.Module): all_output_features = np.concatenate(output_feas_list, 0) return all_output_features - def _self_supervised_contrastive_loss(self, features): - """Compute the self-supervised VPCL loss. + def _self_supervised_contrastive_loss(self, features: torch.Tensor): + """ + Compute the self-supervised VPCL loss. 
Parameters ---------- - features: torch.Tensor - the encoded features of multiple partitions of input tables, with shape ``(bs, n_partition, proj_dim)``. + features : torch.Tensor + The encoded features of multiple partitions of input tables, with shape (bs, n_partition, proj_dim). Returns ------- - loss: torch.Tensor - the computed self-supervised VPCL loss. + torch.Tensor + The computed self-supervised VPCL loss. """ batch_size = features.shape[0] labels = torch.arange(batch_size, dtype=torch.long, device=self.device).view(-1, 1) @@ -286,35 +339,53 @@ class HeteroMap(nn.Module): return loss -def _get_activation_fn(activation): - if activation == "relu": - return F.relu - elif activation == "gelu": - return F.gelu - elif activation == "selu": - return F.selu - elif activation == "leakyrelu": - return F.leaky_relu - raise RuntimeError("activation should be relu/gelu/selu/leakyrelu, not {}".format(activation)) - - class TransformerLayer(nn.Module): + """A custom Transformer layer implemented as a PyTorch module. + """ __config__ = ["batch_first", "norm_first"] def __init__( self, - d_model, - nhead, - dim_feedforward=2048, - dropout=0.1, - activation=F.relu, - layer_norm_eps=1e-5, - batch_first=True, - norm_first=False, - device=None, - dtype=None, - use_layer_norm=True, + d_model: int, + nhead: int, + dim_feedforward: int = 2048, + dropout: float = 0.1, + activation: Union[str, Callable] = F.relu, + layer_norm_eps: float = 1e-5, + batch_first: bool = True, + norm_first: bool = False, + device: Union[str, torch.device] = None, + dtype: torch.dtype = None, + use_layer_norm: bool = True, ): + """ + The initialization method for transformer layer. 
+ + Parameters + ---------- + d_model : int + The number of expected features in the input + nhead : int + The number of heads in the multiheadattention models + dim_feedforward : int, optional + The dimension of the feedforward network model, by default 2048 + dropout : float, optional + The dropout value, by default 0.1 + activation : Union[str, Callable], optional + The activation function to use, by default F.relu + layer_norm_eps : float, optional + The epsilon used for layer normalization, by default 1e-5 + batch_first : bool, optional + Whether to use (batch, seq, feature) format for input and output tensors, by default True + norm_first : bool, optional + Whether to perform layer normalization before attention and feedforward operations, by default False + device : Union[str, torch.device], optional + The device on which the layer is to be run, by default None + dtype : torch.dtype, optional + The data type of the layer's parameters, by default None + use_layer_norm : bool, optional + Whether to use layer normalization, by default True + """ factory_kwargs = {"device": device, "dtype": dtype} super().__init__() self.self_attn = nn.MultiheadAttention(d_model, nhead, batch_first=batch_first, **factory_kwargs) @@ -338,12 +409,29 @@ class TransformerLayer(nn.Module): # Legacy string support for activation function. if isinstance(activation, str): - self.activation = _get_activation_fn(activation) + self.activation = self._get_activation_fn(activation) else: self.activation = activation # self-attention block - def _sa_block(self, x: Tensor, attn_mask: Optional[Tensor], key_padding_mask: Optional[Tensor]) -> Tensor: + def _sa_block(self, x: torch.Tensor, attn_mask: torch.Tensor, key_padding_mask: torch.Tensor) -> torch.Tensor: + """ + Applies a self-attention block to the input tensor. + + Parameters + ---------- + x : torch.Tensor + The input tensor for the self-attention block. + attn_mask : torch.Tensor + The attention mask for the self-attention operation. 
+ key_padding_mask : torch.Tensor + The key padding mask for the self-attention operation. + + Returns + ------- + torch.Tensor + The output tensor after applying the self-attention block. + """ key_padding_mask = ~key_padding_mask.bool() x = self.self_attn( x, @@ -355,7 +443,20 @@ class TransformerLayer(nn.Module): return self.dropout1(x) # feed forward block - def _ff_block(self, x: Tensor) -> Tensor: + def _ff_block(self, x: torch.Tensor) -> torch.Tensor: + """ + Applies a feed-forward block to the input tensor. + + Parameters + ---------- + x : torch.Tensor + The input tensor for the feed-forward block. + + Returns + ------- + torch.Tensor + The output tensor after applying the feed-forward block. + """ g = self.gate_act(self.gate_linear(x)) h = self.linear1(x) h = h * g # add gate @@ -367,19 +468,56 @@ class TransformerLayer(nn.Module): state["activation"] = F.relu super().__setstate__(state) - def forward(self, src, src_mask=None, src_key_padding_mask=None, is_causal=None, **kwargs) -> Tensor: + @staticmethod + def _get_activation_fn(activation: str) -> Callable: + """ + Retrieves the activation function based on the provided activation name. + + Parameters + ---------- + activation : str + Name of the activation function. Supported values are "relu", "gelu", "selu", and "leakyrelu". + + Returns + ------- + Callable + The corresponding activation function from torch.nn.functional. + """ + if activation == "relu": + return F.relu + elif activation == "gelu": + return F.gelu + elif activation == "selu": + return F.selu + elif activation == "leakyrelu": + return F.leaky_relu + raise RuntimeError("activation should be relu/gelu/selu/leakyrelu, not {}".format(activation)) + + def forward(self, + src: torch.Tensor, + src_mask: torch.Tensor = None, + src_key_padding_mask: torch.Tensor = None, + is_causal: torch.Tensor = None, + **kwargs + ) -> torch.Tensor: """Pass the input through the encoder layer. 
Parameters ---------- - src : Any + src : torch.Tensor The sequence to the encoder layer. - src_mask : Any, optional + src_mask : torch.Tensor, optional The mask for the src sequence, by default None - src_key_padding_mask : Any, optional + src_key_padding_mask : torch.Tensor, optional The mask for the src keys per batch, by default None + is_causal : torch.Tensor, optional + A flag indicating whether the layer should be causal, by default None + + Returns + ------- + torch.Tensor + The output tensor after passing through the encoder layer. """ - # see Fig. 1 of https://arxiv.org/pdf/2002.04745v1.pdf x = src if self.use_layer_norm: @@ -397,15 +535,35 @@ class TransformerLayer(nn.Module): class TransformerMultiLayer(nn.Module): + """A custom multi-layer Transformer module. + """ def __init__( self, - hidden_dim=128, - num_layer=2, - num_attention_head=2, - hidden_dropout_prob=0, - ffn_dim=256, - activation="relu", + hidden_dim: int = 128, + num_layer: int = 2, + num_attention_head: int = 2, + hidden_dropout_prob: float = 0, + ffn_dim: int = 256, + activation: Union[str, Callable] = "relu", ): + """ + The initialization method for align transformer multilayer. + + Parameters + ---------- + hidden_dim : int, optional + Dimension of the hidden layer in the Transformer, by default 128. + num_layer : int, optional + Number of Transformer layers, by default 2. + num_attention_head : int, optional + Number of attention heads in each Transformer layer, by default 2. + hidden_dropout_prob : float, optional + Dropout probability for the hidden layers, by default 0. + ffn_dim : int, optional + Dimension of the feedforward network model, by default 256. + activation : Union[str, Callable], optional + The activation function to be used, by default "relu". 
+ """ super().__init__() self.transformer_encoder = nn.ModuleList( [ @@ -437,14 +595,23 @@ class TransformerMultiLayer(nn.Module): stacked_transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layer - 1) self.transformer_encoder.append(stacked_transformer) - def forward(self, embedding, attention_mask=None, **kwargs) -> Tensor: + def forward(self, embedding: torch.Tensor, attention_mask: torch.Tensor = None, **kwargs) -> torch.Tensor: """ + Passes the input embedding through the Transformer encoder layers. + Parameters ---------- - embedding : Any - bs, num_token, hidden_dim + embedding : torch.Tensor + The input embedding tensor with shape (batch size, number of tokens, hidden dimension). + attention_mask : torch.Tensor, optional + The attention mask for the input tensor, by default None. + + Returns + ------- + Tensor + The output tensor after processing through Transformer encoder layers. """ outputs = embedding for i, mod in enumerate(self.transformer_encoder): outputs = mod(outputs, src_key_padding_mask=attention_mask) - return outputs + return outputs \ No newline at end of file diff --git a/learnware/market/heterogeneous/organizer/hetero_map/feature_extractor.py b/learnware/market/heterogeneous/organizer/hetero_map/feature_extractor.py index ef27344..d424a72 100644 --- a/learnware/market/heterogeneous/organizer/hetero_map/feature_extractor.py +++ b/learnware/market/heterogeneous/organizer/hetero_map/feature_extractor.py @@ -1,6 +1,6 @@ import os import math -from typing import Dict +from typing import Dict, Callable import numpy as np import torch @@ -12,23 +12,53 @@ from .....config import C as conf class WordEmbedding(nn.Module): - """Encode tokens drawn from column names""" + """Encodes tokens drawn from column names into word embeddings. 
+ """ def __init__( self, - vocab_size, - hidden_dim, - padding_idx=0, - hidden_dropout_prob=0, - layer_norm_eps=1e-5, + vocab_size: int, + hidden_dim: int, + padding_idx: int = 0, + hidden_dropout_prob: float = 0, + layer_norm_eps: float = 1e-5, ): + """ + The initialization method for word embedding. + + Parameters + ---------- + vocab_size : int + The size of the vocabulary. + hidden_dim : int + The dimension of the hidden layer. + padding_idx : int, optional + The index of the padding token, by default 0. + hidden_dropout_prob : float, optional + The dropout probability for the hidden layer, by default 0. + layer_norm_eps : float, optional + The epsilon value for layer normalization, by default 1e-5. + """ super().__init__() self.word_embeddings = nn.Embedding(vocab_size, hidden_dim, padding_idx) nn_init.kaiming_normal_(self.word_embeddings.weight) self.norm = nn.LayerNorm(hidden_dim, eps=layer_norm_eps) self.dropout = nn.Dropout(hidden_dropout_prob) - def forward(self, input_ids) -> Tensor: + def forward(self, input_ids: torch.Tensor) -> torch.Tensor: + """ + Performs the forward pass of the WordEmbedding module. + + Parameters + ---------- + input_ids : torch.Tensor + The input token IDs. + + Returns + ------- + torch.Tensor + The word embeddings corresponding to the input token IDs. + """ embeddings = self.word_embeddings(input_ids) embeddings = self.norm(embeddings) embeddings = self.dropout(embeddings) @@ -38,20 +68,35 @@ class WordEmbedding(nn.Module): class NumEmbedding(nn.Module): """Encode tokens drawn from column names and the corresponding numerical features.""" - def __init__(self, hidden_dim): + def __init__(self, hidden_dim: int): + """ + The initialization method for num embedding. + + Parameters + ---------- + hidden_dim : int + The dimension of the hidden layer. 
+ """ super().__init__() self.norm = nn.LayerNorm(hidden_dim) self.num_bias = nn.Parameter(Tensor(1, 1, hidden_dim)) # add bias nn_init.uniform_(self.num_bias, a=-1 / math.sqrt(hidden_dim), b=1 / math.sqrt(hidden_dim)) - def forward(self, col_emb, x_ts) -> Tensor: + def forward(self, col_emb: torch.Tensor, x_ts: torch.Tensor) -> torch.Tensor: """ + Performs the forward pass of the NumEmbedding module. + Parameters ---------- - col_emb : Any - numerical column embedding, (# numerical columns, emb_dim) - x_ts : Any - numerical features, (bs, emb_dim) + col_emb : torch.Tensor + The numerical column embeddings with shape (# numerical columns, emb_dim). + x_ts : torch.Tensor + The numerical features with shape (bs, emb_dim). + + Returns + ------- + torch.Tensor + The combined feature embeddings. """ col_emb = col_emb.unsqueeze(0).expand((x_ts.shape[0], -1, -1)) feat_emb = col_emb * x_ts.unsqueeze(-1).float() + self.num_bias @@ -63,14 +108,16 @@ class FeatureTokenizer: def __init__( self, - disable_tokenizer_parallel=True, + disable_tokenizer_parallel: bool = True, **kwargs, ): """ + The initialization method for feature tokenizer. + . Parameters ---------- disable_tokenizer_parallel : bool, optional - true if use extractor for collator function in torch.DataLoader + Whether to disable tokenizer parallelism, by default True. """ cache_dir = conf["cache_path"] os.makedirs(cache_dir, exist_ok=True) @@ -81,22 +128,23 @@ class FeatureTokenizer: self.vocab_size = self.tokenizer.vocab_size self.pad_token_id = self.tokenizer.pad_token_id - def __call__(self, x, shuffle=False, keep_input_grad=False) -> Dict: + def __call__(self, x: pd.DataFrame, shuffle: bool = False, keep_input_grad: bool = False) -> Dict: """ + Tokenizes the input DataFrame. + Parameters ---------- - x: pd.DataFrame - with column names and features. - - shuffle: bool - if shuffle column order during the training. + x : pd.DataFrame + The input DataFrame with column names and features. 
+ shuffle : bool, optional + Whether to shuffle column order during training, by default False. + keep_input_grad : bool, optional + Whether to keep input gradients, by default False. Returns ------- - encoded_inputs: a dict with { - 'x_num': tensor contains numerical features, - 'num_col_input_ids': tensor contains numerical column tokenized ids, - } + Dict + A dictionary with tokenized inputs. """ encoded_inputs = {"x_num": None, "num_col_input_ids": None} num_cols = x.columns.tolist() if not shuffle else np.random.shuffle(x.columns.tolist()) @@ -120,21 +168,24 @@ class FeatureTokenizer: return encoded_inputs - def forward(self, cols, x) -> Dict: + def forward(self, cols: List[str], x: torch.Tensor) -> Dict: """ + Processes the input data and generates encoded inputs suitable for model encoding. + Parameters ---------- cols: List[str] - Contain all column names in order. + A list containing all column names in order. x: torch.Tensor + The tensor containing numerical features. Returns ------- - encoded_inputs: a dict with { - 'x_num': tensor contains numerical features, - 'num_col_input_ids': tensor contains numerical column tokenized ids, - } + Dict + - 'x_num': Tensor containing numerical features. + - 'num_col_input_ids': Tensor containing tokenized IDs of numerical columns. + - 'num_att_mask': Attention mask for the numerical column tokens. """ encoded_inputs = { "x_num": None, @@ -156,18 +207,32 @@ class FeatureTokenizer: class FeatureProcessor(nn.Module): - """ - Process inputs from feature extractor to map them to embeddings. - """ + """Process inputs from feature extractor to map them to embeddings.""" def __init__( self, - vocab_size=None, - hidden_dim=128, - hidden_dropout_prob=0, - pad_token_id=0, - device="cuda:0", + vocab_size: int = None, + hidden_dim: int = 128, + hidden_dropout_prob: float = 0, + pad_token_id: int = 0, + device: Union[str, torch.device] = "cuda:0", ): + """ + The initialization method for feature processor. 
+ + Parameters + ---------- + vocab_size : int, optional + The size of the vocabulary. + hidden_dim : int, optional + The dimension of the hidden layer, by default 128. + hidden_dropout_prob : float, optional + The dropout probability for the hidden layer, by default 0. + pad_token_id : int, optional + The index of the padding token, by default 0. + device : Union[str, torch.device], optional + The device to run the module on, by default "cuda:0". + """ super().__init__() self.word_embedding = WordEmbedding( vocab_size=vocab_size, @@ -179,7 +244,22 @@ class FeatureProcessor(nn.Module): self.align_layer = nn.Linear(hidden_dim, hidden_dim, bias=False) self.device = device - def _avg_embedding_by_mask(self, embs, att_mask=None): + def _avg_embedding_by_mask(self, embs: torch.Tensor, att_mask: torch.Tensor = None) -> torch.Tensor: + """ + Averages the embeddings based on the attention mask. + + Parameters + ---------- + embs : torch.Tensor + The embeddings tensor. + att_mask : torch.Tensor, optional + The attention mask to apply on the embeddings. If None, the mean of the embeddings is returned, by default None. + + Returns + ------- + torch.Tensor + The resulting averaged embeddings. + """ if att_mask is None: return embs.mean(1) else: @@ -189,11 +269,28 @@ class FeatureProcessor(nn.Module): def forward( self, - x_num=None, - num_col_input_ids=None, - num_att_mask=None, + x_num: torch.Tensor = None, + num_col_input_ids: torch.Tensor = None, + num_att_mask: torch.Tensor = None, **kwargs, - ) -> Tensor: + ) -> torch.Tensor: + """ + Performs the forward pass of the FeatureProcessor module. + + Parameters + ---------- + x_num : torch.Tensor, optional + The numerical features. + num_col_input_ids : torch.Tensor, optional + The input IDs for numerical columns. + num_att_mask : torch.Tensor, optional + The attention mask. + + Returns + ------- + torch.Tensor + The processed feature embeddings. 
+ """ x_num = x_num.to(self.device) num_col_emb = self.word_embedding(num_col_input_ids.to(self.device)) @@ -209,22 +306,58 @@ class FeatureProcessor(nn.Module): class CLSToken(nn.Module): - """add a learnable cls token embedding at the end of each sequence.""" + """Add a learnable cls token embedding at the end of each sequence.""" - def __init__(self, hidden_dim): + def __init__(self, hidden_dim: int): + """ + The initialization method for CLSToken. + + Parameters + ---------- + hidden_dim : int + The dimension of the hidden layer. + """ super().__init__() self.weight = nn.Parameter(Tensor(hidden_dim)) nn_init.uniform_(self.weight, a=-1 / math.sqrt(hidden_dim), b=1 / math.sqrt(hidden_dim)) self.hidden_dim = hidden_dim - def expand(self, *leading_dimensions): + def expand(self, *leading_dimensions) -> torch.Tensor: + """ + Expands the CLS token embedding to match the leading dimensions of the input. + + Parameters + ---------- + leading_dimensions : tuple + A variable number of integer arguments representing the leading dimensions to which the CLS token embedding will be expanded. + + Returns + ------- + torch.Tensor + Expanded CLS token embedding. + """ new_dims = (1,) * (len(leading_dimensions) - 1) # cls token (128,) -> view(*new_dims, -1) -> (1, 128) # (1, 128) -> expand(*leading_dimensions, -1) -> (64, 1, 128) # here expand means "shared", the cls token embedding remains the same for each sample return self.weight.view(*new_dims, -1).expand(*leading_dimensions, -1) - def forward(self, embedding, attention_mask=None, **kwargs) -> Tensor: + def forward(self, embedding: torch.Tensor, attention_mask: torch.Tensor = None, **kwargs) -> torch.Tensor: + """ + Performs a forward pass by adding a learnable CLS token to the embedding. + + Parameters + ---------- + embedding : torch.Tensor + The input embedding tensor. + attention_mask : torch.Tensor, optional + The attention mask for the input tensor, by default None. 
+ + Returns + ------- + torch.Tensor + Output embedding with the CLS token added. + """ # embedding shape: (64, 11, 128), where 11 is the largest sequence length after tokenizing # after concat, learnable cls token [self.weight] is added to each semantic embedding # embedding shape: (64, d+1, 128) diff --git a/learnware/market/heterogeneous/organizer/hetero_map/trainer.py b/learnware/market/heterogeneous/organizer/hetero_map/trainer.py index c4a85e6..3194b71 100644 --- a/learnware/market/heterogeneous/organizer/hetero_map/trainer.py +++ b/learnware/market/heterogeneous/organizer/hetero_map/trainer.py @@ -1,7 +1,8 @@ -import json +import json_get_parameter_names import math import os import time +from typings import Any, Callable, List, Dict import numpy as np import pandas as pd @@ -19,17 +20,44 @@ logger = get_module_logger("hetero_mapping_trainer") class Trainer: def __init__( self, - model, - train_set_list, - collate_fn=None, - output_dir="./ckpt", - num_epoch=10, - batch_size=64, - lr=1e-4, - weight_decay=0, - eval_batch_size=256, + model: Any, + train_set_list: List[Any], + collate_fn: Callable = None, + output_dir: str = "./ckpt", + num_epoch: int = 10, + batch_size: int = 64, + lr: float = 1e-4, + weight_decay: float = 0, + eval_batch_size: int = 256, **kwargs, ): + """ + The initialization method for trainer. + + Parameters + ---------- + model : Any + The model to be trained. + train_set_list : List[Any] + A list of training datasets. + collate_fn : Callable, optional + The collate function to be used, by default None. + output_dir : str, optional + The directory where the trained model checkpoints will be saved, by default "./ckpt". + num_epoch : int, optional + Number of epochs for training, by default 10. + batch_size : int, optional + Batch size for training, by default 64. + lr : float, optional + Learning rate, by default 1e-4. + weight_decay : float, optional + Weight decay, by default 0. 
+ eval_batch_size : int, optional + Batch size for evaluation, by default 256. + kwargs : dict + Additional keyword arguments. + """ + self.model = model if isinstance(train_set_list, tuple): train_set_list = [train_set_list] @@ -52,7 +80,21 @@ class Trainer: self.args["steps_per_epoch"] = int(self.args["num_training_steps"] / (num_epoch * len(self.train_set_list))) self.optimizer = None - def train(self, verbose: bool = True): + def train(self, verbose: bool = True) -> float: + """ + Trains the model using the provided training data. + + Parameters + ---------- + verbose : bool, optional + Whether to display verbose output, by default True. + + Returns + ------- + float + The final training loss. + """ + self._create_optimizer() start_time = time.time() final_train_loss = 0 @@ -82,11 +124,22 @@ class Trainer: logger.info("training complete, cost {:.1f} secs.".format(time.time() - start_time)) return final_train_loss - def save_model(self, output_dir=None): + def save_model(self, output_dir: str = None): + """ + Saves the trained model to the specified directory. + + Parameters + ---------- + output_dir : str, optional + The directory where the model will be saved, by default None. + """ + logger.info(f"saving model checkpoint to {output_dir}") self.model.save(output_dir) def _create_optimizer(self): + """Creates an optimizer for training the model.""" + if self.optimizer is None: decay_parameters = self._get_parameter_names(self.model, [nn.LayerNorm]) decay_parameters = [name for name in decay_parameters if "bias" not in name] @@ -104,7 +157,25 @@ class Trainer: self.optimizer = torch.optim.Adam(optimizer_grouped_parameters, lr=self.args["lr"]) - def _get_num_train_steps(self, train_set_list, num_epoch, batch_size): + def _get_num_train_steps(self, train_set_list: List[Any], num_epoch: int, batch_size: int) -> int: + """ + Calculates the total number of training steps. + + Parameters + ---------- + train_set_list : List[Any] + A list of training datasets. 
+ num_epoch : int + Number of training epochs. + batch_size : int + Batch size for training. + + Returns + ------- + int + The total number of training steps. + """ + total_step = 0 for trainset in train_set_list: x_train = trainset @@ -112,7 +183,29 @@ class Trainer: total_step *= num_epoch return total_step - def _build_dataloader(self, trainset, batch_size, collator, shuffle=True): + def _build_dataloader( + self, trainset: Any, batch_size: int, collator: Callable, shuffle: bool = True + ) -> torch.utils.data.DataLoader: + """ + Builds a DataLoader for training. + + Parameters + ---------- + trainset : Any + The training dataset. + batch_size : int + Batch size for the DataLoader. + collator : Callable + Collate function for the DataLoader. + shuffle : bool, optional + Whether to shuffle the data, by default True. + + Returns + ------- + torch.utils.data.DataLoader + The DataLoader for the training data. + """ + trainloader = DataLoader( TrainDataset(trainset), collate_fn=collator, @@ -123,8 +216,22 @@ class Trainer: ) return trainloader - def _get_parameter_names(self, model, forbidden_layer_types): - """Returns the names of the model parameters that are not inside a forbidden layer.""" + def _get_parameter_names(self, model: Any, forbidden_layer_types: List[torch.dtype]) -> List[str]: + """ + Retrieves the names of parameters not inside forbidden layers. + + Parameters + ---------- + model : Any + The model from which to retrieve parameters. + forbidden_layer_types : List[torch.dtype] + A list of layer types to exclude. + + Returns + ------- + List[str] + A list of parameter names not inside the forbidden layers. 
+ """ result = [] for name, child in model.named_children(): result += [ @@ -150,15 +257,27 @@ class TrainDataset(Dataset): class TransTabCollatorForCL: - """support positive pair sampling for contrastive learning.""" + """Collator class supporting positive pair sampling for contrastive learning.""" def __init__( self, - feature_tokenizer=None, - overlap_ratio=0.5, - num_partition=3, + feature_tokenizer: Callable = None, + overlap_ratio: float = 0.5, + num_partition: int = 3, **kwargs, ): + """ + The initialization method for TransTabCollatorForCL. + + Parameters + ---------- + feature_tokenizer : Callable, optional + The tokenizer used to process data, by default None. + overlap_ratio : float, optional + The ratio of overlap between partitions, must be between 0 and 1 (exclusive), by default 0.5. + num_partition : int, optional + The number of partitions to create from the data for contrastive learning, by default 3. + """ self.feature_tokenizer = feature_tokenizer or FeatureTokenizer(disable_tokenizer_parallel=True) assert num_partition > 0, f"number of contrastive subsets must be greater than 0, got {num_partition}" assert isinstance(num_partition, int), f"number of constrative subsets must be int, got {type(num_partition)}" @@ -166,10 +285,20 @@ class TransTabCollatorForCL: self.overlap_ratio = overlap_ratio self.num_partition = num_partition - def __call__(self, data): - """Take a list of subsets (views) from the original tests.""" - # 1. build positive pairs - # 2. encode each pair using feature extractor + def __call__(self, data: List[Any]) -> Dict[str, Any]: + """ + Processes the data into subsets for contrastive learning. + + Parameters + ---------- + data : List[Any] + The input data to be processed. + + Returns + ------- + Dict[str, Any] + A dictionary containing the processed data subsets. 
+ """ df_x = pd.concat([row for row in data]) if self.num_partition > 1: sub_x_list = self._build_positive_pairs(df_x, self.num_partition) @@ -182,20 +311,21 @@ class TransTabCollatorForCL: res = {"input_sub_x": input_x_list} return res - def _build_positive_pairs(self, x, n): - """Builds positive pairs of sub-dataframes from the input dataframe x. + def _build_positive_pairs(self, x: pd.DataFrame, n: int) -> List[pd.DataFrame]: + """ + Builds positive pairs of sub-dataframes from the input dataframe. Parameters ---------- - x : pandas.DataFrame - Input dataframe. + x : pd.DataFrame + The input dataframe. n : int - Number of sub-dataframes to split x into. + The number of sub-dataframes to create. Returns ------- - List - List of sub-dataframes, each containing a positive pair of columns from x. + List[pd.DataFrame] + A list of sub-dataframes, each containing a positive pair of columns. """ x_cols = x.columns.tolist() sub_col_list = np.array_split(np.array(x_cols), n) @@ -211,18 +341,19 @@ class TransTabCollatorForCL: sub_x_list.append(sub_x) return sub_x_list - def _build_positive_pairs_single_view(self, x): - """Builds positive pairs for a single view of data by corrupting half of the columns and shuffling the corrupted columns. + def _build_positive_pairs_single_view(self, x: pd.DataFrame) -> List[pd.DataFrame]: + """ + Builds positive pairs for a single view of data by corrupting half of the columns and shuffling the corrupted columns.. Parameters ---------- - x : pandas.DataFrame - The input data. + x : pd.DataFrame + The input dataframe. Returns ------- - List - A list of two pandas DataFrames, where each DataFrame contains the original data with half of the columns corrupted and shuffled. + List[pd.DataFrame] + A list containing two dataframes, one with original data and one with shuffled columns. 
""" x_cols = x.columns.tolist() sub_x_list = [x] @@ -231,4 +362,4 @@ class TransTabCollatorForCL: x_corrupt = x.copy()[corrupt_cols] np.random.shuffle(x_corrupt.values) sub_x_list.append(pd.concat([x.copy().drop(corrupt_cols, axis=1), x_corrupt], axis=1)) - return sub_x_list + return sub_x_list \ No newline at end of file diff --git a/learnware/market/heterogeneous/searcher.py b/learnware/market/heterogeneous/searcher.py index 3605609..f8ef680 100644 --- a/learnware/market/heterogeneous/searcher.py +++ b/learnware/market/heterogeneous/searcher.py @@ -11,7 +11,19 @@ logger = get_module_logger("hetero_searcher") class HeteroSearcher(EasySearcher): @staticmethod - def check_user_info(user_info: BaseUserInfo): + def check_user_info(user_info: BaseUserInfo) -> bool: + """Check if user_info satifies all the criteria required for enabling heterogeneous learnware search + + Parameters + ---------- + user_info : BaseUserInfo + user_info contains semantic_spec and stat_info + + Returns + ------- + bool + A flag indicating whether heterogeneous search is enabled for user_info + """ try: user_stat_spec = user_info.get_stat_info("RKMETableSpecification") user_input_shape = user_stat_spec.get_z().shape[1] @@ -42,6 +54,27 @@ class HeteroSearcher(EasySearcher): def __call__( self, user_info: BaseUserInfo, check_status: int = None, max_search_num: int = 5, search_method: str = "greedy" ) -> Tuple[List[float], List[Learnware], float, List[Learnware]]: + """Search learnwares based on user_info from learnwares with check_status. + Employs heterogeneous learnware search if specific requirements are met, otherwise resorts to homogeneous search methods. 
+ + Parameters + ---------- + user_info : BaseUserInfo + user_info contains semantic_spec and stat_info + max_search_num : int + The maximum number of the returned learnwares + check_status : int, optional + - None: search from all learnwares + - Others: search from learnwares with check_status + + Returns + ------- + Tuple[List[float], List[Learnware], float, List[Learnware]] + the first is the sorted list of rkme dist + the second is the sorted list of Learnware (single) by the rkme dist + the third is the score of Learnware (mixture) + the fourth is the list of Learnware (mixture), the size is search_num + """ learnware_list = self.learnware_organizer.get_learnwares(check_status=check_status) learnware_list = self.semantic_searcher(learnware_list, user_info) diff --git a/learnware/reuse/job_selector.py b/learnware/reuse/job_selector.py index 8077106..91ad512 100644 --- a/learnware/reuse/job_selector.py +++ b/learnware/reuse/job_selector.py @@ -166,7 +166,7 @@ class JobSelectorReuser(BaseReuser): def _calculate_rkme_spec_mixture_weight( self, user_data: np.ndarray, task_rkme_list: List[RKMETableSpecification], task_rkme_matrix: np.ndarray ) -> List[float]: - """_summary_ + """Calculate mixture weight for the learnware_list based on user's data Parameters ---------- From 60c79ab870710ad3debf7d6ad1a12863bca0f8db Mon Sep 17 00:00:00 2001 From: liuht Date: Tue, 14 Nov 2023 22:44:47 +0800 Subject: [PATCH 78/90] [FIX | MNT] fix import, format code by black --- .../heterogeneous/organizer/__init__.py | 11 +++-- .../organizer/hetero_map/__init__.py | 40 ++++++++++--------- .../organizer/hetero_map/feature_extractor.py | 8 ++-- .../organizer/hetero_map/trainer.py | 9 ++--- learnware/market/heterogeneous/searcher.py | 2 +- 5 files changed, 37 insertions(+), 33 deletions(-) diff --git a/learnware/market/heterogeneous/organizer/__init__.py b/learnware/market/heterogeneous/organizer/__init__.py index 89d6386..35bde1e 100644 --- 
a/learnware/market/heterogeneous/organizer/__init__.py +++ b/learnware/market/heterogeneous/organizer/__init__.py @@ -1,11 +1,12 @@ import os -import pandas as pd from collections import defaultdict from typing import List, Tuple, Union +import pandas as pd + from ....learnware import Learnware from ....logger import get_module_logger -from ....specification import RKMETableSpecification, HeteroMapTableSpecification +from ....specification import HeteroMapTableSpecification, RKMETableSpecification from ...base import BaseChecker, BaseUserInfo from ...easy import EasyOrganizer from .hetero_map import HeteroMap, Trainer @@ -76,7 +77,7 @@ class HeteroMapTableOrganizer(EasyOrganizer): def add_learnware( self, zip_path: str, semantic_spec: dict, check_status: int, learnware_id: str = None ) -> Tuple[str, int]: - """Add a learnware into the heterogeneous learnware market. + """Add a learnware into the heterogeneous learnware market. Initiates an update of the market mapping if `auto_update` is True and the number of learnwares supporting training reaches `auto_update_limit`. Parameters @@ -148,7 +149,9 @@ class HeteroMapTableOrganizer(EasyOrganizer): pass return flag - def update_learnware(self, id: str, zip_path: str = None, semantic_spec: dict = None, check_status: int = None) -> bool: + def update_learnware( + self, id: str, zip_path: str = None, semantic_spec: dict = None, check_status: int = None + ) -> bool: """Update learnware with zip_path, semantic_specification and check_status. If the learnware supports heterogeneous market training, its HeteroMapTableSpecification is also updated. 
diff --git a/learnware/market/heterogeneous/organizer/hetero_map/__init__.py b/learnware/market/heterogeneous/organizer/hetero_map/__init__.py index a2e5636..14aece3 100644 --- a/learnware/market/heterogeneous/organizer/hetero_map/__init__.py +++ b/learnware/market/heterogeneous/organizer/hetero_map/__init__.py @@ -1,13 +1,14 @@ +from typing import Callable, List, Optional, Union + import numpy as np import pandas as pd import torch import torch.nn.functional as F from torch import Tensor, nn -from typing import List, Optional from .....specification import HeteroMapTableSpecification, RKMETableSpecification -from .feature_extractor import FeatureTokenizer, FeatureProcessor, CLSToken -from .trainer import TransTabCollatorForCL, Trainer +from .feature_extractor import CLSToken, FeatureProcessor, FeatureTokenizer +from .trainer import Trainer, TransTabCollatorForCL class HeteroMap(nn.Module): @@ -39,12 +40,12 @@ class HeteroMap(nn.Module): temperature: int = 10, base_temperature: int = 10, activation: Union[str, Callable] = "relu", - device: Union[str, torch.device] = "cuda:0", + device: Union[str, torch.device] = "cuda:0", **kwargs, ): """ The initialization method for hetero map. - + Parameters ---------- feature_tokenizer : FeatureTokenizer, optional @@ -154,7 +155,7 @@ class HeteroMap(nn.Module): torch.save(model_info, checkpoint) def forward(self, x: dict): - """Processes the input data 'x', performs positive sampling, and computes contrastive loss. + """Processes the input data 'x', performs positive sampling, and computes contrastive loss. Parameters ---------- @@ -338,8 +339,8 @@ class HeteroMap(nn.Module): class TransformerLayer(nn.Module): - """A custom Transformer layer implemented as a PyTorch module. 
- """ + """A custom Transformer layer implemented as a PyTorch module.""" + __config__ = ["batch_first", "norm_first"] def __init__( @@ -491,12 +492,13 @@ class TransformerLayer(nn.Module): return F.leaky_relu raise RuntimeError("activation should be relu/gelu/selu/leakyrelu, not {}".format(activation)) - def forward(self, - src: torch.Tensor, - src_mask: torch.Tensor = None, - src_key_padding_mask: torch.Tensor = None, - is_causal: torch.Tensor = None, - **kwargs + def forward( + self, + src: torch.Tensor, + src_mask: torch.Tensor = None, + src_key_padding_mask: torch.Tensor = None, + is_causal: torch.Tensor = None, + **kwargs, ) -> torch.Tensor: """Pass the input through the encoder layer. @@ -510,7 +512,7 @@ class TransformerLayer(nn.Module): The mask for the src keys per batch, by default None is_causal : torch.Tensor, optional A flag indicating whether the layer should be causal, by default None - + Returns ------- torch.Tensor @@ -533,8 +535,8 @@ class TransformerLayer(nn.Module): class TransformerMultiLayer(nn.Module): - """A custom multi-layer Transformer module. - """ + """A custom multi-layer Transformer module.""" + def __init__( self, hidden_dim: int = 128, @@ -546,7 +548,7 @@ class TransformerMultiLayer(nn.Module): ): """ The initialization method for align transformer multilayer. 
- + Parameters ---------- hidden_dim : int, optional @@ -612,4 +614,4 @@ class TransformerMultiLayer(nn.Module): outputs = embedding for i, mod in enumerate(self.transformer_encoder): outputs = mod(outputs, src_key_padding_mask=attention_mask) - return outputs \ No newline at end of file + return outputs diff --git a/learnware/market/heterogeneous/organizer/hetero_map/feature_extractor.py b/learnware/market/heterogeneous/organizer/hetero_map/feature_extractor.py index 2e7a003..da0ab68 100644 --- a/learnware/market/heterogeneous/organizer/hetero_map/feature_extractor.py +++ b/learnware/market/heterogeneous/organizer/hetero_map/feature_extractor.py @@ -1,8 +1,9 @@ -import os import math -from typing import Dict, Callable +import os +from typing import Callable, Dict, List, Union import numpy as np +import pandas as pd import torch import torch.nn.init as nn_init from torch import Tensor, nn @@ -12,8 +13,7 @@ from .....config import C as conf class WordEmbedding(nn.Module): - """Encodes tokens drawn from column names into word embeddings. 
- """ + """Encodes tokens drawn from column names into word embeddings.""" def __init__( self, diff --git a/learnware/market/heterogeneous/organizer/hetero_map/trainer.py b/learnware/market/heterogeneous/organizer/hetero_map/trainer.py index 3194b71..052b2ba 100644 --- a/learnware/market/heterogeneous/organizer/hetero_map/trainer.py +++ b/learnware/market/heterogeneous/organizer/hetero_map/trainer.py @@ -1,8 +1,7 @@ -import json_get_parameter_names import math import os import time -from typings import Any, Callable, List, Dict +from typing import Any, Callable, Dict, List import numpy as np import pandas as pd @@ -11,8 +10,8 @@ from torch import nn from torch.utils.data import DataLoader, Dataset from tqdm.autonotebook import trange -from .feature_extractor import FeatureTokenizer from .....logger import get_module_logger +from .feature_extractor import FeatureTokenizer logger = get_module_logger("hetero_mapping_trainer") @@ -139,7 +138,7 @@ class Trainer: def _create_optimizer(self): """Creates an optimizer for training the model.""" - + if self.optimizer is None: decay_parameters = self._get_parameter_names(self.model, [nn.LayerNorm]) decay_parameters = [name for name in decay_parameters if "bias" not in name] @@ -362,4 +361,4 @@ class TransTabCollatorForCL: x_corrupt = x.copy()[corrupt_cols] np.random.shuffle(x_corrupt.values) sub_x_list.append(pd.concat([x.copy().drop(corrupt_cols, axis=1), x_corrupt], axis=1)) - return sub_x_list \ No newline at end of file + return sub_x_list diff --git a/learnware/market/heterogeneous/searcher.py b/learnware/market/heterogeneous/searcher.py index f8ef680..58cc15f 100644 --- a/learnware/market/heterogeneous/searcher.py +++ b/learnware/market/heterogeneous/searcher.py @@ -1,4 +1,4 @@ -from typing import Tuple, List +from typing import List, Tuple from ...learnware import Learnware from ...logger import get_module_logger From b5f473c74795a27337c4c88623e9731795a1e560 Mon Sep 17 00:00:00 2001 From: liuht Date: Tue, 14 Nov 2023 
23:02:40 +0800 Subject: [PATCH 79/90] [DOC] add docs for HeteroMapTableSpecification --- .../specification/system/hetero_table.py | 74 ++++++++++++++++++- 1 file changed, 73 insertions(+), 1 deletion(-) diff --git a/learnware/specification/system/hetero_table.py b/learnware/specification/system/hetero_table.py index 4e89f2d..7e30710 100644 --- a/learnware/specification/system/hetero_table.py +++ b/learnware/specification/system/hetero_table.py @@ -14,9 +14,18 @@ from ...utils import choose_device, setup_seed class HeteroMapTableSpecification(SystemStatSpecification): - """Heterogeneous Embedding Specification""" + """Heterogeneous Map-Table Specification""" def __init__(self, gamma: float = 0.1, cuda_idx: int = -1): + """Initializing HeteroMapTableSpecification parameters. + + Parameters + ---------- + gamma : float + Bandwidth in gaussian kernel, by default 0.1. + cuda_idx : int + A flag indicating whether use CUDA during RKME computation. -1 indicates CUDA not used. + """ self.z = None self.beta = None self.embedding = None @@ -29,16 +38,51 @@ class HeteroMapTableSpecification(SystemStatSpecification): super(HeteroMapTableSpecification, self).__init__(type=self.__class__.__name__) def get_z(self) -> np.ndarray: + """Move z(RKME reduced set points) back to memory accessible to the CPU. + + Returns + ------- + np.ndarray + A copy of z in CPU memory. + """ return self.z.detach().cpu().numpy() def get_beta(self) -> np.ndarray: + """Move beta(RKME weights weights) back to memory accessible to the CPU. + + Returns + ------- + np.ndarray + A copy of beta in CPU memory. + """ return self.beta.detach().cpu().numpy() def generate_stat_spec_from_system(self, heter_embedding: np.ndarray, rkme_spec: RKMETableSpecification): + """Construct heterogeneous map-table specification from RKME specification and embedding genereated by heterogeneous market mapping. + + Parameters + ---------- + heter_embedding : np.ndarray + Embedding genereated by the heterogeneous market mapping. 
+ rkme_spec : RKMETableSpecification + The RKME specification. + """ self.beta = rkme_spec.beta.to(self.device) self.z = torch.from_numpy(heter_embedding).double().to(self.device) def inner_prod(self, Embed2: HeteroMapTableSpecification) -> float: + """Compute the inner product between two HeteroMapTableSpecifications + + Parameters + ---------- + Embed2 : HeteroMapTableSpecification + The other HeteroMapTableSpecification. + + Returns + ------- + float + The inner product between two HeteroMapTableSpecifications. + """ beta_1 = self.beta.reshape(1, -1).double().to(self.device) beta_2 = Embed2.beta.reshape(1, -1).double().to(self.device) Z1 = self.z.double().reshape(self.z.shape[0], -1).to(self.device) @@ -48,6 +92,15 @@ class HeteroMapTableSpecification(SystemStatSpecification): return float(v) def dist(self, Embed2: HeteroMapTableSpecification, omit_term1: bool = False) -> float: + """Compute the Maximum-Mean-Discrepancy(MMD) between two HeteroMapTableSpecifications + + Parameters + ---------- + Phi2 : HeteroMapTableSpecification + The other HeteroMapTableSpecification. + omit_term1 : bool, optional + True if the inner product of self with itself can be omitted, by default False. + """ term1 = 0 if omit_term1 else self.inner_prod(self) term2 = self.inner_prod(Embed2) term3 = Embed2.inner_prod(Embed2) @@ -55,6 +108,18 @@ class HeteroMapTableSpecification(SystemStatSpecification): return float(term1 - 2 * term2 + term3) def load(self, filepath: str) -> bool: + """Load a HeteroMapTableSpecification file in JSON format from the specified path. + + Parameters + ---------- + filepath : str + The specified loading path. + + Returns + ------- + bool + True if the HeteroMapTableSpecification is loaded successfully. 
+ """ load_path = filepath if os.path.exists(load_path): with codecs.open(load_path, "r", encoding="utf-8") as fin: @@ -72,6 +137,13 @@ class HeteroMapTableSpecification(SystemStatSpecification): return False def save(self, filepath: str) -> bool: + """Save the computed HeteroMapTableSpecification to a specified path in JSON format. + + Parameters + ---------- + filepath : str + The specified saving path. + """ save_path = filepath embedding_to_save = copy.deepcopy(self.__dict__) if torch.is_tensor(embedding_to_save["z"]): From 5cc6c1bbcb8b73dd454842fb9fb37599b4f347ea Mon Sep 17 00:00:00 2001 From: bxdd Date: Wed, 15 Nov 2023 00:20:25 +0800 Subject: [PATCH 80/90] [FIX] fix bugs for feature_embedding --- .../market/heterogeneous/organizer/__init__.py | 2 ++ .../heterogeneous/organizer/hetero_map/__init__.py | 4 ++-- .../organizer/hetero_map/feature_extractor.py | 14 +++++++++++--- learnware/market/heterogeneous/searcher.py | 7 +++---- learnware/reuse/feature_augment.py | 4 ++-- 5 files changed, 20 insertions(+), 11 deletions(-) diff --git a/learnware/market/heterogeneous/organizer/__init__.py b/learnware/market/heterogeneous/organizer/__init__.py index f397726..1e2fd23 100644 --- a/learnware/market/heterogeneous/organizer/__init__.py +++ b/learnware/market/heterogeneous/organizer/__init__.py @@ -1,4 +1,5 @@ import os +import traceback import pandas as pd from collections import defaultdict from typing import List, Tuple, Union @@ -125,6 +126,7 @@ class HeteroMapTableOrganizer(EasyOrganizer): hetero_spec.save(save_path) except Exception as err: + traceback.print_exc() logger.warning(f"Learnware {idx} generate HeteroMapTableSpecification failed! 
Due to {err}") def _get_hetero_learnware_ids(self, ids: Union[str, List[str]]) -> List[str]: diff --git a/learnware/market/heterogeneous/organizer/hetero_map/__init__.py b/learnware/market/heterogeneous/organizer/hetero_map/__init__.py index 97a92da..9afe1b5 100644 --- a/learnware/market/heterogeneous/organizer/hetero_map/__init__.py +++ b/learnware/market/heterogeneous/organizer/hetero_map/__init__.py @@ -39,7 +39,7 @@ class HeteroMap(nn.Module): temperature=10, base_temperature=10, activation="relu", - device="cuda:0", + device="cpu", **kwargs, ): """ @@ -174,7 +174,7 @@ class HeteroMap(nn.Module): def hetero_mapping(self, rkme_spec: RKMETableSpecification, features: dict) -> HeteroMapTableSpecification: hetero_spec = HeteroMapTableSpecification() data = rkme_spec.get_z() - cols = [features.get(str(i), "") for i in range(data.shape[1])] + cols = [features.get(str(i), "Unknown Feature") for i in range(data.shape[1])] hetero_input_df = pd.DataFrame(data=data, columns=cols) hetero_embedding = self._extract_batch_features(hetero_input_df) hetero_spec.generate_stat_spec_from_system(hetero_embedding, rkme_spec) diff --git a/learnware/market/heterogeneous/organizer/hetero_map/feature_extractor.py b/learnware/market/heterogeneous/organizer/hetero_map/feature_extractor.py index 40a019c..9b6928f 100644 --- a/learnware/market/heterogeneous/organizer/hetero_map/feature_extractor.py +++ b/learnware/market/heterogeneous/organizer/hetero_map/feature_extractor.py @@ -53,6 +53,7 @@ class NumEmbedding(nn.Module): x_ts : Any numerical features, (bs, emb_dim) """ + print(np.array(col_emb).shape, np.array(x_ts).shape) col_emb = col_emb.unsqueeze(0).expand((x_ts.shape[0], -1, -1)) feat_emb = col_emb * x_ts.unsqueeze(-1).float() + self.num_bias return feat_emb @@ -99,13 +100,18 @@ class FeatureTokenizer: } """ encoded_inputs = {"x_num": None, "num_col_input_ids": None} - num_cols = x.columns.tolist() if not shuffle else np.random.shuffle(x.columns.tolist()) - x_num = 
x[num_cols].fillna(0) + num_cols = x.columns.tolist() if not shuffle else np.random.shuffle(x.columns.tolist()) + index_cols = ( + [i for i in range(len(x.columns))] if not shuffle else np.random.shuffle([i for i in range(len(x.columns))]) + ) + num_cols = [x.columns[i] for i in index_cols] + x_num = x.iloc(axis=1)[index_cols].fillna(0) if keep_input_grad: x_num_ts = torch.tensor(x_num.values, dtype=float, requires_grad=True) # keep the grad else: x_num_ts = torch.tensor(x_num.values, dtype=float) + num_col_ts = self.tokenizer( num_cols, padding=True, @@ -195,9 +201,11 @@ class FeatureProcessor(nn.Module): **kwargs, ) -> Tensor: x_num = x_num.to(self.device) - + print("?1", np.array(x_num).shape, np.array(num_col_input_ids).shape) num_col_emb = self.word_embedding(num_col_input_ids.to(self.device)) + print("?2", np.array(x_num).shape, np.array(num_col_emb).shape) num_col_emb = self._avg_embedding_by_mask(num_col_emb, num_att_mask) + print("?3", np.array(x_num).shape, np.array(num_col_emb).shape) num_feat_embedding = self.num_embedding(num_col_emb, x_num) num_feat_embedding = self.align_layer(num_feat_embedding).float() diff --git a/learnware/market/heterogeneous/searcher.py b/learnware/market/heterogeneous/searcher.py index 3605609..5161126 100644 --- a/learnware/market/heterogeneous/searcher.py +++ b/learnware/market/heterogeneous/searcher.py @@ -1,5 +1,5 @@ from typing import Tuple, List - +import traceback from ...learnware import Learnware from ...logger import get_module_logger from ..base import BaseUserInfo @@ -34,9 +34,8 @@ class HeteroSearcher(EasySearcher): return True except Exception as e: - logger.warning( - f"Invalid heterogeneous search information provided. Use homogeneous search instead. Error: {e}" - ) + traceback.print_exc() + logger.warning(f"Invalid heterogeneous search information provided. 
Use homogeneous search instead.") return False def __call__( diff --git a/learnware/reuse/feature_augment.py b/learnware/reuse/feature_augment.py index ea3d27d..01daae3 100644 --- a/learnware/reuse/feature_augment.py +++ b/learnware/reuse/feature_augment.py @@ -12,8 +12,8 @@ class FeatureAugmentReuser(BaseReuser): FeatureAugmentReuser is a class for augmenting features using predictions of a given learnware model and applying regression or classification on the augmented dataset. This class supports two modes: - - "regression": Uses RidgeCV for regression tasks. - - "classification": Uses LogisticRegressionCV for classification tasks. + - "regression": Uses RidgeCV for regression tasks. + - "classification": Uses LogisticRegressionCV for classification tasks. """ def __init__(self, learnware_list: List[Learnware] = None, mode: str = None): From 7b5c9db7ce5785dd42ce37e5d93b993744093226 Mon Sep 17 00:00:00 2001 From: bxdd Date: Wed, 15 Nov 2023 00:27:33 +0800 Subject: [PATCH 81/90] [FIX] fix bugs --- .../heterogeneous/organizer/hetero_map/feature_extractor.py | 1 - 1 file changed, 1 deletion(-) diff --git a/learnware/market/heterogeneous/organizer/hetero_map/feature_extractor.py b/learnware/market/heterogeneous/organizer/hetero_map/feature_extractor.py index b855fbd..5f02940 100644 --- a/learnware/market/heterogeneous/organizer/hetero_map/feature_extractor.py +++ b/learnware/market/heterogeneous/organizer/hetero_map/feature_extractor.py @@ -149,7 +149,6 @@ class FeatureTokenizer: """ encoded_inputs = {"x_num": None, "num_col_input_ids": None} - num_cols = x.columns.tolist() if not shuffle else np.random.shuffle(x.columns.tolist()) index_cols = ( [i for i in range(len(x.columns))] if not shuffle else np.random.shuffle([i for i in range(len(x.columns))]) ) From e4b3696b512a9d616e5e6f63e5aac9cef3ee0301 Mon Sep 17 00:00:00 2001 From: bxdd Date: Wed, 15 Nov 2023 00:37:03 +0800 Subject: [PATCH 82/90] [MNT] del print --- 
.../market/heterogeneous/organizer/hetero_map/__init__.py | 2 +- .../heterogeneous/organizer/hetero_map/feature_extractor.py | 4 ---- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/learnware/market/heterogeneous/organizer/hetero_map/__init__.py b/learnware/market/heterogeneous/organizer/hetero_map/__init__.py index 68f7474..0453f97 100644 --- a/learnware/market/heterogeneous/organizer/hetero_map/__init__.py +++ b/learnware/market/heterogeneous/organizer/hetero_map/__init__.py @@ -40,7 +40,7 @@ class HeteroMap(nn.Module): temperature: int = 10, base_temperature: int = 10, activation: Union[str, Callable] = "relu", - device: Union[str, torch.device] = "cuda:0", + device: Union[str, torch.device] = "cpu", **kwargs, ): """ diff --git a/learnware/market/heterogeneous/organizer/hetero_map/feature_extractor.py b/learnware/market/heterogeneous/organizer/hetero_map/feature_extractor.py index 5f02940..169fcf7 100644 --- a/learnware/market/heterogeneous/organizer/hetero_map/feature_extractor.py +++ b/learnware/market/heterogeneous/organizer/hetero_map/feature_extractor.py @@ -98,7 +98,6 @@ class NumEmbedding(nn.Module): torch.Tensor The combined feature embeddings. """ - print(np.array(col_emb).shape, np.array(x_ts).shape) col_emb = col_emb.unsqueeze(0).expand((x_ts.shape[0], -1, -1)) feat_emb = col_emb * x_ts.unsqueeze(-1).float() + self.num_bias return feat_emb @@ -297,11 +296,8 @@ class FeatureProcessor(nn.Module): The processed feature embeddings. 
""" x_num = x_num.to(self.device) - print("?1", np.array(x_num).shape, np.array(num_col_input_ids).shape) num_col_emb = self.word_embedding(num_col_input_ids.to(self.device)) - print("?2", np.array(x_num).shape, np.array(num_col_emb).shape) num_col_emb = self._avg_embedding_by_mask(num_col_emb, num_att_mask) - print("?3", np.array(x_num).shape, np.array(num_col_emb).shape) num_feat_embedding = self.num_embedding(num_col_emb, x_num) num_feat_embedding = self.align_layer(num_feat_embedding).float() From 257d7e0b3e4f653d6fd4a03f93a342411b447bb8 Mon Sep 17 00:00:00 2001 From: Gene Date: Wed, 15 Nov 2023 01:24:37 +0800 Subject: [PATCH 83/90] [MNT] modify reload process --- .../heterogeneous/organizer/__init__.py | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/learnware/market/heterogeneous/organizer/__init__.py b/learnware/market/heterogeneous/organizer/__init__.py index 1d5c2bf..9352e51 100644 --- a/learnware/market/heterogeneous/organizer/__init__.py +++ b/learnware/market/heterogeneous/organizer/__init__.py @@ -37,20 +37,20 @@ class HeteroMapTableOrganizer(EasyOrganizer): logger.info(f"Reload market mapping from checkpoint {self.market_mapping_path}") self.market_mapping = HeteroMap.load(checkpoint=self.market_mapping_path) if not rebuild: - if os.path.exists(self.hetero_specs_path): - for hetero_json_path in os.listdir(self.hetero_specs_path): - if not hetero_json_path.endswith(".json"): - continue - try: - idx = hetero_json_path.split(".")[0] + usable_ids = self.get_learnware_ids(check_status=BaseChecker.USABLE_LEARWARE) + hetero_ids = self._get_hetero_learnware_ids(usable_ids) + for hetero_id in hetero_ids: + try: + hetero_spec_path = os.path.join(self.hetero_specs_path, f"{hetero_id}.json") + if os.path.exists(hetero_spec_path): hetero_spec = HeteroMapTableSpecification() - hetero_spec.load(os.path.join(self.hetero_specs_path, hetero_json_path)) - self.learnware_list[idx].update_stat_spec(hetero_spec.type, hetero_spec) - 
except Exception as err: - logger.warning(f"Learnware in {hetero_json_path} hetero spec loaded failed! due to {err}.") - else: - logger.info("No HeteroMapTableSpecification to reload. Use loaded market mapping to regenerate.") - self._update_learnware_by_ids(self.get_learnware_ids(check_status=BaseChecker.USABLE_LEARWARE)) + hetero_spec.load(hetero_spec_path) + self.learnware_list[hetero_id].update_stat_spec(hetero_spec.type, hetero_spec) + else: + self._update_learnware_by_ids(hetero_id) + logger.info(f"Reload HeteroMapTableSpecification for {hetero_id} succeed!") + except Exception as err: + logger.error(f"Learnware {hetero_id} hetero spec loaded failed! due to {err}.") else: logger.warning(f"No market mapping to reload!") self.market_mapping = HeteroMap() From 57636f040f28e07c60f040fd8169a460da9836cf Mon Sep 17 00:00:00 2001 From: Gene Date: Wed, 15 Nov 2023 15:09:53 +0800 Subject: [PATCH 84/90] [FIX] remove extra code in search_learnware --- learnware/client/learnware_client.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/learnware/client/learnware_client.py b/learnware/client/learnware_client.py index abbd71f..0249c53 100644 --- a/learnware/client/learnware_client.py +++ b/learnware/client/learnware_client.py @@ -199,9 +199,6 @@ class LearnwareClient: if semantic_specification is None: semantic_specification = {} - semantic_specification.pop("Input", None) - semantic_specification.pop("Output", None) - if stat_spec is None: files = None else: From cc540f2a7c3397701380bf401db5d39510943a84 Mon Sep 17 00:00:00 2001 From: liuht Date: Wed, 15 Nov 2023 16:58:24 +0800 Subject: [PATCH 85/90] [FIX] fix hetero_map 'to(device)' bug --- .../organizer/hetero_map/__init__.py | 25 +++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/learnware/market/heterogeneous/organizer/hetero_map/__init__.py b/learnware/market/heterogeneous/organizer/hetero_map/__init__.py index 0453f97..dc3b90e 100644 --- 
a/learnware/market/heterogeneous/organizer/hetero_map/__init__.py +++ b/learnware/market/heterogeneous/organizer/hetero_map/__init__.py @@ -5,6 +5,7 @@ import pandas as pd import torch import torch.nn.functional as F from torch import Tensor, nn +from loguru import logger from .....specification import HeteroMapTableSpecification, RKMETableSpecification from .feature_extractor import CLSToken, FeatureProcessor, FeatureTokenizer @@ -73,7 +74,7 @@ class HeteroMap(nn.Module): activation : Union[str, Callable], optional Activation function for transformer layer, by default "relu" device : Union[str, torch.device], optional - Device to run the model on, by default "cuda:0" + Device to run the model on, by default "cpu" kwargs: Additional arguments to be passed to the feature tokenizer """ @@ -124,8 +125,26 @@ class HeteroMap(nn.Module): self.base_temperature = base_temperature self.num_partition = num_partition self.overlap_ratio = overlap_ratio - self.device = device self.to(device) + + def to(self, device: Union[str, torch.device]): + """Moves the model and all its submodules to the specified device + + Parameters + ---------- + device : Union[str, torch.device] + The target device to which the model and its components should be moved. + + Returns + ------- + HeteroMap + The instance of HeteroMap after moving to the specified device. 
+ """ + super(HeteroMap, self).to(device) + if hasattr(self, 'feature_processor'): + self.feature_processor.device = device + self.device = device + return self @staticmethod def load(checkpoint: str = None): @@ -255,7 +274,9 @@ class HeteroMap(nn.Module): if isinstance(x, pd.DataFrame): inputs = self.feature_tokenizer(x) elif isinstance(x, torch.Tensor): + logger.info(f"extract features, input device:{x.device}") inputs = self.feature_tokenizer.forward(cols, x) + logger.info(f"extract features, output device:{inputs['x_num'].device}") else: raise ValueError(f"feature_tokenizer takes inputs with dict or pd.DataFrame, find {type(x)}.") From cc5c336183b3bea75f7c4773c5bd0210d1f79b8a Mon Sep 17 00:00:00 2001 From: liuht Date: Wed, 15 Nov 2023 17:03:38 +0800 Subject: [PATCH 86/90] [FIX] delete print --- learnware/market/heterogeneous/organizer/hetero_map/__init__.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/learnware/market/heterogeneous/organizer/hetero_map/__init__.py b/learnware/market/heterogeneous/organizer/hetero_map/__init__.py index dc3b90e..37b5d3e 100644 --- a/learnware/market/heterogeneous/organizer/hetero_map/__init__.py +++ b/learnware/market/heterogeneous/organizer/hetero_map/__init__.py @@ -274,9 +274,7 @@ class HeteroMap(nn.Module): if isinstance(x, pd.DataFrame): inputs = self.feature_tokenizer(x) elif isinstance(x, torch.Tensor): - logger.info(f"extract features, input device:{x.device}") inputs = self.feature_tokenizer.forward(cols, x) - logger.info(f"extract features, output device:{inputs['x_num'].device}") else: raise ValueError(f"feature_tokenizer takes inputs with dict or pd.DataFrame, find {type(x)}.") From b49da6aed868b69a5675ae28f8201b446df9fa07 Mon Sep 17 00:00:00 2001 From: bxdd Date: Thu, 16 Nov 2023 16:23:15 +0800 Subject: [PATCH 87/90] [MNT] update logic for hetero_market --- learnware/market/easy/database_ops.py | 6 +- .../heterogeneous/organizer/__init__.py | 63 +++++++++---------- learnware/market/heterogeneous/searcher.py | 
48 ++------------ learnware/market/heterogeneous/utils.py | 44 +++++++++++++ 4 files changed, 79 insertions(+), 82 deletions(-) create mode 100644 learnware/market/heterogeneous/utils.py diff --git a/learnware/market/easy/database_ops.py b/learnware/market/easy/database_ops.py index 127af1a..ad58b5c 100644 --- a/learnware/market/easy/database_ops.py +++ b/learnware/market/easy/database_ops.py @@ -174,12 +174,10 @@ class DatabaseOperations(object): semantic_spec_dict = json.loads(semantic_spec) try: new_learnware = get_learnware_from_dirpath( - id=id, semantic_spec=semantic_spec_dict, learnware_dirpath=folder_path + id=id, semantic_spec=semantic_spec_dict, learnware_dirpath=folder_path, ignore_error=False ) - assert new_learnware is not None, "learnware must not be None" - logger.info(f"Load learnware: {id}") + logger.info(f"Load learnware {id} succeed!") except Exception as err: - logger.error(f"Load learnware {id} failed due to {err}!") continue learnware_list[id] = new_learnware diff --git a/learnware/market/heterogeneous/organizer/__init__.py b/learnware/market/heterogeneous/organizer/__init__.py index 9352e51..566faa6 100644 --- a/learnware/market/heterogeneous/organizer/__init__.py +++ b/learnware/market/heterogeneous/organizer/__init__.py @@ -6,12 +6,14 @@ from typing import List, Tuple, Union import pandas as pd +from .hetero_map import HeteroMap, Trainer +from ..utils import is_hetero +from ...base import BaseChecker, BaseUserInfo +from ...easy import EasyOrganizer from ....learnware import Learnware from ....logger import get_module_logger from ....specification import HeteroMapTableSpecification, RKMETableSpecification -from ...base import BaseChecker, BaseUserInfo -from ...easy import EasyOrganizer -from .hetero_map import HeteroMap, Trainer + logger = get_module_logger("hetero_map_table_organizer") @@ -40,17 +42,7 @@ class HeteroMapTableOrganizer(EasyOrganizer): usable_ids = self.get_learnware_ids(check_status=BaseChecker.USABLE_LEARWARE) hetero_ids = 
self._get_hetero_learnware_ids(usable_ids) for hetero_id in hetero_ids: - try: - hetero_spec_path = os.path.join(self.hetero_specs_path, f"{hetero_id}.json") - if os.path.exists(hetero_spec_path): - hetero_spec = HeteroMapTableSpecification() - hetero_spec.load(hetero_spec_path) - self.learnware_list[hetero_id].update_stat_spec(hetero_spec.type, hetero_spec) - else: - self._update_learnware_by_ids(hetero_id) - logger.info(f"Reload HeteroMapTableSpecification for {hetero_id} succeed!") - except Exception as err: - logger.error(f"Learnware {hetero_id} hetero spec loaded failed! due to {err}.") + self._reload_learnware_hetero_spec(hetero_id) else: logger.warning(f"No market mapping to reload!") self.market_mapping = HeteroMap() @@ -104,7 +96,7 @@ class HeteroMapTableOrganizer(EasyOrganizer): ) if learnwere_status == BaseChecker.USABLE_LEARWARE and len(self._get_hetero_learnware_ids(learnware_id)): - self._update_learnware_by_ids(learnware_id) + self._update_learware_hetero_sepc(learnware_id) if self.auto_update: self.count_down -= 1 @@ -121,7 +113,7 @@ class HeteroMapTableOrganizer(EasyOrganizer): f"Market mapping train completed. 
Now update HeteroMapTableSpecification for {training_learnware_ids}" ) self.market_mapping = updated_market_mapping - self._update_learnware_by_ids(training_learnware_ids) + self._update_learware_hetero_sepc(training_learnware_ids) self.count_down = self.auto_update_limit @@ -175,9 +167,22 @@ class HeteroMapTableOrganizer(EasyOrganizer): """ final_status = super(HeteroMapTableOrganizer, self).update_learnware(id, zip_path, semantic_spec, check_status) if final_status == BaseChecker.USABLE_LEARWARE and len(self._get_hetero_learnware_ids(id)): - self._update_learnware_by_ids(id) + self._update_learware_hetero_sepc(id) return final_status + def _reload_learnware_hetero_spec(self, learnware_id): + try: + hetero_spec_path = os.path.join(self.hetero_specs_path, f"{learnware_id}.json") + if os.path.exists(hetero_spec_path): + hetero_spec = HeteroMapTableSpecification() + hetero_spec.load(hetero_spec_path) + self.learnware_list[learnware_id].update_stat_spec(hetero_spec.type, hetero_spec) + else: + self._update_learware_hetero_sepc(learnware_id) + logger.info(f"Reload HeteroMapTableSpecification for hetero spec {learnware_id} succeed!") + except Exception as err: + logger.error(f"Reload HeteroMapTableSpecification for hetero spec {learnware_id} failed! due to {err}.") + def reload_learnware(self, learnware_id: str): """Reload learnware into heterogeneous learnware market. If a corresponding HeteroMapTableSpecification exists, it is also reloaded. 
@@ -188,16 +193,10 @@ class HeteroMapTableOrganizer(EasyOrganizer): Learnware to be reloaded """ super(HeteroMapTableOrganizer, self).reload_learnware(learnware_id) - try: - hetero_spec_path = os.path.join(self.hetero_specs_path, f"{learnware_id}.json") - if os.path.exists(hetero_spec_path): - hetero_spec = HeteroMapTableSpecification() - hetero_spec.load(hetero_spec_path) - self.learnware_list[learnware_id].update_stat_spec(hetero_spec.type, hetero_spec) - except: - logger.warning(f"Learnware {learnware_id} hetero spec loaded failed!") + if len(self._get_hetero_learnware_ids(learnware_id)): + self._reload_learnware_hetero_spec(learnware_id) - def _update_learnware_by_ids(self, ids: Union[str, List[str]]): + def _update_learware_hetero_sepc(self, ids: Union[str, List[str]]): """Update learnware by ids, attempting to generate HeteroMapTableSpecification for them. Parameters @@ -207,7 +206,6 @@ class HeteroMapTableOrganizer(EasyOrganizer): str: id of target learware List[str]: A list of ids of target learnwares """ - ids = self._get_hetero_learnware_ids(ids) for idx in ids: try: spec = self.learnware_list[idx].get_specification() @@ -220,7 +218,6 @@ class HeteroMapTableOrganizer(EasyOrganizer): hetero_spec.save(save_path) except Exception as err: - traceback.print_exc() logger.warning(f"Learnware {idx} generate HeteroMapTableSpecification failed! 
Due to {err}") def _get_hetero_learnware_ids(self, ids: Union[str, List[str]]) -> List[str]: @@ -243,13 +240,9 @@ class HeteroMapTableOrganizer(EasyOrganizer): ret = [] for idx in ids: - try: - spec = self.learnware_list[idx].get_specification() - semantic_spec, rkme = spec.get_semantic_spec(), spec.get_stat_spec().get("RKMETableSpecification", None) - if isinstance(rkme, RKMETableSpecification) and isinstance(semantic_spec["Input"], dict): - ret.append(idx) - except Exception: - pass + spec = self.learnware_list[idx].get_specification() + if is_hetero(stat_specs=spec.get_stat_spec(), semantic_spec=spec.get_semantic_spec()): + ret.append(idx) return ret def generate_hetero_map_spec(self, user_info: BaseUserInfo) -> HeteroMapTableSpecification: diff --git a/learnware/market/heterogeneous/searcher.py b/learnware/market/heterogeneous/searcher.py index efbbd90..7a79004 100644 --- a/learnware/market/heterogeneous/searcher.py +++ b/learnware/market/heterogeneous/searcher.py @@ -1,56 +1,18 @@ import traceback from typing import Tuple, List -from ...learnware import Learnware -from ...logger import get_module_logger +from .utils import is_hetero from ..base import BaseUserInfo from ..easy import EasySearcher from ..utils import parse_specification_type +from ...learnware import Learnware +from ...logger import get_module_logger + logger = get_module_logger("hetero_searcher") class HeteroSearcher(EasySearcher): - @staticmethod - def check_user_info(user_info: BaseUserInfo) -> bool: - """Check if user_info satifies all the criteria required for enabling heterogeneous learnware search - - Parameters - ---------- - user_info : BaseUserInfo - user_info contains semantic_spec and stat_info - - Returns - ------- - bool - A flag indicating whether heterogeneous search is enabled for user_info - """ - try: - user_stat_spec = user_info.get_stat_info("RKMETableSpecification") - user_input_shape = user_stat_spec.get_z().shape[1] - - user_task_type = 
user_info.get_semantic_spec()["Task"]["Values"] - if user_task_type not in [["Classification"], ["Regression"]]: - logger.warning( - "User doesn't provide correct task type, it must be either Classification or Regression." - ) - return False - - user_input_description = user_info.get_semantic_spec()["Input"] - user_description_dim = int(user_input_description["Dimension"]) - user_description_feature_num = len(user_input_description["Description"]) - - if user_input_shape != user_description_dim or user_input_shape != user_description_feature_num: - logger.warning("User data feature dimensions mismatch with semantic specification.") - return False - - return True - - except Exception as e: - traceback.print_exc() - logger.warning(f"Invalid heterogeneous search information provided. Use homogeneous search instead.") - return False - def __call__( self, user_info: BaseUserInfo, check_status: int = None, max_search_num: int = 5, search_method: str = "greedy" ) -> Tuple[List[float], List[Learnware], float, List[Learnware]]: @@ -82,7 +44,7 @@ class HeteroSearcher(EasySearcher): return [], [], 0.0, [] if parse_specification_type(stat_specs=user_info.stat_info) is not None: - if self.check_user_info(user_info): + if is_hetero(stat_specs=user_info.stat_info, semantic_spec=user_info.semantic_spec): user_hetero_spec = self.learnware_organizer.generate_hetero_map_spec(user_info) user_info.update_stat_info(user_hetero_spec.type, user_hetero_spec) return self.stat_searcher(learnware_list, user_info, max_search_num, search_method) diff --git a/learnware/market/heterogeneous/utils.py b/learnware/market/heterogeneous/utils.py new file mode 100644 index 0000000..7684ee4 --- /dev/null +++ b/learnware/market/heterogeneous/utils.py @@ -0,0 +1,44 @@ +from ...logger import get_module_logger + +logger = get_module_logger("hetero_utils") + + +def is_hetero(stat_specs: dict, semantic_spec: dict) -> bool: + """Check if user_info satifies all the criteria required for enabling heterogeneous 
learnware search + + Parameters + ---------- + user_info : BaseUserInfo + user_info contains semantic_spec and stat_info + + Returns + ------- + bool + A flag indicating whether heterogeneous search is enabled for user_info + """ + try: + table_stat_spec = stat_specs["RKMETableSpecification"] + table_input_shape = table_stat_spec.get_z().shape[1] + + semantic_task_type = semantic_spec["Task"]["Values"] + if semantic_task_type not in [["Classification"], ["Regression"]]: + logger.warning("User doesn't provide correct task type, it must be either Classification or Regression.") + return False + + semantic_input_description = semantic_spec["Input"] + semantic_description_dim = int(semantic_input_description["Dimension"]) + semantic_decription_feature_num = len(semantic_input_description["Description"]) + + if semantic_decription_feature_num <= 0: + logger.warning("At least one of Input.Description in semantic spec should be provides.") + return False + + if table_input_shape != semantic_description_dim: + logger.warning("User data feature dimensions mismatch with semantic specification.") + return False + + return True + + except Exception as e: + logger.warning(f"Invalid heterogeneous search information provided due to {e}. 
Use homogeneous search instead.") + return False From a3475aa85946ea0b19e1e1d81fc99c55450ae7c9 Mon Sep 17 00:00:00 2001 From: bxdd Date: Thu, 16 Nov 2023 16:34:43 +0800 Subject: [PATCH 88/90] [MNT] del duplicate code for hetero reuser --- learnware/reuse/feature_augment.py | 35 +++---------------------- learnware/reuse/hetero/feature_align.py | 33 ++--------------------- learnware/reuse/utils.py | 32 ++++++++++++++++++++++ 3 files changed, 37 insertions(+), 63 deletions(-) diff --git a/learnware/reuse/feature_augment.py b/learnware/reuse/feature_augment.py index 01daae3..83484d8 100644 --- a/learnware/reuse/feature_augment.py +++ b/learnware/reuse/feature_augment.py @@ -4,6 +4,7 @@ from typing import List from sklearn.linear_model import RidgeCV, LogisticRegressionCV from .base import BaseReuser +from .utils import fill_data_with_mean from ..learnware import Learnware @@ -48,7 +49,7 @@ class FeatureAugmentReuser(BaseReuser): """ assert self.augment_reuser is not None, "FeatureAugmentReuser is not trained by labeled data yet." - user_data = self._fill_data(user_data) + user_data = fill_data_with_mean(user_data) user_data_aug = self._get_augment_data(user_data) y_pred_aug = self.augment_reuser.predict(user_data_aug) @@ -65,7 +66,7 @@ class FeatureAugmentReuser(BaseReuser): y_train : np.ndarray Training data labels. """ - x_train = self._fill_data(x_train) + x_train = fill_data_with_mean(x_train) x_train_aug = self._get_augment_data(x_train) if self.mode == "regression": @@ -77,36 +78,6 @@ class FeatureAugmentReuser(BaseReuser): self.augment_reuser = LogisticRegressionCV(cv=5, max_iter=1000, random_state=0, multi_class="auto") self.augment_reuser.fit(x_train_aug, y_train) - def _fill_data(self, X: np.ndarray) -> np.ndarray: - """ - Fill missing data (NaN, Inf) in the input array with the mean of the column. - - Parameters - ---------- - X : np.ndarray - Input data array that may contain missing values. 
- - Returns - ------- - np.ndarray - Data array with missing values filled. - - Raises - ------ - ValueError - If a column in X contains only exceptional values (NaN, Inf). - """ - X[np.isinf(X) | np.isneginf(X) | np.isposinf(X) | np.isneginf(X)] = np.nan - if np.any(np.isnan(X)): - for col in range(X.shape[1]): - is_nan = np.isnan(X[:, col]) - if np.any(is_nan): - if np.all(is_nan): - raise ValueError(f"All values in column {col} are exceptional, e.g., NaN and Inf.") - col_mean = np.nanmean(X[:, col]) - X[:, col] = np.where(is_nan, col_mean, X[:, col]) - return X - def _get_augment_data(self, X: np.ndarray) -> np.ndarray: """Get the augmented data with model output. diff --git a/learnware/reuse/hetero/feature_align.py b/learnware/reuse/hetero/feature_align.py index b43ed57..d7e5906 100644 --- a/learnware/reuse/hetero/feature_align.py +++ b/learnware/reuse/hetero/feature_align.py @@ -7,6 +7,7 @@ from tqdm import trange import torch.nn.functional as F from ..align import AlignLearnware +from ..utils import fill_data_with_mean from ...utils import choose_device from ...logger import get_module_logger from ...learnware import Learnware @@ -82,43 +83,13 @@ class FeatureAlignLearnware(AlignLearnware): Predicted output from the learnware model after alignment. """ assert self.align_model is not None, "FeatureAlignLearnware must be aligned before making predictions." - user_data = self._fill_data(user_data) + user_data = fill_data_with_mean(user_data) transformed_user_data = ( self.align_model(torch.tensor(user_data, device=self.device).float()).detach().cpu().numpy() ) y_pred = super(FeatureAlignLearnware, self).predict(transformed_user_data) return y_pred - def _fill_data(self, X: np.ndarray): - """ - Fill missing data (NaN, Inf) in the input array with the mean of the column. - - Parameters - ---------- - X : np.ndarray - Input data array that may contain missing values. - - Returns - ------- - np.ndarray - Data array with missing values filled. 
- - Raises - ------ - ValueError - If a column in X contains only exceptional values (NaN, Inf). - """ - X[np.isinf(X) | np.isneginf(X) | np.isposinf(X) | np.isneginf(X)] = np.nan - if np.any(np.isnan(X)): - for col in range(X.shape[1]): - is_nan = np.isnan(X[:, col]) - if np.any(is_nan): - if np.all(is_nan): - raise ValueError(f"All values in column {col} are exceptional, e.g., NaN and Inf.") - col_mean = np.nanmean(X[:, col]) - X[:, col] = np.where(is_nan, col_mean, X[:, col]) - return X - class FeatureAlignModel(nn.Module): """ diff --git a/learnware/reuse/utils.py b/learnware/reuse/utils.py index d0ab3f8..c227682 100644 --- a/learnware/reuse/utils.py +++ b/learnware/reuse/utils.py @@ -1,3 +1,4 @@ +import numpy as np from ..logger import get_module_logger logger = get_module_logger("reuse_utils") @@ -23,3 +24,34 @@ def is_lightgbm_avaliable(verbose=False): logger.warning("ModuleNotFoundError: lightgbm is not installed, please install lightgbm!") return False return True + + +def fill_data_with_mean(X: np.ndarray) -> np.ndarray: + """ + Fill missing data (NaN, Inf) in the input array with the mean of the column. + + Parameters + ---------- + X : np.ndarray + Input data array that may contain missing values. + + Returns + ------- + np.ndarray + Data array with missing values filled. + + Raises + ------ + ValueError + If a column in X contains only exceptional values (NaN, Inf). 
+ """ + X[np.isinf(X) | np.isneginf(X) | np.isposinf(X) | np.isneginf(X)] = np.nan + if np.any(np.isnan(X)): + for col in range(X.shape[1]): + is_nan = np.isnan(X[:, col]) + if np.any(is_nan): + if np.all(is_nan): + raise ValueError(f"All values in column {col} are exceptional, e.g., NaN and Inf.") + col_mean = np.nanmean(X[:, col]) + X[:, col] = np.where(is_nan, col_mean, X[:, col]) + return X From c44b76afd569ab2f0ea999cb5551299ca8fe75dc Mon Sep 17 00:00:00 2001 From: bxdd Date: Thu, 16 Nov 2023 19:57:48 +0800 Subject: [PATCH 89/90] [FIX | MNT] fix bugs for hetero organizer, and modify tests --- learnware/market/heterogeneous/organizer/__init__.py | 6 +++++- .../example_learnwares/example_learnware_1/learnware.yaml | 8 -------- .../example_learnware_1/requirements.txt | 1 - .../{example_learnware_0 => }/learnware.yaml | 0 .../{example_learnware_0/__init__.py => model0.py} | 6 ------ .../{example_learnware_1/__init__.py => model1.py} | 6 ------ .../{example_learnware_0 => }/requirements.txt | 0 tests/test_hetero_market/test_hetero.py | 8 ++++---- 8 files changed, 9 insertions(+), 26 deletions(-) delete mode 100644 tests/test_hetero_market/example_learnwares/example_learnware_1/learnware.yaml delete mode 100644 tests/test_hetero_market/example_learnwares/example_learnware_1/requirements.txt rename tests/test_hetero_market/example_learnwares/{example_learnware_0 => }/learnware.yaml (100%) rename tests/test_hetero_market/example_learnwares/{example_learnware_0/__init__.py => model0.py} (78%) rename tests/test_hetero_market/example_learnwares/{example_learnware_1/__init__.py => model1.py} (78%) rename tests/test_hetero_market/example_learnwares/{example_learnware_0 => }/requirements.txt (100%) diff --git a/learnware/market/heterogeneous/organizer/__init__.py b/learnware/market/heterogeneous/organizer/__init__.py index 566faa6..113b8c3 100644 --- a/learnware/market/heterogeneous/organizer/__init__.py +++ b/learnware/market/heterogeneous/organizer/__init__.py @@ -206,6 
+206,9 @@ class HeteroMapTableOrganizer(EasyOrganizer): str: id of target learware List[str]: A list of ids of target learnwares """ + if isinstance(ids, str): + ids = [ids] + for idx in ids: try: spec = self.learnware_list[idx].get_specification() @@ -218,7 +221,8 @@ class HeteroMapTableOrganizer(EasyOrganizer): hetero_spec.save(save_path) except Exception as err: - logger.warning(f"Learnware {idx} generate HeteroMapTableSpecification failed! Due to {err}") + traceback.print_exc() + logger.warning(f"Learnware {idx} generate HeteroMapTableSpecification failed!") def _get_hetero_learnware_ids(self, ids: Union[str, List[str]]) -> List[str]: """Get learnware ids that supports heterogeneous market training and search. diff --git a/tests/test_hetero_market/example_learnwares/example_learnware_1/learnware.yaml b/tests/test_hetero_market/example_learnwares/example_learnware_1/learnware.yaml deleted file mode 100644 index 4a37a37..0000000 --- a/tests/test_hetero_market/example_learnwares/example_learnware_1/learnware.yaml +++ /dev/null @@ -1,8 +0,0 @@ -model: - class_name: MyModel - kwargs: {} -stat_specifications: - - module_path: learnware.specification - class_name: RKMETableSpecification - file_name: stat.json - kwargs: {} \ No newline at end of file diff --git a/tests/test_hetero_market/example_learnwares/example_learnware_1/requirements.txt b/tests/test_hetero_market/example_learnwares/example_learnware_1/requirements.txt deleted file mode 100644 index 1da1c5f..0000000 --- a/tests/test_hetero_market/example_learnwares/example_learnware_1/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -learnware == 0.1.0.999 \ No newline at end of file diff --git a/tests/test_hetero_market/example_learnwares/example_learnware_0/learnware.yaml b/tests/test_hetero_market/example_learnwares/learnware.yaml similarity index 100% rename from tests/test_hetero_market/example_learnwares/example_learnware_0/learnware.yaml rename to tests/test_hetero_market/example_learnwares/learnware.yaml diff 
--git a/tests/test_hetero_market/example_learnwares/example_learnware_0/__init__.py b/tests/test_hetero_market/example_learnwares/model0.py similarity index 78% rename from tests/test_hetero_market/example_learnwares/example_learnware_0/__init__.py rename to tests/test_hetero_market/example_learnwares/model0.py index ea21917..45f64b7 100644 --- a/tests/test_hetero_market/example_learnwares/example_learnware_0/__init__.py +++ b/tests/test_hetero_market/example_learnwares/model0.py @@ -12,11 +12,5 @@ class MyModel(BaseModel): model = joblib.load(model_path) self.model = model - def fit(self, X: np.ndarray, y: np.ndarray): - pass - def predict(self, X: np.ndarray) -> np.ndarray: return self.model.predict(X) - - def finetune(self, X: np.ndarray, y: np.ndarray): - pass diff --git a/tests/test_hetero_market/example_learnwares/example_learnware_1/__init__.py b/tests/test_hetero_market/example_learnwares/model1.py similarity index 78% rename from tests/test_hetero_market/example_learnwares/example_learnware_1/__init__.py rename to tests/test_hetero_market/example_learnwares/model1.py index 11fb9e0..aca46b3 100644 --- a/tests/test_hetero_market/example_learnwares/example_learnware_1/__init__.py +++ b/tests/test_hetero_market/example_learnwares/model1.py @@ -12,11 +12,5 @@ class MyModel(BaseModel): model = joblib.load(model_path) self.model = model - def fit(self, X: np.ndarray, y: np.ndarray): - pass - def predict(self, X: np.ndarray) -> np.ndarray: return self.model.predict(X) - - def finetune(self, X: np.ndarray, y: np.ndarray): - pass diff --git a/tests/test_hetero_market/example_learnwares/example_learnware_0/requirements.txt b/tests/test_hetero_market/example_learnwares/requirements.txt similarity index 100% rename from tests/test_hetero_market/example_learnwares/example_learnware_0/requirements.txt rename to tests/test_hetero_market/example_learnwares/requirements.txt diff --git a/tests/test_hetero_market/test_hetero.py b/tests/test_hetero_market/test_hetero.py index 
58c285e..0755699 100644 --- a/tests/test_hetero_market/test_hetero.py +++ b/tests/test_hetero_market/test_hetero.py @@ -75,7 +75,7 @@ class TestMarket(unittest.TestCase): example_learnware_idx = i % 2 input_dim = input_shape_list[example_learnware_idx] - example_learnware_name = "example_learnwares/example_learnware_%d" % (example_learnware_idx) + learnware_example_dir = "example_learnwares" X, y = make_regression(n_samples=5000, n_informative=15, n_features=input_dim, noise=0.1, random_state=42) @@ -89,16 +89,16 @@ class TestMarket(unittest.TestCase): init_file = os.path.join(dir_path, "__init__.py") copyfile( - os.path.join(curr_root, example_learnware_name, "__init__.py"), init_file + os.path.join(curr_root, learnware_example_dir, f"model{example_learnware_idx}.py"), init_file ) # cp example_init.py init_file yaml_file = os.path.join(dir_path, "learnware.yaml") copyfile( - os.path.join(curr_root, example_learnware_name, "learnware.yaml"), yaml_file + os.path.join(curr_root, learnware_example_dir, "learnware.yaml"), yaml_file ) # cp example.yaml yaml_file env_file = os.path.join(dir_path, "requirements.txt") - copyfile(os.path.join(curr_root, example_learnware_name, "requirements.txt"), env_file) + copyfile(os.path.join(curr_root, learnware_example_dir, "requirements.txt"), env_file) zip_file = dir_path + ".zip" # zip -q -r -j zip_file dir_path From 977f188ad6e281a2307d3a56783233a44ca3c1df Mon Sep 17 00:00:00 2001 From: bxdd Date: Thu, 16 Nov 2023 20:25:11 +0800 Subject: [PATCH 90/90] [MNT] del error import --- .../market/heterogeneous/organizer/hetero_map/__init__.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/learnware/market/heterogeneous/organizer/hetero_map/__init__.py b/learnware/market/heterogeneous/organizer/hetero_map/__init__.py index 37b5d3e..42572b5 100644 --- a/learnware/market/heterogeneous/organizer/hetero_map/__init__.py +++ b/learnware/market/heterogeneous/organizer/hetero_map/__init__.py @@ -5,7 +5,6 @@ import pandas 
as pd import torch import torch.nn.functional as F from torch import Tensor, nn -from loguru import logger from .....specification import HeteroMapTableSpecification, RKMETableSpecification from .feature_extractor import CLSToken, FeatureProcessor, FeatureTokenizer @@ -126,7 +125,7 @@ class HeteroMap(nn.Module): self.num_partition = num_partition self.overlap_ratio = overlap_ratio self.to(device) - + def to(self, device: Union[str, torch.device]): """Moves the model and all its submodules to the specified device @@ -141,7 +140,7 @@ class HeteroMap(nn.Module): The instance of HeteroMap after moving to the specified device. """ super(HeteroMap, self).to(device) - if hasattr(self, 'feature_processor'): + if hasattr(self, "feature_processor"): self.feature_processor.device = device self.device = device return self