[FIX] fix dataset_m5_workflow

2 years ago · 3a22a48ae5
--- a/examples/dataset_m5_workflow/example_init.py
+++ b/examples/dataset_m5_workflow/example_init.py
@@ -7,7 +7,7 @@ from learnware.model import BaseModel

 class Model(BaseModel):
    def __init__(self):
        super(Model, self).__init__(input_shape=(82,), output_shape=())
        super(Model, self).__init__(input_shape=(82,), output_shape=(1,))
        dir_path = os.path.dirname(os.path.abspath(__file__))
        self.model = lgb.Booster(model_file=os.path.join(dir_path, "model.out"))

--- a/examples/dataset_m5_workflow/main.py
+++ b/examples/dataset_m5_workflow/main.py
@@ -8,7 +8,7 @@ from shutil import copyfile, rmtree

 import learnware
 from learnware.market import instantiate_learnware_market, BaseUserInfo
 from learnware.market import database_ops
 # from learnware.market import database_ops
 from learnware.reuse import JobSelectorReuser, AveragingReuser
 from learnware.specification import generate_rkme_spec
 from m5 import DataLoader
@@ -17,27 +17,40 @@ from learnware.logger import get_module_logger
 logger = get_module_logger("m5_test", level="INFO")


 output_description = {
    "Dimension": 1,
    "Description": {},
 }

 input_description = {
    "Dimension": 82,
    "Description": {},
 }

 semantic_specs = [
    {
        "Data": {"Values": ["Tabular"], "Type": "Class"},
        "Task": {"Values": ["Classification"], "Type": "Class"},
        "Data": {"Values": ["Table"], "Type": "Class"},
        "Task": {"Values": ["Regression"], "Type": "Class"},
        "Library": {"Values": ["Scikit-learn"], "Type": "Class"},
        "Scenario": {"Values": ["Business"], "Type": "Tag"},
        "Description": {"Values": "", "Type": "String"},
        "Name": {"Values": "learnware_1", "Type": "String"},
        "Input": input_description,
        "Output": output_description,
    }
 ]

 user_semantic = {
    "Data": {"Values": ["Tabular"], "Type": "Class"},
    "Task": {"Values": ["Classification"], "Type": "Class"},
    "Data": {"Values": ["Table"], "Type": "Class"},
    "Task": {"Values": ["Regression"], "Type": "Class"},
    "Library": {"Values": ["Scikit-learn"], "Type": "Class"},
    "Scenario": {"Values": ["Business"], "Type": "Tag"},
    "Description": {"Values": "", "Type": "String"},
    "Name": {"Values": "", "Type": "String"},
    "Input": input_description,
    "Output": output_description,
 }


 class M5DatasetWorkflow:
    def _init_m5_dataset(self):
        m5 = DataLoader()
@@ -69,8 +82,8 @@ class M5DatasetWorkflow:
            easy_market.add_learnware(zip_path, semantic_spec)

        print("Total Item:", len(easy_market))
        curr_inds = easy_market._get_ids()
        print("Available ids:", curr_inds)
        # curr_inds = easy_market._get_ids()
        # print("Available ids:", curr_inds)

    def prepare_learnware(self, regenerate_flag=False):
        if regenerate_flag:
@@ -171,7 +184,7 @@ class M5DatasetWorkflow:
            job_selector_score = m5.score(test_y, job_selector_predict_y)
            print(f"mixture reuse loss (job selector): {job_selector_score}")

            reuse_ensemble = AveragingReuser(learnware_list=mixture_learnware_list, mode="vote")
            reuse_ensemble = AveragingReuser(learnware_list=mixture_learnware_list, mode="vote_by_prob")
            ensemble_predict_y = reuse_ensemble.predict(user_data=test_x)
            ensemble_score = m5.score(test_y, ensemble_predict_y)
            print(f"mixture reuse loss (ensemble): {ensemble_score}\n")
--- a/examples/dataset_pfs_workflow/main.py
+++ b/examples/dataset_pfs_workflow/main.py
@@ -15,25 +15,38 @@ from learnware.logger import get_module_logger

 logger = get_module_logger("pfs_test", level="INFO")

 output_description = {
    "Dimension": 1,
    "Description": {},
 }

 input_description = {
    "Dimension": 31,
    "Description": {},
 }

 semantic_specs = [
    {
        "Data": {"Values": ["Tabular"], "Type": "Class"},
        "Task": {"Values": ["Classification"], "Type": "Class"},
        "Data": {"Values": ["Table"], "Type": "Class"},
        "Task": {"Values": ["Regression"], "Type": "Class"},
        "Library": {"Values": ["Scikit-learn"], "Type": "Class"},
        "Scenario": {"Values": ["Business"], "Type": "Tag"},
        "Description": {"Values": "", "Type": "String"},
        "Name": {"Values": "learnware_1", "Type": "String"},
        "Input": input_description,
        "Output": output_description,
    }
 ]

 user_semantic = {
    "Data": {"Values": ["Tabular"], "Type": "Class"},
    "Task": {"Values": ["Classification"], "Type": "Class"},
    "Data": {"Values": ["Table"], "Type": "Class"},
    "Task": {"Values": ["Regression"], "Type": "Class"},
    "Library": {"Values": ["Scikit-learn"], "Type": "Class"},
    "Scenario": {"Values": ["Business"], "Type": "Tag"},
    "Description": {"Values": "", "Type": "String"},
    "Name": {"Values": "", "Type": "String"},
    "Name": {"Values": "learnware_1", "Type": "String"},
    "Input": input_description,
    "Output": output_description,
 }


@@ -66,8 +79,8 @@ class PFSDatasetWorkflow:
            easy_market.add_learnware(zip_path, semantic_spec)

        print("Total Item:", len(easy_market))
        curr_inds = easy_market._get_ids()
        print("Available ids:", curr_inds)
        # curr_inds = easy_market._get_ids()
        # print("Available ids:", curr_inds)

    def prepare_learnware(self, regenerate_flag=False):
        if regenerate_flag:
--- a/examples/dataset_text_workflow/main.py
+++ b/examples/dataset_text_workflow/main.py
@@ -196,10 +196,6 @@ def test_search(gamma=0.1, load_market=True):
        ensemble_score_list.append(ensemble_score)
        print(f"mixture reuse accuracy (ensemble): {ensemble_score}")

        select_list.append(acc_list[0])
        avg_list.append(np.mean(acc_list))
        improve_list.append((acc_list[0] - np.mean(acc_list)) / np.mean(acc_list))

        # test reuse (ensemblePruning)
        reuse_pruning = EnsemblePruningReuser(learnware_list=mixture_learnware_list)
        pruning_predict_y = reuse_pruning.predict(user_data=user_data)
--- a/learnware/market/easy/checker.py
+++ b/learnware/market/easy/checker.py
@@ -120,6 +120,7 @@ class EasyStatChecker(BaseChecker):
                raise ValueError(f"not supported spec type for spec_type = {spec_type}")

            # Check output
            outputs = learnware.predict(inputs)
            try:
                outputs = learnware.predict(inputs)
            except Exception:
--- a/learnware/market/easy/searcher.py
+++ b/learnware/market/easy/searcher.py
@@ -277,7 +277,7 @@ class EasyStatSearcher(BaseSearcher):

        # beta must be nonnegative
        weight, obj = rkme_solve_qp(K, C)
        weight = torch.from_numpy(weight).reshape(-1).double().to(user_rkme.device)
        weight = weight.double().to(user_rkme.device)
        score = user_rkme.inner_prod(user_rkme) + 2 * obj

        return weight.detach().cpu().numpy().reshape(-1), score
--- a/learnware/specification/regular/table/rkme.py
+++ b/learnware/specification/regular/table/rkme.py
@@ -13,6 +13,7 @@ from qpsolvers import solve_qp, Problem, solve_problem
 from collections import Counter
 from typing import Tuple, Any, List, Union, Dict
 import scipy
 from sklearn.cluster import MiniBatchKMeans

 try:
    import faiss
@@ -27,10 +28,10 @@ from ....logger import get_module_logger

 logger = get_module_logger("rkme")

 if not _FAISS_INSTALLED:
    logger.warning(
        "Required faiss version >= 1.7.1 is not detected! Please run 'conda install -c pytorch faiss-cpu' first"
    )
 # if not _FAISS_INSTALLED:
 #     logger.warning(
 #         "Required faiss version >= 1.7.1 is not detected! Please run 'conda install -c pytorch faiss-cpu' first"
 #     )


 class RKMETableSpecification(RegularStatsSpecification):
@@ -127,8 +128,8 @@ class RKMETableSpecification(RegularStatsSpecification):
            self.beta = torch.from_numpy(self.beta).double().to(self.device)
            return

        # Initialize Z by clustering, utiliing faiss to speed up the process.
        self._init_z_by_faiss(X, K)
        # Initialize Z by clustering, utilizing kmeans or faiss to speed up the process.
        self._init_z_by_kmeans(X, K)
        self._update_beta(X, nonnegative_beta)

        # Alternating optimize Z and beta
@@ -156,6 +157,22 @@ class RKMETableSpecification(RegularStatsSpecification):
        center = torch.from_numpy(kmeans.centroids).double()
        self.z = center

    def _init_z_by_kmeans(self, X: Union[np.ndarray, torch.tensor], K: int):
        """Initialize Z with the cluster centers of a mini-batch k-means fit on X.

        Parameters
        ----------
        X : np.ndarray or torch.tensor
            Raw data, cast to float32 via ``.astype``. NOTE(review): confirm callers pass np.ndarray.
        K : int
            Size of the constructed reduced set; used as the number of clusters.
        """
        X = X.astype("float32")
        kmeans = MiniBatchKMeans(n_clusters=K, max_iter=100, verbose=False, n_init="auto")
        kmeans.fit(X)
        center = torch.from_numpy(kmeans.cluster_centers_).double()
        self.z = center

    def _update_beta(self, X: Any, nonnegative_beta: bool = True):
        """Fix Z and update beta using its closed-form solution.

--- a/setup.py
+++ b/setup.py
@@ -75,11 +75,11 @@ REQUIRED = [
    "langdetect>=1.0.9",
    "huggingface-hub<0.18",
    "portalocker>=2.0.0",
    "qpsolvers[clarabel]>=4.0.1"
    "qpsolvers[clarabel]>=4.0.1",
 ]

 if get_platform() != MACOS:
    REQUIRED.append("faiss-cpu>=1.7.1")
 # if get_platform() != MACOS:
 #     REQUIRED.append("faiss-cpu>=1.7.1")

 here = os.path.abspath(os.path.dirname(__file__))
 with open(os.path.join(here, "README.md"), encoding="utf-8") as f: