| @@ -7,7 +7,7 @@ from learnware.model import BaseModel | |||
| class Model(BaseModel): | |||
| def __init__(self): | |||
| super(Model, self).__init__(input_shape=(82,), output_shape=()) | |||
| super(Model, self).__init__(input_shape=(82,), output_shape=(1,)) | |||
| dir_path = os.path.dirname(os.path.abspath(__file__)) | |||
| self.model = lgb.Booster(model_file=os.path.join(dir_path, "model.out")) | |||
| @@ -8,7 +8,7 @@ from shutil import copyfile, rmtree | |||
| import learnware | |||
| from learnware.market import instantiate_learnware_market, BaseUserInfo | |||
| from learnware.market import database_ops | |||
| # from learnware.market import database_ops | |||
| from learnware.reuse import JobSelectorReuser, AveragingReuser | |||
| from learnware.specification import generate_rkme_spec | |||
| from m5 import DataLoader | |||
| @@ -17,27 +17,40 @@ from learnware.logger import get_module_logger | |||
| logger = get_module_logger("m5_test", level="INFO") | |||
| output_description = { | |||
| "Dimension": 1, | |||
| "Description": {}, | |||
| } | |||
| input_description = { | |||
| "Dimension": 82, | |||
| "Description": {}, | |||
| } | |||
| semantic_specs = [ | |||
| { | |||
| "Data": {"Values": ["Tabular"], "Type": "Class"}, | |||
| "Task": {"Values": ["Classification"], "Type": "Class"}, | |||
| "Data": {"Values": ["Table"], "Type": "Class"}, | |||
| "Task": {"Values": ["Regression"], "Type": "Class"}, | |||
| "Library": {"Values": ["Scikit-learn"], "Type": "Class"}, | |||
| "Scenario": {"Values": ["Business"], "Type": "Tag"}, | |||
| "Description": {"Values": "", "Type": "String"}, | |||
| "Name": {"Values": "learnware_1", "Type": "String"}, | |||
| "Input": input_description, | |||
| "Output": output_description, | |||
| } | |||
| ] | |||
| user_semantic = { | |||
| "Data": {"Values": ["Tabular"], "Type": "Class"}, | |||
| "Task": {"Values": ["Classification"], "Type": "Class"}, | |||
| "Data": {"Values": ["Table"], "Type": "Class"}, | |||
| "Task": {"Values": ["Regression"], "Type": "Class"}, | |||
| "Library": {"Values": ["Scikit-learn"], "Type": "Class"}, | |||
| "Scenario": {"Values": ["Business"], "Type": "Tag"}, | |||
| "Description": {"Values": "", "Type": "String"}, | |||
| "Name": {"Values": "", "Type": "String"}, | |||
| "Input": input_description, | |||
| "Output": output_description, | |||
| } | |||
| class M5DatasetWorkflow: | |||
| def _init_m5_dataset(self): | |||
| m5 = DataLoader() | |||
| @@ -69,8 +82,8 @@ class M5DatasetWorkflow: | |||
| easy_market.add_learnware(zip_path, semantic_spec) | |||
| print("Total Item:", len(easy_market)) | |||
| curr_inds = easy_market._get_ids() | |||
| print("Available ids:", curr_inds) | |||
| # curr_inds = easy_market._get_ids() | |||
| # print("Available ids:", curr_inds) | |||
| def prepare_learnware(self, regenerate_flag=False): | |||
| if regenerate_flag: | |||
| @@ -171,7 +184,7 @@ class M5DatasetWorkflow: | |||
| job_selector_score = m5.score(test_y, job_selector_predict_y) | |||
| print(f"mixture reuse loss (job selector): {job_selector_score}") | |||
| reuse_ensemble = AveragingReuser(learnware_list=mixture_learnware_list, mode="vote") | |||
| reuse_ensemble = AveragingReuser(learnware_list=mixture_learnware_list, mode="vote_by_prob") | |||
| ensemble_predict_y = reuse_ensemble.predict(user_data=test_x) | |||
| ensemble_score = m5.score(test_y, ensemble_predict_y) | |||
| print(f"mixture reuse loss (ensemble): {ensemble_score}\n") | |||
| @@ -15,25 +15,38 @@ from learnware.logger import get_module_logger | |||
| logger = get_module_logger("pfs_test", level="INFO") | |||
| output_description = { | |||
| "Dimension": 1, | |||
| "Description": {}, | |||
| } | |||
| input_description = { | |||
| "Dimension": 31, | |||
| "Description": {}, | |||
| } | |||
| semantic_specs = [ | |||
| { | |||
| "Data": {"Values": ["Tabular"], "Type": "Class"}, | |||
| "Task": {"Values": ["Classification"], "Type": "Class"}, | |||
| "Data": {"Values": ["Table"], "Type": "Class"}, | |||
| "Task": {"Values": ["Regression"], "Type": "Class"}, | |||
| "Library": {"Values": ["Scikit-learn"], "Type": "Class"}, | |||
| "Scenario": {"Values": ["Business"], "Type": "Tag"}, | |||
| "Description": {"Values": "", "Type": "String"}, | |||
| "Name": {"Values": "learnware_1", "Type": "String"}, | |||
| "Input": input_description, | |||
| "Output": output_description, | |||
| } | |||
| ] | |||
| user_semantic = { | |||
| "Data": {"Values": ["Tabular"], "Type": "Class"}, | |||
| "Task": {"Values": ["Classification"], "Type": "Class"}, | |||
| "Data": {"Values": ["Table"], "Type": "Class"}, | |||
| "Task": {"Values": ["Regression"], "Type": "Class"}, | |||
| "Library": {"Values": ["Scikit-learn"], "Type": "Class"}, | |||
| "Scenario": {"Values": ["Business"], "Type": "Tag"}, | |||
| "Description": {"Values": "", "Type": "String"}, | |||
| "Name": {"Values": "", "Type": "String"}, | |||
| "Name": {"Values": "learnware_1", "Type": "String"}, | |||
| "Input": input_description, | |||
| "Output": output_description, | |||
| } | |||
| @@ -66,8 +79,8 @@ class PFSDatasetWorkflow: | |||
| easy_market.add_learnware(zip_path, semantic_spec) | |||
| print("Total Item:", len(easy_market)) | |||
| curr_inds = easy_market._get_ids() | |||
| print("Available ids:", curr_inds) | |||
| # curr_inds = easy_market._get_ids() | |||
| # print("Available ids:", curr_inds) | |||
| def prepare_learnware(self, regenerate_flag=False): | |||
| if regenerate_flag: | |||
| @@ -196,10 +196,6 @@ def test_search(gamma=0.1, load_market=True): | |||
| ensemble_score_list.append(ensemble_score) | |||
| print(f"mixture reuse accuracy (ensemble): {ensemble_score}") | |||
| select_list.append(acc_list[0]) | |||
| avg_list.append(np.mean(acc_list)) | |||
| improve_list.append((acc_list[0] - np.mean(acc_list)) / np.mean(acc_list)) | |||
| # test reuse (ensemblePruning) | |||
| reuse_pruning = EnsemblePruningReuser(learnware_list=mixture_learnware_list) | |||
| pruning_predict_y = reuse_pruning.predict(user_data=user_data) | |||
| @@ -120,6 +120,7 @@ class EasyStatChecker(BaseChecker): | |||
| raise ValueError(f"not supported spec type for spec_type = {spec_type}") | |||
| # Check output | |||
| outputs = learnware.predict(inputs) | |||
| try: | |||
| outputs = learnware.predict(inputs) | |||
| except Exception: | |||
| @@ -277,7 +277,7 @@ class EasyStatSearcher(BaseSearcher): | |||
| # beta must be nonnegative | |||
| weight, obj = rkme_solve_qp(K, C) | |||
| weight = torch.from_numpy(weight).reshape(-1).double().to(user_rkme.device) | |||
| weight = weight.double().to(user_rkme.device) | |||
| score = user_rkme.inner_prod(user_rkme) + 2 * obj | |||
| return weight.detach().cpu().numpy().reshape(-1), score | |||
| @@ -13,6 +13,7 @@ from qpsolvers import solve_qp, Problem, solve_problem | |||
| from collections import Counter | |||
| from typing import Tuple, Any, List, Union, Dict | |||
| import scipy | |||
| from sklearn.cluster import MiniBatchKMeans | |||
| try: | |||
| import faiss | |||
| @@ -27,10 +28,10 @@ from ....logger import get_module_logger | |||
| logger = get_module_logger("rkme") | |||
| if not _FAISS_INSTALLED: | |||
| logger.warning( | |||
| "Required faiss version >= 1.7.1 is not detected! Please run 'conda install -c pytorch faiss-cpu' first" | |||
| ) | |||
| # if not _FAISS_INSTALLED: | |||
| # logger.warning( | |||
| # "Required faiss version >= 1.7.1 is not detected! Please run 'conda install -c pytorch faiss-cpu' first" | |||
| # ) | |||
| class RKMETableSpecification(RegularStatsSpecification): | |||
| @@ -127,8 +128,8 @@ class RKMETableSpecification(RegularStatsSpecification): | |||
| self.beta = torch.from_numpy(self.beta).double().to(self.device) | |||
| return | |||
| # Initialize Z by clustering, utiliing faiss to speed up the process. | |||
| self._init_z_by_faiss(X, K) | |||
| # Initialize Z by clustering, utiliing kmeans or faiss to speed up the process. | |||
| self._init_z_by_kmeans(X, K) | |||
| self._update_beta(X, nonnegative_beta) | |||
| # Alternating optimize Z and beta | |||
| @@ -156,6 +157,22 @@ class RKMETableSpecification(RegularStatsSpecification): | |||
| center = torch.from_numpy(kmeans.centroids).double() | |||
| self.z = center | |||
| def _init_z_by_kmeans(self, X: Union[np.ndarray, torch.tensor], K: int): | |||
| """Intialize Z by kmeans clustering. | |||
| Parameters | |||
| ---------- | |||
| X : np.ndarray or torch.tensor | |||
| Raw data in np.ndarray format or torch.tensor format. | |||
| K : int | |||
| Size of the construced reduced set. | |||
| """ | |||
| X = X.astype("float32") | |||
| kmeans = MiniBatchKMeans(n_clusters=K, max_iter=100, verbose=False, n_init="auto") | |||
| kmeans.fit(X) | |||
| center = torch.from_numpy(kmeans.cluster_centers_).double() | |||
| self.z = center | |||
| def _update_beta(self, X: Any, nonnegative_beta: bool = True): | |||
| """Fix Z and update beta using its closed-form solution. | |||
| @@ -75,11 +75,11 @@ REQUIRED = [ | |||
| "langdetect>=1.0.9", | |||
| "huggingface-hub<0.18", | |||
| "portalocker>=2.0.0", | |||
| "qpsolvers[clarabel]>=4.0.1" | |||
| "qpsolvers[clarabel]>=4.0.1", | |||
| ] | |||
| if get_platform() != MACOS: | |||
| REQUIRED.append("faiss-cpu>=1.7.1") | |||
| # if get_platform() != MACOS: | |||
| # REQUIRED.append("faiss-cpu>=1.7.1") | |||
| here = os.path.abspath(os.path.dirname(__file__)) | |||
| with open(os.path.join(here, "README.md"), encoding="utf-8") as f: | |||