[MNT] enable the project to pass the test of flake8tags/v0.3.2
| @@ -0,0 +1,7 @@ | |||
| [flake8] | |||
| max-line-length = 120 | |||
| ignore = | |||
| E203,E501,F841,W503 | |||
| per-file-ignores = | |||
| __init__.py: F401 | |||
| ./learnware/utils/import_utils.py: F401 | |||
| @@ -100,12 +100,12 @@ html_logo = "_static/img/logo/logo1.png" | |||
| # These folders are copied to the documentation's HTML output | |||
| html_static_path = ['_static'] | |||
| html_static_path = ["_static"] | |||
| # These paths are either relative to html_static_path | |||
| # or fully qualified paths (eg. https://...) | |||
| html_css_files = [ | |||
| 'css/custom_style.css', | |||
| "css/custom_style.css", | |||
| ] | |||
| # -- Options for HTMLHelp output ------------------------------------------ | |||
| @@ -9,19 +9,15 @@ from learnware.utils import choose_device | |||
| @torch.no_grad() | |||
| def evaluate(model, evaluate_set: Dataset, device=None, distribution=True): | |||
| device = choose_device(0) if device is None else device | |||
| if isinstance(model, nn.Module): | |||
| model.eval() | |||
| mapping = lambda m, x: m(x) | |||
| else: | |||
| mapping = lambda m, x: m.predict(x) | |||
| criterion = nn.CrossEntropyLoss(reduction="sum") | |||
| total, correct, loss = 0, 0, torch.as_tensor(0.0, dtype=torch.float32, device=device) | |||
| dataloader = DataLoader(evaluate_set, batch_size=1024, shuffle=True) | |||
| for i, (X, y) in enumerate(dataloader): | |||
| X, y = X.to(device), y.to(device) | |||
| out = mapping(model, X) | |||
| out = model(X) if isinstance(model, nn.Module) else model.predict(X) | |||
| if not torch.is_tensor(out): | |||
| out = torch.from_numpy(out).to(device) | |||
| @@ -49,7 +49,7 @@ class ImageDatasetWorkflow: | |||
| plt.xlabel("Amout of Labeled User Data", fontsize=14) | |||
| plt.ylabel("1 - Accuracy", fontsize=14) | |||
| plt.title(f"Results on Image Experimental Scenario", fontsize=16) | |||
| plt.title("Results on Image Experimental Scenario", fontsize=16) | |||
| plt.legend(fontsize=14) | |||
| plt.tight_layout() | |||
| plt.savefig(os.path.join(self.fig_path, "image_labeled_curves.svg"), bbox_inches="tight", dpi=700) | |||
| @@ -61,7 +61,7 @@ class ImageDatasetWorkflow: | |||
| self.user_semantic = client.get_semantic_specification(self.image_benchmark.learnware_ids[0]) | |||
| self.user_semantic["Name"]["Values"] = "" | |||
| if len(self.image_market) == 0 or rebuild == True: | |||
| if len(self.image_market) == 0 or rebuild is True: | |||
| for learnware_id in self.image_benchmark.learnware_ids: | |||
| with tempfile.TemporaryDirectory(prefix="image_benchmark_") as tempdir: | |||
| zip_path = os.path.join(tempdir, f"{learnware_id}.zip") | |||
| @@ -71,7 +71,7 @@ class ImageDatasetWorkflow: | |||
| client.download_learnware(learnware_id, zip_path) | |||
| self.image_market.add_learnware(zip_path, semantic_spec) | |||
| break | |||
| except: | |||
| except Exception: | |||
| time.sleep(1) | |||
| continue | |||
| @@ -1,8 +0,0 @@ | |||
| model: | |||
| class_name: Model | |||
| kwargs: {} | |||
| stat_specifications: | |||
| - module_path: learnware.specification | |||
| class_name: RKMETableSpecification | |||
| file_name: rkme.json | |||
| kwargs: {} | |||
| @@ -1,21 +0,0 @@ | |||
| import os | |||
| import joblib | |||
| import numpy as np | |||
| import lightgbm as lgb | |||
| from learnware.model import BaseModel | |||
| class Model(BaseModel): | |||
| def __init__(self): | |||
| super(Model, self).__init__(input_shape=(82,), output_shape=(1,)) | |||
| dir_path = os.path.dirname(os.path.abspath(__file__)) | |||
| self.model = lgb.Booster(model_file=os.path.join(dir_path, "model.out")) | |||
| def fit(self, X: np.ndarray, y: np.ndarray): | |||
| pass | |||
| def predict(self, X: np.ndarray) -> np.ndarray: | |||
| return self.model.predict(X) | |||
| def finetune(self, X: np.ndarray, y: np.ndarray): | |||
| pass | |||
| @@ -1,3 +0,0 @@ | |||
| # M5 Dataset | |||
| Walmart store, involves the unit sales of various products sold in the USA, organized in the form of grouped time series. More specifically, the dataset involves the unit sales of 3049 products, classified in 3 product categories (Hobbies, Foods, and Household). | |||
| @@ -1,65 +0,0 @@ | |||
| from cgi import test | |||
| import os | |||
| import joblib | |||
| import lightgbm as lgb | |||
| from .config import store_list, model_dir | |||
| from .utils import acquire_data, get_weights, model_predict, score, measure_aux_algo | |||
| from .generate_data import regenerate_data | |||
| from .train import retrain_models, grid_training_sample, train_adaptation_grid | |||
| class DataLoader: | |||
| def __init__(self): | |||
| self.algo = "ridge" | |||
| def set_algo(self, algo): | |||
| self.algo = algo | |||
| def get_algo_list(self): | |||
| return ["lgb", "ridge"] | |||
| def get_idx_list(self): | |||
| return list(range(len(store_list))) | |||
| def get_idx_data(self, idx): | |||
| store = store_list[idx] | |||
| # fill_flag = self.algo == "ridge" | |||
| fill_flag = True | |||
| return acquire_data(store, fill_flag) | |||
| def get_weights(self): | |||
| return get_weights(self.algo) | |||
| def get_model_path(self, idx): | |||
| return os.path.join(model_dir, "{}_{}.out".format(self.algo, store_list[idx])) | |||
| def predict(self, idx, test_x): | |||
| store = store_list[idx] | |||
| if os.path.exists(os.path.join(model_dir, f"{self.algo}_{store}.out")): | |||
| return model_predict(self.algo, idx, test_x) | |||
| else: | |||
| self.retrain_models() | |||
| return model_predict(self.algo, idx, test_x) | |||
| def score(self, real_y, pred_y, sample_weight=None, multioutput="raw_values"): | |||
| return score(real_y, pred_y, sample_weight, multioutput) | |||
| def regenerate_data(self): | |||
| regenerate_data() | |||
| def retrain_models(self): | |||
| retrain_models(self.algo) | |||
| def grid_training_sample(self, user_list=list(range(10))): | |||
| grid_training_sample(self.algo, user_list) | |||
| def train_adaptation_grid( | |||
| self, max_sample, test_sample, user_list=list(range(10)), adaptation_model=[], residual=False | |||
| ): | |||
| train_adaptation_grid(self.algo, max_sample, test_sample, user_list, adaptation_model, residual) | |||
| def measure_aux_algo(self, idx, test_sample, model): | |||
| return measure_aux_algo(idx, test_sample, model) | |||
| @@ -1,139 +0,0 @@ | |||
| import os | |||
| ROOT_PATH = os.path.abspath(os.path.join(__file__, "..", "data")) | |||
| raw_data_dir = os.path.join(ROOT_PATH, "raw") | |||
| processed_data_dir = os.path.join(ROOT_PATH, "processed") | |||
| model_dir = os.path.join(ROOT_PATH, "models") | |||
| grid_dir = os.path.join(ROOT_PATH, "grid_sample") | |||
| TARGET = "sales" | |||
| START_TRAIN = 1 | |||
| END_TRAIN = 1941 - 28 | |||
| category_list = ["item_id", "dept_id", "cat_id", "event_name_1", "event_name_2", "event_type_1", "event_type_2"] | |||
| features_columns = [ | |||
| "item_id", | |||
| "dept_id", | |||
| "cat_id", | |||
| "release", | |||
| "sell_price", | |||
| "price_max", | |||
| "price_min", | |||
| "price_std", | |||
| "price_mean", | |||
| "price_norm", | |||
| "price_nunique", | |||
| "item_nunique", | |||
| "price_momentum", | |||
| "price_momentum_m", | |||
| "price_momentum_y", | |||
| "event_name_1", | |||
| "event_type_1", | |||
| "event_name_2", | |||
| "event_type_2", | |||
| "snap", | |||
| "tm_d", | |||
| "tm_w", | |||
| "tm_m", | |||
| "tm_y", | |||
| "tm_wm", | |||
| "tm_dw", | |||
| "tm_w_end", | |||
| "sales_lag_28", | |||
| "sales_lag_29", | |||
| "sales_lag_30", | |||
| "sales_lag_31", | |||
| "sales_lag_32", | |||
| "sales_lag_33", | |||
| "sales_lag_34", | |||
| "sales_lag_35", | |||
| "sales_lag_36", | |||
| "sales_lag_37", | |||
| "sales_lag_38", | |||
| "sales_lag_39", | |||
| "sales_lag_40", | |||
| "sales_lag_41", | |||
| "sales_lag_42", | |||
| "rolling_mean_7", | |||
| "rolling_std_7", | |||
| "rolling_mean_14", | |||
| "rolling_std_14", | |||
| "rolling_mean_30", | |||
| "rolling_std_30", | |||
| "rolling_mean_60", | |||
| "rolling_std_60", | |||
| "rolling_mean_180", | |||
| "rolling_std_180", | |||
| "rolling_mean_tmp_1_7", | |||
| "rolling_mean_tmp_1_14", | |||
| "rolling_mean_tmp_1_30", | |||
| "rolling_mean_tmp_1_60", | |||
| "rolling_mean_tmp_7_7", | |||
| "rolling_mean_tmp_7_14", | |||
| "rolling_mean_tmp_7_30", | |||
| "rolling_mean_tmp_7_60", | |||
| "rolling_mean_tmp_14_7", | |||
| "rolling_mean_tmp_14_14", | |||
| "rolling_mean_tmp_14_30", | |||
| "rolling_mean_tmp_14_60", | |||
| # "enc_state_id_mean", | |||
| # "enc_state_id_std", | |||
| # "enc_store_id_mean", | |||
| # "enc_store_id_std", | |||
| "enc_cat_id_mean", | |||
| "enc_cat_id_std", | |||
| "enc_dept_id_mean", | |||
| "enc_dept_id_std", | |||
| "enc_state_id_cat_id_mean", | |||
| "enc_state_id_cat_id_std", | |||
| "enc_state_id_dept_id_mean", | |||
| "enc_state_id_dept_id_std", | |||
| "enc_store_id_cat_id_mean", | |||
| "enc_store_id_cat_id_std", | |||
| "enc_store_id_dept_id_mean", | |||
| "enc_store_id_dept_id_std", | |||
| "enc_item_id_mean", | |||
| "enc_item_id_std", | |||
| "enc_item_id_state_id_mean", | |||
| "enc_item_id_state_id_std", | |||
| "enc_item_id_store_id_mean", | |||
| "enc_item_id_store_id_std", | |||
| ] | |||
| label_column = ["sales"] | |||
| lgb_params_list = [ | |||
| [0.015, 224, 66], | |||
| [0.01, 224, 50], | |||
| [0.01, 300, 80], | |||
| [0.015, 128, 50], | |||
| [0.015, 300, 50], | |||
| [0.01, 300, 66], | |||
| [0.015, 300, 80], | |||
| [0.15, 224, 80], | |||
| [0.005, 300, 50], | |||
| [0.015, 224, 50], | |||
| ] | |||
| store_list = ["CA_1", "CA_2", "CA_3", "CA_4", "TX_1", "TX_2", "TX_3", "WI_1", "WI_2", "WI_3"] | |||
| dataset_info = { | |||
| "name": "M5", | |||
| "range of date": "2011.01.29-2016.06.19", | |||
| "description": "Walmart store, involves the unit sales of various products sold in the USA, organized in the form of grouped time series. More specifically, the dataset involves the unit sales of 3049 products, classified in 3 product categories (Hobbies, Foods, and Household).", | |||
| "location": [ | |||
| "California, United States", | |||
| "California, United States", | |||
| "California, United States", | |||
| "California, United States", | |||
| "Texas, United States", | |||
| "Texas, United States", | |||
| "Texas, United States", | |||
| "Wisconsin, United States", | |||
| "Wisconsin, United States", | |||
| "Wisconsin, United States", | |||
| ], | |||
| } | |||
| @@ -1,338 +0,0 @@ | |||
| import numpy as np | |||
| import pandas as pd | |||
| from math import ceil | |||
| from tqdm import tqdm | |||
| from copy import deepcopy as dco | |||
| import os, sys, gc, time, warnings, pickle, psutil, random | |||
| from sklearn.preprocessing import LabelEncoder | |||
| from sklearn.preprocessing import MinMaxScaler | |||
| from .utils import * | |||
| from .config import raw_data_dir, processed_data_dir, TARGET | |||
| warnings.filterwarnings("ignore") | |||
| # ==================== preprocessing ==================== | |||
| def melt_raw_data(train_df): | |||
| if os.path.exists(os.path.join(processed_data_dir, "melt_raw_data.pkl")): | |||
| return pd.read_pickle(os.path.join(processed_data_dir, "melt_raw_data.pkl")) | |||
| index_columns = ["id", "item_id", "dept_id", "cat_id", "store_id", "state_id"] | |||
| grid_df = pd.melt(train_df, id_vars=index_columns, var_name="d", value_name=TARGET) | |||
| for col in index_columns: | |||
| grid_df[col] = grid_df[col].astype("category") | |||
| grid_df.to_pickle(os.path.join(processed_data_dir, "melt_raw_data.pkl")) | |||
| return grid_df | |||
| def add_release_week(grid_df, prices_df, calendar_df): | |||
| if os.path.exists(os.path.join(processed_data_dir, "add_release_week.pkl")): | |||
| return pd.read_pickle(os.path.join(processed_data_dir, "add_release_week.pkl")) | |||
| release_df = prices_df.groupby(["store_id", "item_id"])["wm_yr_wk"].agg(["min"]).reset_index() | |||
| release_df.columns = ["store_id", "item_id", "release"] | |||
| grid_df = merge_by_concat(grid_df, release_df, ["store_id", "item_id"]) | |||
| grid_df = merge_by_concat(grid_df, calendar_df[["wm_yr_wk", "d"]], ["d"]) | |||
| # cutoff meaningless rows | |||
| grid_df = grid_df[grid_df["wm_yr_wk"] >= grid_df["release"]] | |||
| grid_df = grid_df.reset_index(drop=True) | |||
| # scale the release | |||
| grid_df["release"] = grid_df["release"] - grid_df["release"].min() | |||
| grid_df["release"] = grid_df["release"].astype(np.int16) | |||
| grid_df.to_pickle(os.path.join(processed_data_dir, "add_release_week.pkl")) | |||
| return grid_df | |||
| def add_prices(grid_df, prices_df, calendar_df): | |||
| if os.path.exists(os.path.join(processed_data_dir, "add_prices.pkl")): | |||
| return pd.read_pickle(os.path.join(processed_data_dir, "add_prices.pkl")) | |||
| prices_df["price_max"] = prices_df.groupby(["store_id", "item_id"])["sell_price"].transform("max") | |||
| prices_df["price_min"] = prices_df.groupby(["store_id", "item_id"])["sell_price"].transform("min") | |||
| prices_df["price_std"] = prices_df.groupby(["store_id", "item_id"])["sell_price"].transform("std") | |||
| prices_df["price_mean"] = prices_df.groupby(["store_id", "item_id"])["sell_price"].transform("mean") | |||
| prices_df["price_norm"] = prices_df["sell_price"] / prices_df["price_max"] | |||
| prices_df["price_nunique"] = prices_df.groupby(["store_id", "item_id"])["sell_price"].transform("nunique") | |||
| prices_df["item_nunique"] = prices_df.groupby(["store_id", "sell_price"])["item_id"].transform("nunique") | |||
| calendar_prices = calendar_df[["wm_yr_wk", "month", "year"]] | |||
| calendar_prices = calendar_prices.drop_duplicates(subset=["wm_yr_wk"]) | |||
| prices_df = prices_df.merge(calendar_prices[["wm_yr_wk", "month", "year"]], on=["wm_yr_wk"], how="left") | |||
| prices_df["price_momentum"] = prices_df["sell_price"] / prices_df.groupby(["store_id", "item_id"])[ | |||
| "sell_price" | |||
| ].transform(lambda x: x.shift(1)) | |||
| prices_df["price_momentum_m"] = prices_df["sell_price"] / prices_df.groupby(["store_id", "item_id", "month"])[ | |||
| "sell_price" | |||
| ].transform("mean") | |||
| prices_df["price_momentum_y"] = prices_df["sell_price"] / prices_df.groupby(["store_id", "item_id", "year"])[ | |||
| "sell_price" | |||
| ].transform("mean") | |||
| grid_df = reduce_mem_usage(grid_df) | |||
| prices_df = reduce_mem_usage(prices_df) | |||
| original_columns = list(grid_df) | |||
| grid_df = grid_df.merge(prices_df, on=["store_id", "item_id", "wm_yr_wk"], how="left") | |||
| grid_df = reduce_mem_usage(grid_df) | |||
| grid_df.to_pickle(os.path.join(processed_data_dir, "add_prices.pkl")) | |||
| return grid_df | |||
| def add_date(grid_df, calendar_df): | |||
| if os.path.exists(os.path.join(processed_data_dir, "add_date.pkl")): | |||
| return pd.read_pickle(os.path.join(processed_data_dir, "add_date.pkl")) | |||
| # merge calendar partly | |||
| icols = [ | |||
| "date", | |||
| "d", | |||
| "event_name_1", | |||
| "event_type_1", | |||
| "event_name_2", | |||
| "event_type_2", | |||
| "snap_CA", | |||
| "snap_TX", | |||
| "snap_WI", | |||
| ] | |||
| grid_df = grid_df.merge(calendar_df[icols], on=["d"], how="left") | |||
| # convert to category | |||
| icols = [ | |||
| "event_name_1", | |||
| "event_type_1", | |||
| "event_name_2", | |||
| "event_type_2", | |||
| "snap_CA", | |||
| "snap_TX", | |||
| "snap_WI", | |||
| ] | |||
| for col in icols: | |||
| grid_df[col] = grid_df[col].astype("category") | |||
| # make some features from date | |||
| grid_df["date"] = pd.to_datetime(grid_df["date"]) | |||
| grid_df["tm_d"] = grid_df["date"].dt.day.astype(np.int8) | |||
| grid_df["tm_w"] = grid_df["date"].dt.week.astype(np.int8) | |||
| grid_df["tm_m"] = grid_df["date"].dt.month.astype(np.int8) | |||
| grid_df["tm_y"] = grid_df["date"].dt.year | |||
| grid_df["tm_y"] = (grid_df["tm_y"] - grid_df["tm_y"].min()).astype(np.int8) | |||
| grid_df["tm_wm"] = grid_df["tm_d"].apply(lambda x: ceil(x / 7)).astype(np.int8) | |||
| grid_df["tm_dw"] = grid_df["date"].dt.dayofweek.astype(np.int8) | |||
| grid_df["tm_w_end"] = (grid_df["tm_dw"] >= 5).astype(np.int8) | |||
| # clear columns | |||
| grid_df["d"] = grid_df["d"].apply(lambda x: x[2:]).astype(np.int16) | |||
| grid_df = grid_df.drop("wm_yr_wk", 1) | |||
| grid_df.to_pickle(os.path.join(processed_data_dir, "add_date.pkl")) | |||
| return grid_df | |||
| def add_lags_rollings(grid_df): | |||
| if os.path.exists(os.path.join(processed_data_dir, "add_lags_rollings.pkl")): | |||
| return pd.read_pickle(os.path.join(processed_data_dir, "add_lags_rollings.pkl")) | |||
| # add lags | |||
| SHIFT_DAY = 28 | |||
| LAG_DAYS = [col for col in range(SHIFT_DAY, SHIFT_DAY + 15)] | |||
| grid_df = grid_df.assign( | |||
| **{ | |||
| "{}_lag_{}".format(col, l): grid_df.groupby(["id"])[col].transform(lambda x: x.shift(l)) | |||
| for l in LAG_DAYS | |||
| for col in [TARGET] | |||
| } | |||
| ) | |||
| for col in list(grid_df): | |||
| if "lag" in col: | |||
| grid_df[col] = grid_df[col].astype(np.float16) | |||
| # add rollings | |||
| for i in [7, 14, 30, 60, 180]: | |||
| grid_df["rolling_mean_" + str(i)] = ( | |||
| grid_df.groupby(["id"])[TARGET].transform(lambda x: x.shift(SHIFT_DAY).rolling(i).mean()).astype(np.float16) | |||
| ) | |||
| grid_df["rolling_std_" + str(i)] = ( | |||
| grid_df.groupby(["id"])[TARGET].transform(lambda x: x.shift(SHIFT_DAY).rolling(i).std()).astype(np.float16) | |||
| ) | |||
| # sliding window | |||
| for d_shift in [1, 7, 14]: | |||
| for d_window in [7, 14, 30, 60]: | |||
| col_name = "rolling_mean_tmp_" + str(d_shift) + "_" + str(d_window) | |||
| grid_df[col_name] = ( | |||
| grid_df.groupby(["id"])[TARGET] | |||
| .transform(lambda x: x.shift(SHIFT_DAY + d_shift).rolling(d_window).mean()) | |||
| .astype(np.float16) | |||
| ) | |||
| grid_df.to_pickle(os.path.join(processed_data_dir, "add_lags_rollings.pkl")) | |||
| return grid_df | |||
| def add_mean_enc(grid_df): | |||
| if os.path.exists(os.path.join(processed_data_dir, "add_mean_enc.pkl")): | |||
| return pd.read_pickle(os.path.join(processed_data_dir, "add_mean_enc.pkl")) | |||
| sales_df = dco(grid_df["sales"]) | |||
| grid_df["sales"][grid_df["d"] > (1941 - 28)] = np.nan | |||
| icols = [ | |||
| ["state_id"], | |||
| ["store_id"], | |||
| ["cat_id"], | |||
| ["dept_id"], | |||
| ["state_id", "cat_id"], | |||
| ["state_id", "dept_id"], | |||
| ["store_id", "cat_id"], | |||
| ["store_id", "dept_id"], | |||
| ["item_id"], | |||
| ["item_id", "state_id"], | |||
| ["item_id", "store_id"], | |||
| ] | |||
| for col in icols: | |||
| col_name = "_" + "_".join(col) + "_" | |||
| grid_df["enc" + col_name + "mean"] = grid_df.groupby(col)["sales"].transform("mean").astype(np.float16) | |||
| grid_df["enc" + col_name + "std"] = grid_df.groupby(col)["sales"].transform("std").astype(np.float16) | |||
| grid_df["sales"] = sales_df | |||
| grid_df.to_pickle(os.path.join(processed_data_dir, "add_mean_enc.pkl")) | |||
| return grid_df | |||
| def add_snap(grid_df): | |||
| if os.path.exists(os.path.join(processed_data_dir, "all_data_df.pkl")): | |||
| return pd.read_pickle(os.path.join(processed_data_dir, "all_data_df.pkl")) | |||
| mask_CA = grid_df["state_id"] == "CA" | |||
| mask_WI = grid_df["state_id"] == "WI" | |||
| mask_TX = grid_df["state_id"] == "TX" | |||
| grid_df["snap"] = grid_df["snap_CA"] | |||
| grid_df.loc[mask_WI, "snap"] = grid_df["snap_WI"] | |||
| grid_df.loc[mask_TX, "snap"] = grid_df["snap_TX"] | |||
| grid_df.to_pickle(os.path.join(processed_data_dir, "all_data_df.pkl")) | |||
| return grid_df | |||
| def preprocessing_m5(): | |||
| train_df = pd.read_csv(os.path.join(raw_data_dir, "sales_train_evaluation.csv")) | |||
| prices_df = pd.read_csv(os.path.join(raw_data_dir, "sell_prices.csv")) | |||
| calendar_df = pd.read_csv(os.path.join(raw_data_dir, "calendar.csv")) | |||
| grid_df = melt_raw_data(train_df) | |||
| print(f"df: ({grid_df.shape[0]}, {grid_df.shape[1]}) Melting raw data down!") | |||
| grid_df = add_release_week(grid_df, prices_df, calendar_df) | |||
| print(f"df: ({grid_df.shape[0]}, {grid_df.shape[1]}) Adding release week down!") | |||
| grid_df = add_prices(grid_df, prices_df, calendar_df) | |||
| print(f"df: ({grid_df.shape[0]}, {grid_df.shape[1]}) Adding prices down!") | |||
| grid_df = add_date(grid_df, calendar_df) | |||
| print(f"df: ({grid_df.shape[0]}, {grid_df.shape[1]}) Adding date down!") | |||
| grid_df = add_lags_rollings(grid_df) | |||
| print(f"df: ({grid_df.shape[0]}, {grid_df.shape[1]}) Adding lags and rollings down!") | |||
| grid_df = add_mean_enc(grid_df) | |||
| print(f"df: ({grid_df.shape[0]}, {grid_df.shape[1]}) Adding mean encoding down!") | |||
| grid_df = pd.read_pickle(os.path.join(processed_data_dir, "add_mean_enc.pkl")) | |||
| grid_df = add_snap(grid_df) | |||
| print("Save the data down!") | |||
| # ==================== split dataset ==================== | |||
| def label_encode(df, columns): | |||
| le = LabelEncoder() | |||
| data_list = [] | |||
| for column in columns: | |||
| data_list += df[column].drop_duplicates().values.tolist() | |||
| le.fit(data_list) | |||
| for column in columns: | |||
| df[column] = le.transform(df[column].values.tolist()) | |||
| return df | |||
| def reorganize_data(grid_df): | |||
| grid_df["snap"] = grid_df["snap"].astype("int8") | |||
| columns_list = [ | |||
| ["item_id"], | |||
| ["dept_id"], | |||
| ["cat_id"], | |||
| ["event_name_1", "event_name_2"], | |||
| ["event_type_1", "event_type_2"], | |||
| ] | |||
| for columns in columns_list: | |||
| grid_df[columns] = label_encode(grid_df[columns], columns) | |||
| return reduce_mem_usage(grid_df) | |||
| def split_data(df, store, fill_flag=False): | |||
| for cat in category_list: | |||
| df[cat] = df[cat].astype("category") | |||
| if fill_flag: | |||
| df = reduce_mem_usage(df, float16_flag=False) | |||
| cols = df.isnull().any() | |||
| idx = list(cols[cols.values].index) | |||
| df[idx] = df.groupby("item_id", sort=False)[idx].apply(lambda x: x.ffill().bfill()) | |||
| df[idx] = df[idx].fillna(df[idx].mean()) | |||
| mms = MinMaxScaler() | |||
| df[features_columns] = mms.fit_transform(df[features_columns]) | |||
| df = reduce_mem_usage(df) | |||
| train_df = df[df["d"] <= END_TRAIN] | |||
| val_df = df[df["d"] > END_TRAIN] | |||
| train_df = train_df[features_columns + label_column] | |||
| val_df = val_df[features_columns + label_column] | |||
| print(train_df.shape, val_df.shape) | |||
| suffix = f"_fill" if fill_flag else "" | |||
| train_df.to_pickle(os.path.join(processed_data_dir, f"train_{store}{suffix}.pkl")) | |||
| val_df.to_pickle(os.path.join(processed_data_dir, f"val_{store}{suffix}.pkl")) | |||
| def split_m5(): | |||
| grid_df = pd.read_pickle(os.path.join(processed_data_dir, "all_data_df.pkl")) | |||
| if os.path.exists(os.path.join(processed_data_dir, "label_encode.pkl")): | |||
| grid_df = pd.read_pickle(os.path.join(processed_data_dir, "label_encode.pkl")) | |||
| else: | |||
| grid_df = reorganize_data(grid_df) | |||
| grid_df.to_pickle(os.path.join(processed_data_dir, "label_encode.pkl")) | |||
| for store in store_list: | |||
| # split_data(grid_df[grid_df["store_id"] == store], store) | |||
| split_data(grid_df[grid_df["store_id"] == store], store, True) | |||
| def regenerate_data(): | |||
| preprocessing_m5() | |||
| split_m5() | |||
| @@ -1,452 +0,0 @@ | |||
| import gc | |||
| import joblib | |||
| import random | |||
| import numpy as np | |||
| import pandas as pd | |||
| from tqdm import tqdm | |||
| import os, warnings | |||
| import lightgbm as lgb | |||
| from sklearn.svm import SVR | |||
| from sklearn.linear_model import Ridge | |||
| from sklearn.kernel_ridge import KernelRidge | |||
| from sklearn.metrics import mean_squared_error | |||
| from sklearn.metrics.pairwise import rbf_kernel | |||
| from .utils import * | |||
| from .config import model_dir, grid_dir, store_list, lgb_params_list | |||
| warnings.filterwarnings("ignore") | |||
| def train_lgb_model(train_x, train_y, val_x, val_y, store, lr, nl, md, best, save=True, n_estimators=0, train_flag=0): | |||
| lgb_params = { | |||
| "boosting_type": "gbdt", | |||
| "objective": "rmse", | |||
| "metric": "rmse", | |||
| "learning_rate": lr, | |||
| "num_leaves": nl, | |||
| "max_depth": md, | |||
| "n_estimators": 100000, | |||
| "boost_from_average": False, | |||
| "verbose": -1, | |||
| } | |||
| if train_flag: | |||
| idx = int(len(train_y) * 0.1) | |||
| train_data = lgb.Dataset(train_x[:-idx], label=train_y[:-idx]) | |||
| val_data = lgb.Dataset(train_x[-idx:], label=train_y[-idx:]) | |||
| else: | |||
| train_data = lgb.Dataset(train_x, label=train_y) | |||
| val_data = lgb.Dataset(val_x, label=val_y) | |||
| if n_estimators: | |||
| lgb_params["n_estimators"] = n_estimators | |||
| gbm = lgb.train(lgb_params, train_data, verbose_eval=100) | |||
| else: | |||
| gbm = lgb.train(lgb_params, train_data, valid_sets=[val_data], verbose_eval=100, early_stopping_rounds=1000) | |||
| test_y = gbm.predict(val_x, num_iteration=gbm.best_iteration) | |||
| res = mean_squared_error(val_y, test_y, squared=False) | |||
| if res < best: | |||
| best = res | |||
| if save: | |||
| gbm.save_model(os.path.join(model_dir, f"lgb_{store}.out")) | |||
| return best | |||
| def train_ridge_model(train_x, train_y, val_x, val_y, store, a, best, save=True): | |||
| model = Ridge(alpha=a) | |||
| model.fit(train_x, train_y) | |||
| test_y = model.predict(val_x) | |||
| res = mean_squared_error(val_y, test_y, squared=False) | |||
| if res < best: | |||
| best = res | |||
| if save: | |||
| joblib.dump(model, os.path.join(model_dir, f"ridge_{store}.out")) | |||
| return best | |||
| def train_svm_model( | |||
| train_x, train_y, val_x, val_y, store, C, epsilon, best, save=True, gamma=0.1, adaptation_model=[], K1=None, K2=None | |||
| ): | |||
| if K1 is None: | |||
| model = SVR(C=C, epsilon=epsilon, max_iter=30000, cache_size=10240, verbose=True, gamma=gamma) | |||
| else: | |||
| model = AuxiliarySVR( | |||
| C=C, | |||
| epsilon=epsilon, | |||
| gamma=gamma, | |||
| adaptation_model=adaptation_model, | |||
| max_iter=30000, | |||
| cache_size=10240, | |||
| verbose=True, | |||
| K1=K1, | |||
| K2=K2, | |||
| ) | |||
| model.fit(train_x, train_y) | |||
| test_y = model.predict(val_x) | |||
| res = mean_squared_error(val_y, test_y, squared=False) | |||
| if res < best: | |||
| best = res | |||
| if save: | |||
| joblib.dump(model, os.path.join(model_dir, f"svm_{store}.out")) | |||
| return best | |||
| def train_krr_model(train_x, train_y, val_x, val_y, store, a, best, save=True, gamma=0.1, K1=None, K2=None): | |||
| if K1 is None: | |||
| model = KernelRidge(kernel="rbf", alpha=a, gamma=gamma) | |||
| model.fit(train_x, train_y) | |||
| test_y = model.predict(val_x) | |||
| res = mean_squared_error(val_y, test_y, squared=False) | |||
| else: | |||
| len1, len2 = len(train_y), len(val_y) | |||
| model = KernelRidge(kernel="precomputed", alpha=a) | |||
| model.fit(K1[-len1:, -len1:], train_y) | |||
| test_y = model.predict(K2[-len2:, -len1:]) | |||
| res = mean_squared_error(val_y, test_y, squared=False) | |||
| if res < best: | |||
| best = res | |||
| if save: | |||
| joblib.dump(model, os.path.join(model_dir, f"krr_{store}.out")) | |||
| return best | |||
| def grid_search(store_id, algo, search_lgb_flag=False): | |||
| store = store_list[store_id] | |||
| if algo == "lgb": | |||
| train_x, train_y, val_x, val_y = acquire_data(store, True) | |||
| learning_rate = [0.005, 0.01, 0.015] | |||
| num_leaves = [128, 224, 300] | |||
| max_depth = [50, 66, 80] | |||
| best = 10000000 | |||
| if search_lgb_flag: | |||
| for lr in learning_rate: | |||
| for nl in num_leaves: | |||
| for md in max_depth: | |||
| best = train_lgb_model(train_x, train_y, val_x, val_y, store, lr, nl, md, best) | |||
| print(f"store: {store}, lr: {lr}, nl: {nl}, md: {md}, best: {best}") | |||
| else: | |||
| lr, nl, md = lgb_params_list[store_id] | |||
| best = train_lgb_model(train_x, train_y, val_x, val_y, store, lr, nl, md, best) | |||
| print(f"store: {store}, lr: {lr}, nl: {nl}, md: {md}, best: {best}") | |||
| elif algo == "ridge": | |||
| train_x, train_y, val_x, val_y = acquire_data(store, True) | |||
| alpha = [0.01, 0.1, 0.2, 0.5, 1.0, 2.0, 5.0, 10, 20, 30] | |||
| best = 10000000 | |||
| for a in alpha: | |||
| best = train_ridge_model(train_x, train_y, val_x, val_y, store, a, best) | |||
| print(f"store: {store}, alpha: {a}, best: {best}") | |||
| def grid_training_sample(algo, user_list=list(range(10))): | |||
| for i in range(len(user_list)): | |||
| store_id = user_list[i] | |||
| store = store_list[store_id] | |||
| org_train_x, org_train_y, val_x, val_y = acquire_data(store, True) | |||
| res = [] | |||
| proportion_list = [ | |||
| 100, | |||
| 300, | |||
| 500, | |||
| 700, | |||
| 900, | |||
| 1000, | |||
| 3000, | |||
| 5000, | |||
| 7000, | |||
| 9000, | |||
| 10000, | |||
| 30000, | |||
| 50000, | |||
| 70000, | |||
| 90000, | |||
| 100000, | |||
| 300000, | |||
| 500000, | |||
| 700000, | |||
| 900000, | |||
| 1000000, | |||
| 3000000, | |||
| 5000000, | |||
| ] | |||
| for proportion in proportion_list: | |||
| """ | |||
| random | |||
| org_idx_list = list(range(len(org_train_y))) | |||
| idx_list = random.sample(org_idx_list, min(proportion, len(org_train_y))) | |||
| train_x = org_train_x.iloc[idx_list] | |||
| train_y = org_train_y.iloc[idx_list] | |||
| """ | |||
| train_x = org_train_x[-proportion:] | |||
| train_y = org_train_y[-proportion:] | |||
| best = 10000000 | |||
| if algo == "lgb": | |||
| lr, nl, md = lgb_params_list[store_id] | |||
| best = train_lgb_model( | |||
| train_x, train_y, val_x, val_y, store, lr, nl, md, best, save=False, n_estimators=3000, train_flag=0 | |||
| ) | |||
| print(f"store: {store}, lr: {lr}, nl: {nl}, md: {md}, best: {best}") | |||
| elif algo == "ridge": | |||
| alpha = [0.01, 0.1, 0.2, 0.5, 1.0, 2.0, 5.0, 10, 20, 30] | |||
| for a in alpha: | |||
| best = train_ridge_model(train_x, train_y, val_x, val_y, store, a, best, save=False) | |||
| print(f"store: {store}, alpha: {a}, best: {best}") | |||
| elif algo == "svm": | |||
| C = [1, 10, 100] | |||
| epsilon = 0.001 | |||
| for c in C: | |||
| best = train_svm_model(train_x, train_y, val_x, val_y, store, c, epsilon, best, save=False) | |||
| print(f"store: {store}, C: {c}, epsilon: {epsilon}, best: {best}") | |||
| res.append([proportion, best]) | |||
| np.savetxt(os.path.join(grid_dir, f"grid_sample_{algo}_{store}.out"), np.array(res)) | |||
| if proportion > len(org_train_y): | |||
| break | |||
| def retrain_models(algo): | |||
| for store_id in range(10): | |||
| grid_search(store_id, algo) | |||
| def train_adaptation_grid( | |||
| algo, max_sample, test_sample, user_list=list(range(10)), adaptation_model=[], residual=False | |||
| ): | |||
| """ | |||
| adaptation_model = [ | |||
| [("lgb", 1), ("ridge", 2)], | |||
| [("lgb", 1), ("ridge", 2)] | |||
| ] | |||
| """ | |||
| proportion_list = [ | |||
| 100, | |||
| 300, | |||
| 500, | |||
| 700, | |||
| 900, | |||
| 1000, | |||
| 3000, | |||
| 5000, | |||
| 7000, | |||
| 9000, | |||
| 10000, | |||
| 30000, | |||
| 50000, | |||
| 70000, | |||
| 90000, | |||
| 100000, | |||
| 300000, | |||
| 500000, | |||
| 700000, | |||
| 900000, | |||
| 1000000, | |||
| 3000000, | |||
| 5000000, | |||
| ] | |||
| sample_idx = proportion_list.index(max_sample) + 1 | |||
| for i in range(len(user_list)): | |||
| store_id = user_list[i] | |||
| store = store_list[store_id] | |||
| org_train_x, org_train_y, val_x, val_y = acquire_data(store, True) | |||
| val_x = val_x[-test_sample:] | |||
| val_y = val_y[-test_sample:] | |||
| if algo == "lgb" or algo == "ridge": | |||
| res = [] | |||
| if adaptation_model != []: | |||
| if residual: | |||
| aux_algo, model_idx = adaptation_model[i][0] | |||
| org_train_y -= model_predict(aux_algo, model_idx, org_train_x) | |||
| val_y -= model_predict(aux_algo, model_idx, val_x) | |||
| else: | |||
| train_y_list, val_y_list = [], [] | |||
| for aux_algo, model_idx in adaptation_model[i]: | |||
| train_y_list.append(model_predict(aux_algo, model_idx, org_train_x)) | |||
| val_y_list.append(model_predict(aux_algo, model_idx, val_x)) | |||
| for j in range(len(train_y_list)): | |||
| org_train_x[f"model_values_{j}"] = train_y_list[j] | |||
| val_x[f"model_values_{j}"] = val_y_list[j] | |||
| for proportion in proportion_list[:sample_idx]: | |||
| """ | |||
| random | |||
| org_idx_list = list(range(len(org_train_y))) | |||
| idx_list = random.sample(org_idx_list, min(proportion, len(org_train_y))) | |||
| train_x = org_train_x.iloc[idx_list] | |||
| train_y = org_train_y.iloc[idx_list] | |||
| """ | |||
| train_x = org_train_x[-proportion:] | |||
| train_y = org_train_y[-proportion:] | |||
| best = 10000000 | |||
| if algo == "lgb": | |||
| if max_sample < 50000: | |||
| learning_rate = [0.005, 0.01, 0.015] | |||
| num_leaves = [128, 224, 300] | |||
| max_depth = [50, 66, 80] | |||
| for lr in learning_rate: | |||
| for nl in num_leaves: | |||
| for md in max_depth: | |||
| best = train_lgb_model( | |||
| train_x, train_y, val_x, val_y, store, lr, nl, md, best, save=False | |||
| ) | |||
| print(f"store: {store}, lr: {lr}, nl: {nl}, md: {md}, best: {best}") | |||
| else: | |||
| lr, nl, md = lgb_params_list[store_id] | |||
| best = train_lgb_model( | |||
| train_x, | |||
| train_y, | |||
| val_x, | |||
| val_y, | |||
| store, | |||
| lr, | |||
| nl, | |||
| md, | |||
| best, | |||
| save=False, | |||
| n_estimators=3000, | |||
| train_flag=0, | |||
| ) | |||
| print(f"store: {store}, lr: {lr}, nl: {nl}, md: {md}, best: {best}") | |||
| elif algo == "ridge": | |||
| alpha = [0.01, 0.1, 0.2, 0.5, 1.0, 2.0, 5.0, 10, 20, 30] | |||
| for a in alpha: | |||
| best = train_ridge_model(train_x, train_y, val_x, val_y, store, a, best, save=False) | |||
| print(f"store: {store}, alpha: {a}, best: {best}") | |||
| res.append([proportion, best]) | |||
| text = str(adaptation_model[i]) if adaptation_model != [] else "null" | |||
| text += "_residual_" if residual else "" | |||
| np.savetxt(os.path.join(grid_dir, f"{algo}_using_{text}_{store}.out"), np.array(res)) | |||
| if proportion > len(org_train_y): | |||
| break | |||
| elif algo == "svm" or algo == "krr": | |||
| res = [[proportion, 10000] for proportion in proportion_list[:sample_idx]] | |||
| org_train_x = org_train_x.to_numpy() | |||
| org_train_y = org_train_y.to_numpy() | |||
| val_x = val_x.to_numpy() | |||
| val_y = val_y.to_numpy() | |||
| y1_list, y2_list = [], [] | |||
| gamma_list = [0.01, 0.1, 0.5, 1] | |||
| if residual: | |||
| aux_algo, model_idx = adaptation_model[i][0] | |||
| org_train_y = org_train_y.astype(np.float64) | |||
| val_y = val_y.astype(np.float64) | |||
| org_train_y -= model_predict(aux_algo, model_idx, org_train_x) | |||
| val_y -= model_predict(aux_algo, model_idx, val_x) | |||
| elif adaptation_model != []: | |||
| for aux_algo, idx in adaptation_model[i]: | |||
| y1_list.append(model_predict(aux_algo, idx, org_train_x[-max_sample:]).reshape(-1, 1)) | |||
| y2_list.append(model_predict(aux_algo, idx, val_x).reshape(-1, 1)) | |||
| for gamma in gamma_list: | |||
| K1 = np.zeros((max_sample, max_sample)) | |||
| K2 = np.zeros((len(val_x), max_sample)) | |||
| if (not residual) and adaptation_model != []: | |||
| for j in range(len(adaptation_model[i])): | |||
| aux_algo, idx = adaptation_model[i][j] | |||
| y1 = y1_list[j] | |||
| y2 = y2_list[j] | |||
| K1 += np.dot(y1, y1.T) | |||
| K2 += np.dot(y2, y1.T) | |||
| K1 += rbf_kernel(org_train_x[-max_sample:], org_train_x[-max_sample:], gamma=gamma) | |||
| K2 += rbf_kernel(val_x, org_train_x[-max_sample:], gamma=gamma) | |||
| for idx in range(len(proportion_list[:sample_idx])): | |||
| proportion = proportion_list[idx] | |||
| """ | |||
| random | |||
| org_idx_list = list(range(len(org_train_y))) | |||
| idx_list = random.sample(org_idx_list, min(proportion, len(org_train_y))) | |||
| train_x = org_train_x.iloc[idx_list] | |||
| train_y = org_train_y.iloc[idx_list] | |||
| """ | |||
| train_x = org_train_x[-proportion:] | |||
| train_y = org_train_y[-proportion:] | |||
| best = 10000000 | |||
| if algo == "svm": | |||
| C = [1, 10, 50, 100, 200] | |||
| epsilon = 0.001 | |||
| for c in C: | |||
| adapt_m = [] if adaptation_model == [] else adaptation_model[i] | |||
| best = train_svm_model( | |||
| train_x, | |||
| train_y, | |||
| val_x, | |||
| val_y, | |||
| store, | |||
| c, | |||
| epsilon, | |||
| best, | |||
| save=False, | |||
| gamma=gamma, | |||
| adaptation_model=adapt_m, | |||
| K1=K1, | |||
| K2=K2, | |||
| ) | |||
| print(f"store: {store}, gamma: {gamma}, C: {c}, epsilon: {epsilon}, best: {best}") | |||
| elif algo == "krr": | |||
| alpha = [0.01, 0.1, 0.5, 1.0, 5.0, 10] | |||
| for a in alpha: | |||
| best = train_krr_model( | |||
| train_x, train_y, val_x, val_y, store, a, best, save=False, gamma=gamma, K1=K1, K2=K2 | |||
| ) | |||
| print(f"store: {store}, a: {a}, gamma: {gamma}, best: {best}") | |||
| if best < res[idx][1]: | |||
| res[idx][1] = best | |||
| text = str(adaptation_model[i]) if adaptation_model != [] else "null" | |||
| text += "_residual" if residual else "" | |||
| np.savetxt(os.path.join(grid_dir, f"{algo}_using_{text}_{store}.out"), np.array(res)) | |||
| if proportion > len(org_train_y): | |||
| break | |||
| del train_x, train_y | |||
| gc.collect() | |||
| del K1, K2 | |||
| gc.collect() | |||
| del org_train_x, org_train_y | |||
| gc.collect() | |||
| @@ -1,177 +0,0 @@ | |||
| from math import gamma | |||
| from tkinter import Y | |||
| import joblib | |||
| from tqdm import tqdm | |||
| import numpy as np | |||
| import pandas as pd | |||
| import lightgbm as lgb | |||
| from sklearn.svm import SVR | |||
| from sklearn.metrics import mean_squared_error | |||
| from sklearn.metrics.pairwise import rbf_kernel | |||
| import os, sys, gc, time, warnings, pickle, psutil, random | |||
| import matplotlib.pyplot as plt | |||
| from mpl_toolkits.axes_grid1 import make_axes_locatable | |||
| from .config import * | |||
| class AuxiliarySVR: | |||
| def __init__( | |||
| self, C, epsilon, gamma, adaptation_model=[], max_iter=30000, cache_size=10240, verbose=False, K1=None, K2=None | |||
| ): | |||
| self.gamma = gamma | |||
| self.adaptation_model = adaptation_model | |||
| self.model = SVR( | |||
| C=C, | |||
| epsilon=epsilon, | |||
| kernel=self.auxiliary_rbf_kernel, | |||
| max_iter=max_iter, | |||
| cache_size=cache_size, | |||
| verbose=verbose, | |||
| ) | |||
| self.K1 = K1 | |||
| self.K2 = K2 | |||
| def auxiliary_rbf_kernel(self, X1, X2): | |||
| if self.K1 is not None: | |||
| if X1.shape[0] == X2.shape[0]: | |||
| return self.K1[-X1.shape[0] :, -X2.shape[0] :] | |||
| else: | |||
| return self.K2[-X1.shape[0] :, -X2.shape[0] :] | |||
| else: | |||
| K = np.zeros((len(X1), len(X2))) | |||
| for algo, idx in self.adaptation_model: | |||
| Y1 = model_predict(algo, idx, X1).reshape(-1, 1) | |||
| Y2 = model_predict(algo, idx, X2).reshape(-1, 1) | |||
| K += Y1 @ Y2.T | |||
| K += rbf_kernel(X1, X2, self.gamma) | |||
| return K | |||
| def fit(self, X, Y): | |||
| self.gamma = 1 / X.shape[1] | |||
| self.model.fit(X, Y) | |||
| def predict(self, X): | |||
| return self.model.predict(X) | |||
| def measure_aux_algo(idx, test_sample, model): | |||
| """ | |||
| model = ("lgb", 1) | |||
| """ | |||
| store = store_list[idx] | |||
| org_train_x, org_train_y, val_x, val_y = acquire_data(store, True) | |||
| pred_y = model_predict(model[0], model[1], val_x[-test_sample:]) | |||
| return score(pred_y, val_y[-test_sample:]) | |||
| # Simple "Memory profilers" to see memory usage | |||
| def get_memory_usage(): | |||
| return np.round(psutil.Process(os.getpid()).memory_info()[0] / 2.0**30, 2) | |||
| def sizeof_fmt(num, suffix="B"): | |||
| for unit in ["", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi"]: | |||
| if abs(num) < 1024.0: | |||
| return "%3.1f%s%s" % (num, unit, suffix) | |||
| num /= 1024.0 | |||
| return "%.1f%s%s" % (num, "Yi", suffix) | |||
| # Memory Reducer | |||
| def reduce_mem_usage(df, float16_flag=True, verbose=True): | |||
| numerics = ["int16", "int32", "int64", "float16", "float32", "float64"] | |||
| start_mem = df.memory_usage().sum() / 1024**2 | |||
| for col in df.columns: | |||
| col_type = df[col].dtypes | |||
| if col_type in numerics: | |||
| c_min = df[col].min() | |||
| c_max = df[col].max() | |||
| if str(col_type)[:3] == "int": | |||
| if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max: | |||
| df[col] = df[col].astype(np.int8) | |||
| elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max: | |||
| df[col] = df[col].astype(np.int16) | |||
| elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max: | |||
| df[col] = df[col].astype(np.int32) | |||
| elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max: | |||
| df[col] = df[col].astype(np.int64) | |||
| else: | |||
| if float16_flag and c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max: | |||
| df[col] = df[col].astype(np.float16) | |||
| elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max: | |||
| df[col] = df[col].astype(np.float32) | |||
| else: | |||
| df[col] = df[col].astype(np.float64) | |||
| end_mem = df.memory_usage().sum() / 1024**2 | |||
| if verbose: | |||
| print( | |||
| "Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)".format( | |||
| end_mem, 100 * (start_mem - end_mem) / start_mem | |||
| ) | |||
| ) | |||
| return df | |||
| # Merging by concat to not lose dtypes | |||
| def merge_by_concat(df1, df2, merge_on): | |||
| merged_gf = df1[merge_on] | |||
| merged_gf = merged_gf.merge(df2, on=merge_on, how="left") | |||
| new_columns = [col for col in list(merged_gf) if col not in merge_on] | |||
| df1 = pd.concat([df1, merged_gf[new_columns]], axis=1) | |||
| return df1 | |||
| def model_predict(algo, idx, test_x): | |||
| store = store_list[idx] | |||
| if algo == "lgb": | |||
| model = lgb.Booster(model_file=os.path.join(model_dir, f"lgb_{store}.out")) | |||
| return model.predict(test_x, num_iteration=model.best_iteration) | |||
| elif algo == "ridge": | |||
| model = joblib.load(os.path.join(model_dir, f"ridge_{store}.out")) | |||
| return model.predict(test_x) | |||
| elif algo == "svm": | |||
| model = joblib.load(os.path.join(model_dir, f"svm_{store}.out")) | |||
| return model.predict(test_x) | |||
| def get_weights(algo): | |||
| weights = [] | |||
| if algo == "lgb": | |||
| for store in store_list: | |||
| model = lgb.Booster(model_file=os.path.join(model_dir, f"lgb_{store}.out")) | |||
| weights.append(model.feature_importance()) | |||
| else: | |||
| for store in store_list: | |||
| model = joblib.load(os.path.join(model_dir, f"ridge_{store}.out")) | |||
| weights.append(model.coef_) | |||
| return np.array(weights) | |||
| def score(real_y, pred_y, sample_weight, multioutput): | |||
| return mean_squared_error(real_y, pred_y, sample_weight=sample_weight, multioutput=multioutput, squared=False) | |||
| def acquire_data(store, fill_flag=False): | |||
| TARGET = "sales" | |||
| suffix = f"_fill" if fill_flag else "" | |||
| train = pd.read_pickle(os.path.join(processed_data_dir, f"train_{store}{suffix}.pkl")) | |||
| val = pd.read_pickle(os.path.join(processed_data_dir, f"val_{store}{suffix}.pkl")) | |||
| train_y = train[TARGET] | |||
| train_x = train.drop(columns=TARGET, axis=1) | |||
| val_y = val[TARGET] | |||
| val_x = val.drop(columns=TARGET, axis=1) | |||
| train_x = train_x.to_numpy() | |||
| train_y = train_y.to_numpy() | |||
| val_x = val_x.to_numpy() | |||
| val_y = val_y.to_numpy() | |||
| return train_x, train_y, val_x, val_y | |||
| @@ -1,211 +0,0 @@ | |||
| import os | |||
| import fire | |||
| import time | |||
| import zipfile | |||
| import numpy as np | |||
| from tqdm import tqdm | |||
| from shutil import copyfile, rmtree | |||
| import learnware | |||
| from learnware.market import instantiate_learnware_market, BaseUserInfo | |||
| from learnware.reuse import JobSelectorReuser, AveragingReuser | |||
| from learnware.specification import generate_rkme_table_spec | |||
| from m5 import DataLoader | |||
| from learnware.logger import get_module_logger | |||
| logger = get_module_logger("m5_test", level="INFO") | |||
| output_description = { | |||
| "Dimension": 1, | |||
| "Description": {}, | |||
| } | |||
| input_description = { | |||
| "Dimension": 82, | |||
| "Description": {}, | |||
| } | |||
| semantic_specs = [ | |||
| { | |||
| "Data": {"Values": ["Table"], "Type": "Class"}, | |||
| "Task": {"Values": ["Regression"], "Type": "Class"}, | |||
| "Library": {"Values": ["Scikit-learn"], "Type": "Class"}, | |||
| "Scenario": {"Values": ["Business"], "Type": "Tag"}, | |||
| "Description": {"Values": "", "Type": "String"}, | |||
| "Name": {"Values": "learnware_1", "Type": "String"}, | |||
| "Input": input_description, | |||
| "Output": output_description, | |||
| "License": {"Values": ["MIT"], "Type": "Class"}, | |||
| } | |||
| ] | |||
| user_semantic = { | |||
| "Data": {"Values": ["Table"], "Type": "Class"}, | |||
| "Task": {"Values": ["Regression"], "Type": "Class"}, | |||
| "Library": {"Values": ["Scikit-learn"], "Type": "Class"}, | |||
| "Scenario": {"Values": ["Business"], "Type": "Tag"}, | |||
| "Description": {"Values": "", "Type": "String"}, | |||
| "Name": {"Values": "", "Type": "String"}, | |||
| "Input": input_description, | |||
| "Output": output_description, | |||
| "License": {"Values": ["MIT"], "Type": "Class"}, | |||
| } | |||
| class M5DatasetWorkflow: | |||
| def _init_m5_dataset(self): | |||
| m5 = DataLoader() | |||
| m5.regenerate_data() | |||
| algo_list = ["ridge", "lgb"] | |||
| for algo in algo_list: | |||
| m5.set_algo(algo) | |||
| m5.retrain_models() | |||
| def _init_learnware_market(self): | |||
| """initialize learnware market""" | |||
| # database_ops.clear_learnware_table() | |||
| learnware.init() | |||
| easy_market = instantiate_learnware_market(name="easy", rebuild=True) | |||
| print("Total Item:", len(easy_market)) | |||
| zip_path_list = [] | |||
| curr_root = os.path.dirname(os.path.abspath(__file__)) | |||
| curr_root = os.path.join(curr_root, "learnware_pool") | |||
| for zip_path in os.listdir(curr_root): | |||
| zip_path_list.append(os.path.join(curr_root, zip_path)) | |||
| for idx, zip_path in enumerate(zip_path_list): | |||
| semantic_spec = semantic_specs[0] | |||
| semantic_spec["Name"]["Values"] = "learnware_%d" % (idx) | |||
| semantic_spec["Description"]["Values"] = "test_learnware_number_%d" % (idx) | |||
| easy_market.add_learnware(zip_path, semantic_spec) | |||
| print("Total Item:", len(easy_market)) | |||
| def prepare_learnware(self, regenerate_flag=False): | |||
| if regenerate_flag: | |||
| self._init_m5_dataset() | |||
| m5 = DataLoader() | |||
| idx_list = m5.get_idx_list() | |||
| algo_list = ["lgb"] # algo_list = ["ridge", "lgb"] | |||
| curr_root = os.path.dirname(os.path.abspath(__file__)) | |||
| curr_root = os.path.join(curr_root, "learnware_pool") | |||
| os.makedirs(curr_root, exist_ok=True) | |||
| for idx in tqdm(idx_list): | |||
| train_x, train_y, test_x, test_y = m5.get_idx_data(idx) | |||
| st = time.time() | |||
| spec = generate_rkme_table_spec(X=train_x, gamma=0.1, cuda_idx=0) | |||
| ed = time.time() | |||
| logger.info("Stat spec generated in %.3f s" % (ed - st)) | |||
| for algo in algo_list: | |||
| m5.set_algo(algo) | |||
| dir_path = os.path.join(curr_root, f"{algo}_{idx}") | |||
| os.makedirs(dir_path, exist_ok=True) | |||
| spec_path = os.path.join(dir_path, "rkme.json") | |||
| spec.save(spec_path) | |||
| model_path = m5.get_model_path(idx) | |||
| model_file = os.path.join(dir_path, "model.out") | |||
| copyfile(model_path, model_file) | |||
| init_file = os.path.join(dir_path, "__init__.py") | |||
| copyfile("example_init.py", init_file) | |||
| yaml_file = os.path.join(dir_path, "learnware.yaml") | |||
| copyfile("example.yaml", yaml_file) | |||
| zip_file = dir_path + ".zip" | |||
| with zipfile.ZipFile(zip_file, "w") as zip_obj: | |||
| for foldername, subfolders, filenames in os.walk(dir_path): | |||
| for filename in filenames: | |||
| file_path = os.path.join(foldername, filename) | |||
| zip_info = zipfile.ZipInfo(filename) | |||
| zip_info.compress_type = zipfile.ZIP_STORED | |||
| with open(file_path, "rb") as file: | |||
| zip_obj.writestr(zip_info, file.read()) | |||
| rmtree(dir_path) | |||
| def test(self, regenerate_flag=False): | |||
| self.prepare_learnware(regenerate_flag) | |||
| self._init_learnware_market() | |||
| easy_market = instantiate_learnware_market(name="easy") | |||
| print("Total Item:", len(easy_market)) | |||
| m5 = DataLoader() | |||
| idx_list = m5.get_idx_list() | |||
| os.makedirs("./user_spec", exist_ok=True) | |||
| single_score_list = [] | |||
| random_score_list = [] | |||
| job_selector_score_list = [] | |||
| ensemble_score_list = [] | |||
| improve_list = [] | |||
| for idx in idx_list: | |||
| train_x, train_y, test_x, test_y = m5.get_idx_data(idx) | |||
| user_spec = generate_rkme_table_spec(X=test_x, gamma=0.1, cuda_idx=0) | |||
| user_spec_path = f"./user_spec/user_{idx}.json" | |||
| user_spec.save(user_spec_path) | |||
| user_info = BaseUserInfo(semantic_spec=user_semantic, stat_info={"RKMETableSpecification": user_spec}) | |||
| search_result = easy_market.search_learnware(user_info) | |||
| single_result = search_result.get_single_results() | |||
| multiple_result = search_result.get_multiple_results() | |||
| print(f"search result of user{idx}:") | |||
| print( | |||
| f"single model num: {len(single_result)}, max_score: {single_result[0].score}, min_score: {single_result[-1].score}" | |||
| ) | |||
| loss_list = [] | |||
| for single_item in single_result: | |||
| pred_y = single_item.learnware.predict(test_x) | |||
| loss_list.append(m5.score(test_y, pred_y)) | |||
| print( | |||
| f"Top1-score: {single_result[0].score}, learnware_id: {single_result[0].learnware.id}, loss: {loss_list[0]}" | |||
| ) | |||
| if len(multiple_result) > 0: | |||
| mixture_id = " ".join([learnware.id for learnware in multiple_result[0].learnwares]) | |||
| print(f"mixture_score: {multiple_result[0].score}, mixture_learnware: {mixture_id}") | |||
| mixture_learnware_list = multiple_result[0].learnwares | |||
| else: | |||
| mixture_learnware_list = [single_result[0].learnware] | |||
| reuse_job_selector = JobSelectorReuser(learnware_list=mixture_learnware_list, use_herding=False) | |||
| job_selector_predict_y = reuse_job_selector.predict(user_data=test_x) | |||
| job_selector_score = m5.score(test_y, job_selector_predict_y) | |||
| print(f"mixture reuse loss (job selector): {job_selector_score}") | |||
| reuse_ensemble = AveragingReuser(learnware_list=mixture_learnware_list, mode="vote_by_prob") | |||
| ensemble_predict_y = reuse_ensemble.predict(user_data=test_x) | |||
| ensemble_score = m5.score(test_y, ensemble_predict_y) | |||
| print(f"mixture reuse loss (ensemble): {ensemble_score}\n") | |||
| single_score_list.append(loss_list[0]) | |||
| random_score_list.append(np.mean(loss_list)) | |||
| job_selector_score_list.append(job_selector_score) | |||
| ensemble_score_list.append(ensemble_score) | |||
| improve_list.append((np.mean(loss_list) - loss_list[0]) / np.mean(loss_list)) | |||
| logger.info("Single search score %.3f +/- %.3f" % (np.mean(single_score_list), np.std(single_score_list))) | |||
| logger.info("Random search score: %.3f +/- %.3f" % (np.mean(random_score_list), np.std(random_score_list))) | |||
| logger.info("Average score improvement: %.3f" % (np.mean(improve_list))) | |||
| logger.info( | |||
| "Job selector score: %.3f +/- %.3f" % (np.mean(job_selector_score_list), np.std(job_selector_score_list)) | |||
| ) | |||
| logger.info( | |||
| "Average ensemble score: %.3f +/- %.3f" % (np.mean(ensemble_score_list), np.std(ensemble_score_list)) | |||
| ) | |||
| if __name__ == "__main__": | |||
| fire.Fire(M5DatasetWorkflow) | |||
| @@ -1,87 +0,0 @@ | |||
| import hashlib | |||
| import requests | |||
| import os | |||
| import random | |||
| import json | |||
| import time | |||
| from tqdm import tqdm | |||
| email = "tanzh@lamda.nju.edu.cn" | |||
| password = hashlib.md5(b"Qwerty123").hexdigest() | |||
| login_url = "http://210.28.134.201:8089/auth/login" | |||
| submit_url = "http://210.28.134.201:8089/user/add_learnware" | |||
| all_data_type = ["Table", "Image", "Video", "Text", "Audio"] | |||
| all_task_type = [ | |||
| "Classification", | |||
| "Regression", | |||
| "Clustering", | |||
| "Feature Extraction", | |||
| "Generation", | |||
| "Segmentation", | |||
| "Object Detection", | |||
| ] | |||
| all_device_type = ["CPU", "GPU"] | |||
| all_scenario = [ | |||
| "Business", | |||
| "Financial", | |||
| "Health", | |||
| "Politics", | |||
| "Computer", | |||
| "Internet", | |||
| "Traffic", | |||
| "Nature", | |||
| "Fashion", | |||
| "Industry", | |||
| "Agriculture", | |||
| "Education", | |||
| "Entertainment", | |||
| "Architecture", | |||
| ] | |||
| # ############### | |||
| # 以上部分无需修改 # | |||
| # ############### | |||
| def main(): | |||
| session = requests.Session() | |||
| res = session.post(login_url, json={"email": email, "password": password}) | |||
| # /path/to/learnware/folder 修改为学件文件夹地址 | |||
| learnware_pool = os.listdir(os.path.join(os.path.abspath("."), "learnware_pool")) | |||
| for learnware in learnware_pool: | |||
| # 修改相应的语义规约 | |||
| name = "M5_Shop" + "%02d" % int(learnware.split(".")[0].split("_")[1]) | |||
| name = name + "_" + time.strftime("%Y%m%d%H%M%S", time.localtime()) | |||
| description = f"This is a description of learnware {name}" | |||
| data = random.choice(all_data_type) | |||
| task = random.choice(all_task_type) | |||
| device = list(set(random.choices(all_device_type, k=2))) | |||
| scenario = list(set(random.choices(all_scenario, k=5))) | |||
| semantic_specification = { | |||
| "Data": {"Values": ["Table"], "Type": "Class"}, | |||
| "Task": {"Values": ["Regression"], "Type": "Class"}, | |||
| "Device": {"Values": ["CPU"], "Type": "Tag"}, | |||
| "Scenario": {"Values": ["Business"], "Type": "Tag"}, | |||
| "Description": {"Values": "A sales-forecasting model from Walmart store", "Type": "String"}, | |||
| "Name": {"Values": name, "Type": "String"}, | |||
| "License": {"Values": ["MIT"], "Type": "Class"}, | |||
| } | |||
| res = session.post( | |||
| submit_url, | |||
| data={ | |||
| "semantic_specification": json.dumps(semantic_specification), | |||
| }, | |||
| files={ | |||
| "learnware_file": open( | |||
| os.path.join(os.path.abspath("."), "learnware_pool", learnware), | |||
| "rb", | |||
| ) | |||
| }, | |||
| ) | |||
| assert json.loads(res.text)["code"] == 0, "Upload error" | |||
| if __name__ == "__main__": | |||
| main() | |||
| @@ -1,8 +0,0 @@ | |||
| model: | |||
| class_name: Model | |||
| kwargs: {} | |||
| stat_specifications: | |||
| - module_path: learnware.specification | |||
| class_name: RKMETableSpecification | |||
| file_name: rkme.json | |||
| kwargs: {} | |||
| @@ -1,20 +0,0 @@ | |||
| import os | |||
| import joblib | |||
| import numpy as np | |||
| from learnware.model import BaseModel | |||
| class Model(BaseModel): | |||
| def __init__(self): | |||
| super(Model, self).__init__(input_shape=(31,), output_shape=(1,)) | |||
| dir_path = os.path.dirname(os.path.abspath(__file__)) | |||
| self.model = joblib.load(os.path.join(dir_path, "model.out")) | |||
| def fit(self, X: np.ndarray, y: np.ndarray): | |||
| pass | |||
| def predict(self, X: np.ndarray) -> np.ndarray: | |||
| return self.model.predict(X) | |||
| def finetune(self, X: np.ndarray, y: np.ndarray): | |||
| pass | |||
| @@ -1,208 +0,0 @@ | |||
| import os | |||
| import fire | |||
| import zipfile | |||
| import time | |||
| import numpy as np | |||
| from tqdm import tqdm | |||
| from shutil import copyfile, rmtree | |||
| import learnware | |||
| from learnware.market import instantiate_learnware_market, BaseUserInfo | |||
| from learnware.reuse import JobSelectorReuser, AveragingReuser | |||
| from learnware.specification import generate_rkme_table_spec | |||
| from pfs import Dataloader | |||
| from learnware.logger import get_module_logger | |||
| logger = get_module_logger("pfs_test", level="INFO") | |||
| output_description = { | |||
| "Dimension": 1, | |||
| "Description": {}, | |||
| } | |||
| input_description = { | |||
| "Dimension": 31, | |||
| "Description": {}, | |||
| } | |||
| semantic_specs = [ | |||
| { | |||
| "Data": {"Values": ["Table"], "Type": "Class"}, | |||
| "Task": {"Values": ["Regression"], "Type": "Class"}, | |||
| "Library": {"Values": ["Scikit-learn"], "Type": "Class"}, | |||
| "Scenario": {"Values": ["Business"], "Type": "Tag"}, | |||
| "Description": {"Values": "", "Type": "String"}, | |||
| "Name": {"Values": "learnware_1", "Type": "String"}, | |||
| "Input": input_description, | |||
| "Output": output_description, | |||
| "License": {"Values": ["MIT"], "Type": "Class"}, | |||
| } | |||
| ] | |||
| user_semantic = { | |||
| "Data": {"Values": ["Table"], "Type": "Class"}, | |||
| "Task": {"Values": ["Regression"], "Type": "Class"}, | |||
| "Library": {"Values": ["Scikit-learn"], "Type": "Class"}, | |||
| "Scenario": {"Values": ["Business"], "Type": "Tag"}, | |||
| "Description": {"Values": "", "Type": "String"}, | |||
| "Name": {"Values": "", "Type": "String"}, | |||
| "Input": input_description, | |||
| "Output": output_description, | |||
| "License": {"Values": ["MIT"], "Type": "Class"}, | |||
| } | |||
| class PFSDatasetWorkflow: | |||
| def _init_pfs_dataset(self): | |||
| pfs = Dataloader() | |||
| pfs.regenerate_data() | |||
| algo_list = ["ridge"] # "ridge", "lgb" | |||
| for algo in algo_list: | |||
| pfs.set_algo(algo) | |||
| pfs.retrain_models() | |||
| def _init_learnware_market(self): | |||
| """initialize learnware market""" | |||
| learnware.init() | |||
| easy_market = instantiate_learnware_market(market_id="pfs", name="easy", rebuild=True) | |||
| print("Total Item:", len(easy_market)) | |||
| zip_path_list = [] | |||
| curr_root = os.path.dirname(os.path.abspath(__file__)) | |||
| curr_root = os.path.join(curr_root, "learnware_pool") | |||
| for zip_path in os.listdir(curr_root): | |||
| zip_path_list.append(os.path.join(curr_root, zip_path)) | |||
| for idx, zip_path in enumerate(zip_path_list): | |||
| semantic_spec = semantic_specs[0] | |||
| semantic_spec["Name"]["Values"] = "learnware_%d" % (idx) | |||
| semantic_spec["Description"]["Values"] = "test_learnware_number_%d" % (idx) | |||
| easy_market.add_learnware(zip_path, semantic_spec) | |||
| print("Total Item:", len(easy_market)) | |||
| def prepare_learnware(self, regenerate_flag=False): | |||
| if regenerate_flag: | |||
| self._init_pfs_dataset() | |||
| pfs = Dataloader() | |||
| idx_list = pfs.get_idx_list() | |||
| algo_list = ["ridge"] # ["ridge", "lgb"] | |||
| curr_root = os.path.dirname(os.path.abspath(__file__)) | |||
| curr_root = os.path.join(curr_root, "learnware_pool") | |||
| os.makedirs(curr_root, exist_ok=True) | |||
| for idx in tqdm(idx_list): | |||
| train_x, train_y, test_x, test_y = pfs.get_idx_data(idx) | |||
| st = time.time() | |||
| spec = generate_rkme_table_spec(X=train_x, gamma=0.1, cuda_idx=0) | |||
| ed = time.time() | |||
| logger.info("Stat spec generated in %.3f s" % (ed - st)) | |||
| for algo in algo_list: | |||
| pfs.set_algo(algo) | |||
| dir_path = os.path.join(curr_root, f"{algo}_{idx}") | |||
| os.makedirs(dir_path, exist_ok=True) | |||
| spec_path = os.path.join(dir_path, "rkme.json") | |||
| spec.save(spec_path) | |||
| model_path = pfs.get_model_path(idx) | |||
| model_file = os.path.join(dir_path, "model.out") | |||
| copyfile(model_path, model_file) | |||
| init_file = os.path.join(dir_path, "__init__.py") | |||
| copyfile("example_init.py", init_file) | |||
| yaml_file = os.path.join(dir_path, "learnware.yaml") | |||
| copyfile("example.yaml", yaml_file) | |||
| zip_file = dir_path + ".zip" | |||
| with zipfile.ZipFile(zip_file, "w") as zip_obj: | |||
| for foldername, subfolders, filenames in os.walk(dir_path): | |||
| for filename in filenames: | |||
| file_path = os.path.join(foldername, filename) | |||
| zip_info = zipfile.ZipInfo(filename) | |||
| zip_info.compress_type = zipfile.ZIP_STORED | |||
| with open(file_path, "rb") as file: | |||
| zip_obj.writestr(zip_info, file.read()) | |||
| rmtree(dir_path) | |||
| def test(self, regenerate_flag=False): | |||
| self.prepare_learnware(regenerate_flag) | |||
| self._init_learnware_market() | |||
| easy_market = instantiate_learnware_market(market_id="pfs", name="easy") | |||
| print("Total Item:", len(easy_market)) | |||
| pfs = Dataloader() | |||
| idx_list = pfs.get_idx_list() | |||
| os.makedirs("./user_spec", exist_ok=True) | |||
| single_score_list = [] | |||
| random_score_list = [] | |||
| job_selector_score_list = [] | |||
| ensemble_score_list = [] | |||
| improve_list = [] | |||
| for idx in idx_list: | |||
| train_x, train_y, test_x, test_y = pfs.get_idx_data(idx) | |||
| user_spec = generate_rkme_table_spec(X=test_x, gamma=0.1, cuda_idx=0) | |||
| user_spec_path = f"./user_spec/user_{idx}.json" | |||
| user_spec.save(user_spec_path) | |||
| user_info = BaseUserInfo(semantic_spec=user_semantic, stat_info={"RKMETableSpecification": user_spec}) | |||
| search_result = easy_market.search_learnware(user_info) | |||
| single_result = search_result.get_single_results() | |||
| multiple_result = search_result.get_multiple_results() | |||
| print(f"search result of user{idx}:") | |||
| print( | |||
| f"single model num: {len(single_result)}, max_score: {single_result[0].score}, min_score: {single_result[-1].score}" | |||
| ) | |||
| loss_list = [] | |||
| for single_item in single_result: | |||
| pred_y = single_item.learnware.predict(test_x) | |||
| loss_list.append(pfs.score(test_y, pred_y)) | |||
| print( | |||
| f"Top1-score: {single_result[0].score}, learnware_id: {single_result[0].learnware.id}, loss: {loss_list[0]}, random: {np.mean(loss_list)}" | |||
| ) | |||
| if len(multiple_result) > 0: | |||
| mixture_id = " ".join([learnware.id for learnware in multiple_result[0].learnwares]) | |||
| print(f"mixture_score: {multiple_result[0].score}, mixture_learnware: {mixture_id}") | |||
| mixture_learnware_list = multiple_result[0].learnwares | |||
| else: | |||
| mixture_learnware_list = [single_result[0].learnware] | |||
| reuse_job_selector = JobSelectorReuser(learnware_list=mixture_learnware_list, use_herding=False) | |||
| job_selector_predict_y = reuse_job_selector.predict(user_data=test_x) | |||
| job_selector_score = pfs.score(test_y, job_selector_predict_y) | |||
| print(f"mixture reuse loss (job selector): {job_selector_score}") | |||
| reuse_ensemble = AveragingReuser(learnware_list=mixture_learnware_list) | |||
| ensemble_predict_y = reuse_ensemble.predict(user_data=test_x) | |||
| ensemble_score = pfs.score(test_y, ensemble_predict_y) | |||
| print(f"mixture reuse loss (ensemble): {ensemble_score}\n") | |||
| single_score_list.append(loss_list[0]) | |||
| random_score_list.append(np.mean(loss_list)) | |||
| job_selector_score_list.append(job_selector_score) | |||
| ensemble_score_list.append(ensemble_score) | |||
| improve_list.append((np.mean(loss_list) - loss_list[0]) / np.mean(loss_list)) | |||
| logger.info("Single search score %.3f +/- %.3f" % (np.mean(single_score_list), np.std(single_score_list))) | |||
| logger.info("Random search score: %.3f +/- %.3f" % (np.mean(random_score_list), np.std(random_score_list))) | |||
| logger.info("Average score improvement: %.3f" % (np.mean(improve_list))) | |||
| logger.info( | |||
| "Job selector score: %.3f +/- %.3f" % (np.mean(job_selector_score_list), np.std(job_selector_score_list)) | |||
| ) | |||
| logger.info( | |||
| "Average ensemble score: %.3f +/- %.3f" % (np.mean(ensemble_score_list), np.std(ensemble_score_list)) | |||
| ) | |||
| if __name__ == "__main__": | |||
| fire.Fire(PFSDatasetWorkflow) | |||
| @@ -1,48 +0,0 @@ | |||
| # Learnware based on Prediction Future Sales (PFS) data downloaded from Kaggle | |||
| --> Data Page Link: https://www.kaggle.com/c/competitive-data-science-predict-future-sales/data | |||
| --> Code Page Link: https://www.kaggle.com/uladzimirkapeika/feature-engineering-lightgbm-top-1 | |||
| # PFS任务描述 | |||
| --> 目标:预测每个商店每个商品在下一个月的销量(注意:粒度为月,而不是每天) | |||
| --> 特征信息:商店所在城市信息、商品类别信息、商品价格信息、商品历史价格信息(特征工程中只使用了前三个月的历史信息然后拼接在一起)等 | |||
| --> 使用的模型:XgBoost, LightGBM, LinearRegression | |||
| --> 评价指标:RMSE | |||
| * split_pfs_data.py | |||
| --> 根据Kaggle上公开的数据预处理方案处理下载的数据 | |||
| --> 直接运行即可将数据根据Shop ID划分为每个商店的信息,包括: | |||
| ----> 每个商品在每个月下的特征和目标值,存储为pandas.DataFrame格式 | |||
| ----> 字段包括: | |||
| -- 标识信息: 'shop_id', 'item_id', 'date_block_num' (标识月份), | |||
| -- 目标值(本月销量): 'item_cnt_month', | |||
| -- 城市信息: 'city_code', 'city_coord_1', 'city_coord_2', 'country_part', | |||
| -- 商品种类信息: 'item_category_common', 'item_category_code', | |||
| -- 该月的时间信息: 'weeknd_count', 'days_in_month', | |||
| -- 商品是否第一次销售: 'item_first_interaction', 'shop_item_sold_before', | |||
| -- 商品前三个月的销售量和价格信息: | |||
| 'item_cnt_month_lag_1', 'item_cnt_month_lag_2', 'item_cnt_month_lag_3', | |||
| 'item_shop_price_avg_lag_1', 'item_shop_price_avg_lag_2', 'item_shop_price_avg_lag_3', | |||
| 'item_target_enc_lag_1', 'item_target_enc_lag_2', 'item_target_enc_lag_3', | |||
| 'item_loc_target_enc_lag_1', 'item_loc_target_enc_lag_2', 'item_loc_target_enc_lag_3', 'item_shop_target_enc_lag_1', 'item_shop_target_enc_lag_2', 'item_shop_target_enc_lag_3', | |||
| 'new_item_cat_avg_lag_1', 'new_item_cat_avg_lag_2', 'new_item_cat_avg_lag_3', | |||
| 'new_item_shop_cat_avg_lag_1', 'new_item_shop_cat_avg_lag_2', 'new_item_shop_cat_avg_lag_3', | |||
| 'item_cnt_month_lag_1_adv', 'item_cnt_month_lag_2_adv', 'item_cnt_month_lag_3_adv' | |||
| ----> 特征: 除了'item_cnt_month'之外的列都当做特征列 | |||
| ----> 目标值: 'item_cnt_month' | |||
| ----> 时间标识: 'data_block_num'将2013.01到2015.10月的数据标识为0-33,要预测的2015.11月数据为34 | |||
| --> 存储结果分为两部分: 按照时间划分的train & val,是pandas.DataFrame格式 | |||
| * pfs_cross_transfer.py | |||
| --> 在各自商店训练集上训练一个模型,然后在所有商店的测试集上测试,保存两两预测的RMSE结果,并进行分析 | |||
| --> 分析包括两部分:(1) 对于一个目标商店,其余源域模型的性能均值,方差,最小值(最好的模型),最大值,超过均值的源域数目,选择最好模型能够提升的比例等等;(2) HeatMap | |||
| --> 需要扩展的方向:(1) LightGBM, Ridge, Xgboost,以及超参数调参;(2) 特征工程去除标识信息,例如shop_id, item_id等等 | |||
| * data_api.py | |||
| --> 后续封装的代码,需继续完善 | |||
| * packages | |||
| --> pip install lightgbm | |||
| @@ -1,77 +0,0 @@ | |||
| import joblib | |||
| import os | |||
| from sklearn.metrics import mean_squared_error | |||
| from .pfs_cross_transfer import * | |||
| from .split_data import feature_engineering | |||
| class Dataloader: | |||
| def __init__(self): | |||
| self.algo = "ridge" | |||
| def regenerate_data(self): | |||
| feature_engineering() | |||
| def set_algo(self, algo): | |||
| self.algo = algo | |||
| def get_algo_list(self): | |||
| return ["lgb", "ridge"] | |||
| def get_idx_list(self): | |||
| return [i for i in range(53)] | |||
| def get_idx_data(self, idx): | |||
| shop_ids = [i for i in range(60) if i not in [0, 1, 40]] | |||
| shop_ids = [i for i in shop_ids if i not in [8, 11, 23, 36]] | |||
| fpath = os.path.join(pfs_split_dir, "Shop{:0>2d}-train.csv".format(shop_ids[idx])) | |||
| train_xs, train_ys, _, _ = load_pfs_data(fpath) | |||
| fpath = os.path.join(pfs_split_dir, "Shop{:0>2d}-val.csv".format(shop_ids[idx])) | |||
| test_xs, test_ys, _, _ = load_pfs_data(fpath) | |||
| return train_xs, train_ys, test_xs, test_ys | |||
| def get_model_path(self, idx): | |||
| shop_ids = [i for i in range(60) if i not in [0, 1, 40]] | |||
| shop_ids = [i for i in shop_ids if i not in [8, 11, 23, 36]] | |||
| return os.path.join(model_dir, "{}_Shop{:0>2d}.out".format(self.algo, shop_ids[idx])) | |||
| def retrain_models(self): | |||
| algo = self.algo | |||
| errs = get_errors(algo=algo) | |||
| fpath = os.path.join(pfs_res_dir, "PFS_{}_errs.txt".format(algo)) | |||
| np.savetxt(fpath, errs.T) | |||
| plot_heatmap(errs.T, algo) | |||
| weights = np.loadtxt(os.path.join(pfs_res_dir, "PFS_{}_weights.txt".format(algo))) | |||
| plot_performance(errs.T, weights, algo) | |||
| def retrain_split_models(self): | |||
| fpath = os.path.join(pfs_res_dir, "PFS_{}_split_errs_user.txt".format(self.algo)) | |||
| if os.path.exists(fpath): | |||
| return np.loadtxt(fpath) | |||
| algo = self.algo | |||
| errs = get_split_errs(algo=algo) | |||
| fpath = os.path.join(pfs_res_dir, "PFS_{}_split_errs_user.txt".format(algo)) | |||
| np.savetxt(fpath, errs) | |||
| return errs | |||
| def get_errs(self): | |||
| return np.loadtxt(os.path.join(pfs_res_dir, "PFS_{}_errs.txt".format(self.algo))) | |||
| def get_weights(self): | |||
| return np.loadtxt(os.path.join(pfs_res_dir, "PFS_{}_weights.txt".format(self.algo))) | |||
| def predict(self, idx, test_x): | |||
| shop_ids = [i for i in range(60) if i not in [0, 1, 40]] | |||
| shop_ids = [i for i in shop_ids if i not in [8, 11, 23, 36]] | |||
| model = joblib.load(os.path.join(model_dir, "{}_Shop{:0>2d}.out".format(self.algo, shop_ids[idx]))) | |||
| # test_x = (test_x - test_x.min(0)) / (test_x.max(0) - test_x.min(0) + 0.0001) | |||
| return model.predict(test_x) | |||
| def score(self, real_y, pred_y, sample_weight=None): | |||
| return mean_squared_error(real_y, pred_y, sample_weight=sample_weight, squared=False) | |||
| @@ -1,272 +0,0 @@ | |||
| market_store_list = [ | |||
| 0, | |||
| 2, | |||
| 3, | |||
| 4, | |||
| 5, | |||
| 6, | |||
| 7, | |||
| 8, | |||
| 9, | |||
| 10, | |||
| 12, | |||
| 13, | |||
| 14, | |||
| 15, | |||
| 16, | |||
| 17, | |||
| 18, | |||
| 20, | |||
| 22, | |||
| 23, | |||
| 24, | |||
| 25, | |||
| 26, | |||
| 27, | |||
| 28, | |||
| 30, | |||
| 31, | |||
| 32, | |||
| 33, | |||
| 34, | |||
| 35, | |||
| 37, | |||
| 38, | |||
| 39, | |||
| 40, | |||
| 42, | |||
| 44, | |||
| 45, | |||
| 46, | |||
| 47, | |||
| 48, | |||
| 50, | |||
| 52, | |||
| ] | |||
| user_store_list = [1, 11, 19, 21, 29, 36, 43, 49] | |||
| dataset_info = { | |||
| "name": "PFS", | |||
| "range of date": "2014.01-2015.10", | |||
| "description": "You are provided with daily historical sales data. The task is to forecast the total amount of products sold in every shop for the test set. Note that the list of shops and products slightly changes every month. More specifically, the dataset involves 53 shops in Russia", | |||
| "location_original": [ | |||
| "Адыгея, Россия", | |||
| "Балашиха, Россия", | |||
| "Волжский, Россия", | |||
| "Вологда, Россия", | |||
| "Воронеж, Россия", | |||
| "Воронеж, Россия", | |||
| "Воронеж, Россия", | |||
| "выезд, Россия", | |||
| "Жуковский, Россия", | |||
| "интернет-магазин, Россия", | |||
| "Казань, Россия", | |||
| "Калуга, Россия", | |||
| "колонна, Россия", | |||
| "Красноярск, Россия", | |||
| "Красноярск, Россия", | |||
| "курск, Россия", | |||
| "Москва, Россия", | |||
| "Москва, Россия", | |||
| "Москва, Россия", | |||
| "Москва, Россия", | |||
| "Москва, Россия", | |||
| "Москва, Россия", | |||
| "Москва, Россия", | |||
| "Москва, Россия", | |||
| "Москва, Россия", | |||
| "Москва, Россия", | |||
| "Москва, Россия", | |||
| "Москва, Россия", | |||
| "Мытищи, Россия", | |||
| "Н.Новгород, Россия", | |||
| "Н.Новгород, Россия", | |||
| "Новосибирск, Россия", | |||
| "Новосибирск, Россия", | |||
| "Ростовнадон, Россия", | |||
| "Ростовнадон, Россия", | |||
| "спб, Россия", | |||
| "спб, Россия", | |||
| "самара, Россия", | |||
| "самара, Россия", | |||
| "Сергий, Россия", | |||
| "Сургут, Россия", | |||
| "томск, Россия", | |||
| "тюмень, Россия", | |||
| "тюмень, Россия", | |||
| "тюмень, Россия", | |||
| "Уфа, Россия", | |||
| "Уфа, Россия", | |||
| "Химки, Россия", | |||
| "цифровой, Россия", | |||
| "Чехи, Россия", | |||
| "Якутск, Россия", | |||
| "Якутск, Россия", | |||
| "Ярославль, Россия", | |||
| ], | |||
| "location_english": [ | |||
| "adygea, Russia", | |||
| "Balashikha, Russia", | |||
| "Volzhsky, Russia", | |||
| "Vologda, Russia", | |||
| "Voronezh, Russia", | |||
| "Voronezh, Russia", | |||
| "Voronezh, Russia", | |||
| "outbound, Russia", | |||
| "zhukovsky, Russia", | |||
| "online stor, Russia", | |||
| "Kazan, Russia", | |||
| "Kaluga, Russia", | |||
| "column, Russia", | |||
| "Krasnoyarsk, Russia", | |||
| "Krasnoyarsk, Russia", | |||
| "kursk, Russia", | |||
| "Moscow, Russia", | |||
| "Moscow, Russia", | |||
| "Moscow, Russia", | |||
| "Moscow, Russia", | |||
| "Moscow, Russia", | |||
| "Moscow, Russia", | |||
| "Moscow, Russia", | |||
| "Moscow, Russia", | |||
| "Moscow, Russia", | |||
| "Moscow, Russia", | |||
| "Moscow, Russia", | |||
| "Moscow, Russia", | |||
| "mytishchi, Russia", | |||
| "N.Novgorod, Russia", | |||
| "N.Novgorod, Russia", | |||
| "Novosibirsk, Russia", | |||
| "Novosibirsk, Russia", | |||
| "rostovnadon, Russia", | |||
| "rostovnadon, Russia", | |||
| "spb, Russia", | |||
| "spb, Russia", | |||
| "samara, Russia", | |||
| "samara, Russia", | |||
| "Sergius, Russia", | |||
| "surgut, Russia", | |||
| "tomsk, Russia", | |||
| "tyumen, Russia", | |||
| "tyumen, Russia", | |||
| "tyumen, Russia", | |||
| "Ufa, Russia", | |||
| "Ufa, Russia", | |||
| "Khimki, Russia", | |||
| "numeric, Russia", | |||
| "Czechs, Russia", | |||
| "Yakutsk, Russia", | |||
| "Yakutsk, Russia", | |||
| "Yaroslavl, Russia", | |||
| ], | |||
| "location_chinese": [ | |||
| "阿迪格亚, 俄罗斯", | |||
| "巴拉希哈, 俄罗斯", | |||
| "沃尔日斯基, 俄罗斯", | |||
| "沃洛格达, 俄罗斯", | |||
| "沃罗涅日, 俄罗斯", | |||
| "沃罗涅日, 俄罗斯", | |||
| "沃罗涅日, 俄罗斯", | |||
| "对外贸易, 俄罗斯", | |||
| "茹科夫斯基, 俄罗斯", | |||
| "在线商店, 俄罗斯", | |||
| "喀山, 俄罗斯", | |||
| "卡卢加, 俄罗斯", | |||
| "科洛姆纳, 俄罗斯", | |||
| "克拉斯诺亚尔斯克, 俄罗斯", | |||
| "克拉斯诺亚尔斯克, 俄罗斯", | |||
| "库尔斯克, 俄罗斯", | |||
| "莫斯科, 俄罗斯", | |||
| "莫斯科, 俄罗斯", | |||
| "莫斯科, 俄罗斯", | |||
| "莫斯科, 俄罗斯", | |||
| "莫斯科, 俄罗斯", | |||
| "莫斯科, 俄罗斯", | |||
| "莫斯科, 俄罗斯", | |||
| "莫斯科, 俄罗斯", | |||
| "莫斯科, 俄罗斯", | |||
| "莫斯科, 俄罗斯", | |||
| "莫斯科, 俄罗斯", | |||
| "莫斯科, 俄罗斯", | |||
| "梅季希, 俄罗斯", | |||
| "北诺夫哥罗德, 俄罗斯", | |||
| "北诺夫哥罗德, 俄罗斯", | |||
| "新西伯利亚, 俄罗斯", | |||
| "新西伯利亚, 俄罗斯", | |||
| "罗斯托夫纳东, 俄罗斯", | |||
| "罗斯托夫纳东, 俄罗斯", | |||
| "圣彼得堡, 俄罗斯", | |||
| "圣彼得堡, 俄罗斯", | |||
| "萨马拉, 俄罗斯", | |||
| "萨马拉, 俄罗斯", | |||
| "谢尔盖, 俄罗斯", | |||
| "苏尔古特, 俄罗斯", | |||
| "托木斯克, 俄罗斯", | |||
| "秋明, 俄罗斯", | |||
| "秋明, 俄罗斯", | |||
| "秋明, 俄罗斯", | |||
| "乌法, 俄罗斯", | |||
| "乌法, 俄罗斯", | |||
| "希姆基, 俄罗斯", | |||
| "在线商店, 俄罗斯", | |||
| "契诃夫, 俄罗斯", | |||
| "雅库茨克, 俄罗斯", | |||
| "雅库茨克, 俄罗斯", | |||
| "雅罗斯拉夫尔, 俄罗斯", | |||
| ], | |||
| "memory(KB)": [ | |||
| 246, | |||
| 302, | |||
| 3631, | |||
| 379, | |||
| 862, | |||
| 1020, | |||
| 471, | |||
| 867, | |||
| 588, | |||
| 233, | |||
| 657, | |||
| 1272, | |||
| 801, | |||
| 469, | |||
| 146, | |||
| 1309, | |||
| 98, | |||
| 1003, | |||
| 932, | |||
| 257, | |||
| 1959, | |||
| 1361, | |||
| 35, | |||
| 3265, | |||
| 217, | |||
| 283, | |||
| 4311, | |||
| 1155, | |||
| 43, | |||
| 1388, | |||
| 1971, | |||
| 971, | |||
| 7272, | |||
| 2782, | |||
| 304, | |||
| 6801, | |||
| 4942, | |||
| 181, | |||
| 190, | |||
| 3664, | |||
| 2061, | |||
| 170, | |||
| 807, | |||
| 593, | |||
| 1584, | |||
| 257, | |||
| 1819, | |||
| 50, | |||
| 1063, | |||
| 692, | |||
| 336, | |||
| 277, | |||
| 743, | |||
| ], | |||
| } | |||
| @@ -1,21 +0,0 @@ | |||
| import os | |||
| ROOT_PATH = os.path.abspath(os.path.join(__file__, "..", "data")) | |||
| raw_data_dir = os.path.join(ROOT_PATH, "raw_data") | |||
| split_data_dir = os.path.join(ROOT_PATH, "split_data") | |||
| res_dir = os.path.join(ROOT_PATH, "results") | |||
| model_dir = os.path.join(ROOT_PATH, "models") | |||
| model_dir2 = os.path.join(ROOT_PATH, "models2") | |||
| for dir_name in [ROOT_PATH, raw_data_dir, split_data_dir, res_dir, model_dir, model_dir2]: | |||
| if not os.path.exists(dir_name): | |||
| os.mkdir(dir_name) | |||
| pfs_data_dir = os.path.join(raw_data_dir, "PFS") | |||
| pfs_split_dir = os.path.join(split_data_dir, "PFS") | |||
| pfs_res_dir = os.path.join(res_dir, "PFS") | |||
| for dir_name in [pfs_data_dir, pfs_split_dir, pfs_res_dir]: | |||
| if not os.path.exists(dir_name): | |||
| os.mkdir(dir_name) | |||
| @@ -1,384 +0,0 @@ | |||
| import os | |||
| import pickle | |||
| import joblib | |||
| import numpy as np | |||
| import pandas as pd | |||
| import lightgbm as lgb | |||
| from sklearn.linear_model import Ridge | |||
| from sklearn.model_selection import GridSearchCV | |||
| from matplotlib import pyplot as plt | |||
| import matplotlib.ticker as ticker | |||
| from mpl_toolkits.axes_grid1 import make_axes_locatable | |||
| np.seterr(divide="ignore", invalid="ignore") | |||
| from .paths import pfs_split_dir, pfs_res_dir, model_dir | |||
| np.random.seed(0) | |||
| def load_pfs_data(fpath): | |||
| df = pd.read_csv(fpath) | |||
| features = list(df.columns) | |||
| features.remove("item_cnt_month") | |||
| features.remove("date_block_num") | |||
| # remove id info | |||
| # features.remove('shop_id') | |||
| # features.remove('item_id') | |||
| # remove discrete info | |||
| # features.remove('city_code') | |||
| # features.remove('item_category_code') | |||
| # features.remove('item_category_common') | |||
| xs = df[features].values | |||
| ys = df["item_cnt_month"].values | |||
| categorical_feature_names = ["country_part", "item_category_common", "item_category_code", "city_code"] | |||
| types = None | |||
| return xs, ys, features, types | |||
| def get_split_errs(algo): | |||
| """ | |||
| according to proportion_list, generate errs whose shape is [shop, split_data] | |||
| """ | |||
| shop_ids = [i for i in range(60) if i not in [0, 1, 40]] | |||
| shop_ids = [i for i in shop_ids if i not in [8, 11, 23, 36]] | |||
| user_list = [i for i in range(53)] | |||
| proportion_list = [100, 300, 500, 700, 900, 1000, 3000, 5000, 7000, 9000, 10000, 30000, 50000, 70000] | |||
| # train | |||
| errs = np.zeros((len(user_list), len(proportion_list))) | |||
| for s, sid in enumerate(user_list): | |||
| # load train data | |||
| fpath = os.path.join(pfs_split_dir, "Shop{:0>2d}-train.csv".format(shop_ids[sid])) | |||
| fpath_val = os.path.join(pfs_split_dir, "Shop{:0>2d}-val.csv".format(shop_ids[sid])) | |||
| train_xs, train_ys, _, _ = load_pfs_data(fpath) | |||
| val_xs, val_ys, _, _ = load_pfs_data(fpath_val) | |||
| print(shop_ids[sid], train_xs.shape, train_ys.shape) | |||
| # data regu | |||
| # train_xs = (train_xs - train_xs.min(0)) / (train_xs.max(0) - train_xs.min(0) + 0.0001) | |||
| # val_xs = (val_xs - val_xs.min(0)) / (val_xs.max(0) - val_xs.min(0) + 0.0001) | |||
| if algo == "lgb": | |||
| for tmp in range(len(proportion_list)): | |||
| model = lgb.LGBMModel( | |||
| boosting_type="gbdt", | |||
| num_leaves=2**7 - 1, | |||
| learning_rate=0.01, | |||
| objective="rmse", | |||
| metric="rmse", | |||
| feature_fraction=0.75, | |||
| bagging_fraction=0.75, | |||
| bagging_freq=5, | |||
| seed=1, | |||
| verbose=1, | |||
| n_estimators=100000, | |||
| ) | |||
| model_ori = joblib.load(os.path.join(model_dir, "{}_Shop{:0>2d}.out".format("lgb", shop_ids[sid]))) | |||
| para = model_ori.get_params() | |||
| para["n_estimators"] = 1000 | |||
| model.set_params(**para) | |||
| split = train_xs.shape[0] - proportion_list[tmp] | |||
| model.fit( | |||
| train_xs[ | |||
| split:, | |||
| ], | |||
| train_ys[split:], | |||
| eval_set=[(val_xs, val_ys)], | |||
| early_stopping_rounds=50, | |||
| verbose=100, | |||
| ) | |||
| pred_ys = model.predict(val_xs) | |||
| rmse = np.sqrt(((val_ys - pred_ys) ** 2).mean()) | |||
| errs[s][tmp] = rmse | |||
| return errs | |||
| def get_errors(algo): | |||
| shop_ids = [i for i in range(60) if i not in [0, 1, 40]] | |||
| shop_ids = [i for i in shop_ids if i not in [8, 11, 23, 36]] | |||
| # train | |||
| K = len(shop_ids) | |||
| feature_weight = np.zeros(()) | |||
| errs = np.zeros((K, K)) | |||
| for s, sid in enumerate(shop_ids): | |||
| # load train data | |||
| fpath = os.path.join(pfs_split_dir, "Shop{:0>2d}-train.csv".format(sid)) | |||
| fpath_val = os.path.join(pfs_split_dir, "Shop{:0>2d}-val.csv".format(sid)) | |||
| train_xs, train_ys, features, _ = load_pfs_data(fpath) | |||
| val_xs, val_ys, _, _ = load_pfs_data(fpath_val) | |||
| print(sid, train_xs.shape, train_ys.shape) | |||
| if s == 0: | |||
| feature_weight = np.zeros((K, len(features))) | |||
| if algo == "lgb": | |||
| model = lgb.LGBMModel( | |||
| boosting_type="gbdt", | |||
| num_leaves=2**7 - 1, | |||
| learning_rate=0.01, | |||
| objective="rmse", | |||
| metric="rmse", | |||
| feature_fraction=0.75, | |||
| bagging_fraction=0.75, | |||
| bagging_freq=5, | |||
| seed=1, | |||
| verbose=1, | |||
| n_estimators=1000, | |||
| ) | |||
| # train regu data | |||
| # train_xs = (train_xs - train_xs.min(0)) / (train_xs.max(0) - train_xs.min(0) + 0.0001) | |||
| # val_xs = (val_xs - val_xs.min(0)) / (val_xs.max(0) - val_xs.min(0) + 0.0001) | |||
| model.fit(train_xs, train_ys, eval_set=[(val_xs, val_ys)], early_stopping_rounds=100, verbose=100) | |||
| # grid search | |||
| # para = {'learning_rate': [0.005, 0.01, 0.015], 'num_leaves' : [128, 224, 300], 'max_depth' : [50, 66, 80]} | |||
| # grid_search = GridSearchCV(model, para, scoring='neg_mean_squared_error') | |||
| # grid_result = grid_search.fit(train_xs, train_ys, eval_set=[(val_xs, val_ys)], verbose = 1000, early_stopping_rounds=1000) | |||
| # model = grid_result.best_estimator_ | |||
| joblib.dump(model, os.path.join(model_dir, "{}_Shop{:0>2d}.out".format(algo, sid))) | |||
| importances = model.feature_importances_ | |||
| elif algo == "ridge": | |||
| # train_xs = (train_xs - train_xs.min(0)) / (train_xs.max(0) - train_xs.min(0) + 0.0001) | |||
| model = Ridge() | |||
| para = {"alpha": [0.01, 0.1, 0.2, 0.5, 1.0, 2.0, 5.0, 10, 20, 30]} | |||
| grid_search = GridSearchCV(model, para) | |||
| grid_result = grid_search.fit(train_xs, train_ys) | |||
| model = grid_result.best_estimator_ | |||
| importances = model.coef_ | |||
| joblib.dump(model, os.path.join(model_dir, "{}_Shop{:0>2d}.out".format(algo, sid))) | |||
| feature_weight[s] = importances | |||
| # leave one out test | |||
| for t, tid in enumerate(shop_ids): | |||
| # load test data | |||
| fpath = os.path.join(pfs_split_dir, "Shop{:0>2d}-val.csv".format(tid)) | |||
| test_xs, test_ys, _, _ = load_pfs_data(fpath) | |||
| # data regu | |||
| # test_xs = (test_xs - test_xs.min(0)) / (test_xs.max(0) - test_xs.min(0) + 0.0001) | |||
| pred_ys = model.predict(test_xs) | |||
| rmse = np.sqrt(((test_ys - pred_ys) ** 2).mean()) | |||
| print("Shop{} --> Shop{}: {}".format(s, t, rmse)) | |||
| errs[s][t] = rmse | |||
| np.savetxt(os.path.join(pfs_res_dir, "PFS_{}_weights.txt".format(algo)), feature_weight) | |||
| return errs | |||
| def plot_heatmap(mat, algo): | |||
| x_labels = [f"Model{i}" for i in range(mat.shape[1])] | |||
| y_labels = [f"Task{i}" for i in range(mat.shape[0])] | |||
| fig = plt.figure(figsize=(10, 9)) | |||
| plt.subplot(1, 1, 1) | |||
| ax = plt.gca() | |||
| im = plt.imshow(mat) | |||
| divider = make_axes_locatable(ax) | |||
| cax = divider.append_axes("right", size="4%", pad=0.3) | |||
| plt.colorbar(im, cax=cax) | |||
| ax.set_xticks(range(len(x_labels))) | |||
| ax.set_xticklabels(x_labels) | |||
| ax.set_yticks(range(len(y_labels))) | |||
| ax.set_yticklabels(y_labels) | |||
| ax.xaxis.set_major_locator(ticker.MultipleLocator(base=5)) | |||
| ax.yaxis.set_major_locator(ticker.MultipleLocator(base=5)) | |||
| ax.set_title(f"RMSE on Test set ({algo})") | |||
| plt.tight_layout() | |||
| plt.savefig(os.path.join(pfs_res_dir, "PFS_{}_heatmap.jpg".format(algo)), dpi=700) | |||
| def plot_var(errs, algo): | |||
| avg_err = [] | |||
| min_err = [] | |||
| med_err = [] | |||
| max_err = [] | |||
| std_err = [] | |||
| cnts = [] | |||
| improves = [] | |||
| for j in range(len(errs)): | |||
| inds = [i for i in range(len(errs)) if i != j] | |||
| ys = errs[:, j][inds] | |||
| avg_err.append(np.mean(ys)) | |||
| min_err.append(np.min(ys)) | |||
| med_err.append(np.median(ys)) | |||
| max_err.append(np.max(ys)) | |||
| std_err.append(np.std(ys)) | |||
| cnts.append(np.sum(ys >= np.mean(ys))) | |||
| improves.append((np.mean(ys) - np.min(ys)) / np.mean(ys)) | |||
| avg_err = np.array(avg_err) | |||
| min_err = np.array(min_err) | |||
| med_err = np.array(med_err) | |||
| max_err = np.array(max_err) | |||
| std_err = np.array(std_err) | |||
| cnts = np.array(cnts) | |||
| improves = np.array(improves) | |||
| inds = np.argsort(avg_err) | |||
| avg_err = avg_err[inds] | |||
| min_err = min_err[inds] | |||
| med_err = med_err[inds] | |||
| max_err = max_err[inds] | |||
| std_err = std_err[inds] | |||
| cnts = cnts[inds] | |||
| improves = improves[inds] | |||
| xs = list(range(len(inds))) | |||
| fig = plt.figure(figsize=(8, 8)) | |||
| ax = plt.subplot(3, 1, 1) | |||
| ax.plot(xs, avg_err, color="red", linestyle="solid", linewidth=2.5) | |||
| ax.plot(xs, min_err, color="blue", linestyle="dotted", linewidth=1.5) | |||
| ax.plot(xs, med_err, color="purple", linestyle="solid", linewidth=1.0) | |||
| ax.plot(xs, max_err, color="green", linestyle="dashed", linewidth=1.5) | |||
| ax.legend(["Avg", "Min", "Median", "Max"], fontsize=14) | |||
| ax.fill_between(xs, avg_err - std_err, avg_err + std_err, alpha=0.2) | |||
| gap = np.mean(avg_err - min_err) | |||
| ax.set_ylabel("RMSE", fontsize=14) | |||
| ax.set_title("RMSE of Source Models ({}) [Avg-Min:{:.3f}]".format(algo, gap), fontsize=18) | |||
| ax = plt.subplot(3, 1, 2) | |||
| ax.bar(xs, cnts) | |||
| ax.set_ylabel("Number", fontsize=14) | |||
| ax.set_title("Number of sources above average", fontsize=18) | |||
| ax = plt.subplot(3, 1, 3) | |||
| ax.plot(xs, improves) | |||
| ax.set_xlabel("Sorted Shop ID by Avg.Err", fontsize=14) | |||
| ax.set_ylabel("Ratio", fontsize=14) | |||
| ax.set_title("Best Improve Ratio: (Avg - Min) / Avg", fontsize=18) | |||
| fig.tight_layout() | |||
| fig.savefig(os.path.join(pfs_res_dir, "{}-var.jpg".format(algo))) | |||
| plt.show() | |||
| def plot_performance(errs, weights, algo): | |||
| avg_err = [] | |||
| min_err = [] | |||
| med_err = [] | |||
| max_err = [] | |||
| std_err = [] | |||
| cnts = [] | |||
| improves = [] | |||
| for i in range(errs.shape[0]): | |||
| inds = [j for j in range(errs.shape[1]) if j != i] | |||
| arr = errs[i][inds] | |||
| avg_err.append(np.mean(arr)) | |||
| min_err.append(np.min(arr)) | |||
| med_err.append(np.median(arr)) | |||
| max_err.append(np.max(arr)) | |||
| std_err.append(np.std(arr)) | |||
| cnts.append(np.sum(arr >= np.mean(arr))) | |||
| improves.append((np.mean(arr) - np.min(arr)) / np.mean(arr)) | |||
| avg_err = np.array(avg_err) | |||
| min_err = np.array(min_err) | |||
| med_err = np.array(med_err) | |||
| max_err = np.array(max_err) | |||
| std_err = np.array(std_err) | |||
| cnts = np.array(cnts) | |||
| improves = np.array(improves) | |||
| inds = np.argsort(avg_err) | |||
| avg_err = avg_err[inds] | |||
| min_err = min_err[inds] | |||
| med_err = med_err[inds] | |||
| max_err = max_err[inds] | |||
| std_err = std_err[inds] | |||
| cnts = cnts[inds] | |||
| improves = improves[inds] | |||
| xs = list(range(len(inds))) | |||
| fig = plt.figure(figsize=(12, 9)) | |||
| ax = plt.subplot(2, 2, 1) | |||
| ax.plot(xs, avg_err, color="red", linestyle="solid", linewidth=2.5) | |||
| ax.plot(xs, min_err, color="blue", linestyle="dotted", linewidth=1.5) | |||
| ax.plot(xs, med_err, color="purple", linestyle="solid", linewidth=1.0) | |||
| ax.plot(xs, max_err, color="green", linestyle="dashed", linewidth=1.5) | |||
| ax.legend(["Avg", "Min", "Median", "Max"], fontsize=14) | |||
| ax.fill_between(xs, avg_err - std_err, avg_err + std_err, alpha=0.2) | |||
| gap = np.mean(avg_err - min_err) | |||
| ax.set_ylabel("RMSE", fontsize=14) | |||
| ax.set_title("RMSE of Source Models ({}) [Avg-Min:{:.3f}]".format(algo, gap), fontsize=18) | |||
| ax = plt.subplot(2, 2, 2) | |||
| ax.bar(xs, cnts) | |||
| ax.set_ylabel("Number", fontsize=14) | |||
| ax.set_title("Number of sources above average", fontsize=18) | |||
| ax = plt.subplot(2, 2, 3) | |||
| ax.plot(xs, improves) | |||
| ax.set_xlabel("Sorted Shop ID by Avg.Err", fontsize=14) | |||
| ax.set_ylabel("Ratio", fontsize=14) | |||
| ax.set_title("Best Improve Ratio: (Avg - Min) / Avg", fontsize=18) | |||
| ax = plt.subplot(2, 2, 4) | |||
| weights = np.mean(weights, axis=0) / weights.sum() | |||
| weights = np.sort(weights) | |||
| xs = list(range(len(weights))) | |||
| ax.plot(xs, weights) | |||
| # ax.set_xlabel("Sorted Feature ID by Avg.Feature_Importance", fontsize=14) | |||
| ax.set_ylabel("Proportion", fontsize=14) | |||
| ax.set_title("Avg.Feature_Importances", fontsize=18) | |||
| fig.tight_layout() | |||
| fig.savefig(os.path.join(pfs_res_dir, "PFS_{}_performance.png".format(algo)), dpi=700) | |||
| # fig.savefig(f"{algo}_performance.png", dpi=700) | |||
| plt.show() | |||
| if __name__ == "__main__": | |||
| # for algo in ["ridge", "lgb", "xgboost_125"]: | |||
| for algo in ["ridge"]: | |||
| fpath = os.path.join(pfs_res_dir, "{}_errs.pkl".format(algo)) | |||
| if os.path.exists(fpath): | |||
| with open(fpath, "rb") as fr: | |||
| errs = pickle.load(fr) | |||
| else: | |||
| errs = get_errors(algo=algo) | |||
| with open(fpath, "wb") as fw: | |||
| pickle.dump(errs, fw) | |||
| index = ["Source{}".format(k) for k in range(len(errs))] | |||
| columns = ["Target{}".format(k) for k in range(len(errs[0]))] | |||
| df = pd.DataFrame(errs, index=index, columns=columns) | |||
| fpath = os.path.join(pfs_res_dir, "PFS_{}_errs.txt".format(algo)) | |||
| # df.to_csv(fpath, index=True) | |||
| np.savetxt(fpath, errs.T) | |||
| # plot_var(errs, algo) | |||
| plot_heatmap(errs.T, algo) | |||
| weights = np.loadtxt(os.path.join(pfs_res_dir, "PFS_{}_weights.txt".format(algo))) | |||
| plot_performance(errs.T, weights, algo) | |||
| @@ -1,384 +0,0 @@ | |||
| import os | |||
| import pickle | |||
| import pandas as pd | |||
| import numpy as np | |||
| from itertools import product | |||
| from sklearn.preprocessing import LabelEncoder | |||
| from sklearn.preprocessing import MinMaxScaler | |||
| import calendar | |||
| from .paths import pfs_data_dir | |||
| from .paths import pfs_split_dir | |||
| def feature_engineering(): | |||
| # read data | |||
| sales = pd.read_csv(os.path.join(pfs_data_dir, "sales_train.csv")) | |||
| shops = pd.read_csv(os.path.join(pfs_data_dir, "shops.csv")) | |||
| items = pd.read_csv(os.path.join(pfs_data_dir, "items.csv")) | |||
| item_cats = pd.read_csv(os.path.join(pfs_data_dir, "item_categories.csv")) | |||
| test = pd.read_csv(os.path.join(pfs_data_dir, "test.csv")) | |||
| # remove outliers | |||
| train = sales[(sales.item_price < 10000) & (sales.item_price > 0)] | |||
| train = train[sales.item_cnt_day < 1001] | |||
| print(train.shape, sales.shape) | |||
| print(train.tail(5)) | |||
| print(sales.tail(5)) | |||
| # combine shops with different id but the same name | |||
| train.loc[train.shop_id == 0, "shop_id"] = 57 | |||
| test.loc[test.shop_id == 0, "shop_id"] = 57 | |||
| train.loc[train.shop_id == 1, "shop_id"] = 58 | |||
| test.loc[test.shop_id == 1, "shop_id"] = 58 | |||
| train.loc[train.shop_id == 40, "shop_id"] = 39 | |||
| test.loc[test.shop_id == 40, "shop_id"] = 39 | |||
| # obtain shop_id, item_id, month information | |||
| index_cols = ["shop_id", "item_id", "date_block_num"] | |||
| df = [] | |||
| for block_num in train["date_block_num"].unique(): | |||
| cur_shops = train.loc[sales["date_block_num"] == block_num, "shop_id"].unique() | |||
| cur_items = train.loc[sales["date_block_num"] == block_num, "item_id"].unique() | |||
| df.append(np.array(list(product(*[cur_shops, cur_items, [block_num]])), dtype="int32")) | |||
| df = pd.DataFrame(np.vstack(df), columns=index_cols, dtype=np.int32) | |||
| print("df.shape: ", df.shape) | |||
| print(df.head(5)) | |||
| # Add month sales | |||
| group = train.groupby(["date_block_num", "shop_id", "item_id"]).agg({"item_cnt_day": ["sum"]}) | |||
| group.columns = ["item_cnt_month"] | |||
| group.reset_index(inplace=True) | |||
| print("group.shape: ", group.shape) | |||
| print(group.head(5)) | |||
| df = pd.merge(df, group, on=index_cols, how="left") | |||
| df["item_cnt_month"] = ( | |||
| df["item_cnt_month"] | |||
| .fillna(0) | |||
| .astype(np.float32) | |||
| # df['item_cnt_month'].fillna(0).clip(0, 20).astype(np.float32) | |||
| ) | |||
| # fill test data | |||
| test["date_block_num"] = 34 | |||
| test["date_block_num"] = test["date_block_num"].astype(np.int8) | |||
| test["shop_id"] = test["shop_id"].astype(np.int8) | |||
| test["item_id"] = test["item_id"].astype(np.int16) | |||
| df = pd.concat([df, test], ignore_index=True, sort=False, keys=index_cols) | |||
| df.fillna(0, inplace=True) | |||
| # shop location features | |||
| shops["city"] = shops["shop_name"].apply(lambda x: x.split()[0].lower()) | |||
| shops.loc[shops.city == "!якутск", "city"] = "якутск" | |||
| shops["city_code"] = LabelEncoder().fit_transform(shops["city"]) | |||
| coords = dict() | |||
| coords["якутск"] = (62.028098, 129.732555, 4) | |||
| coords["адыгея"] = (44.609764, 40.100516, 3) | |||
| coords["балашиха"] = (55.8094500, 37.9580600, 1) | |||
| coords["волжский"] = (53.4305800, 50.1190000, 3) | |||
| coords["вологда"] = (59.2239000, 39.8839800, 2) | |||
| coords["воронеж"] = (51.6720400, 39.1843000, 3) | |||
| coords["выездная"] = (0, 0, 0) | |||
| coords["жуковский"] = (55.5952800, 38.1202800, 1) | |||
| coords["интернет-магазин"] = (0, 0, 0) | |||
| coords["казань"] = (55.7887400, 49.1221400, 4) | |||
| coords["калуга"] = (54.5293000, 36.2754200, 4) | |||
| coords["коломна"] = (55.0794400, 38.7783300, 4) | |||
| coords["красноярск"] = (56.0183900, 92.8671700, 4) | |||
| coords["курск"] = (51.7373300, 36.1873500, 3) | |||
| coords["москва"] = (55.7522200, 37.6155600, 1) | |||
| coords["мытищи"] = (55.9116300, 37.7307600, 1) | |||
| coords["н.новгород"] = (56.3286700, 44.0020500, 4) | |||
| coords["новосибирск"] = (55.0415000, 82.9346000, 4) | |||
| coords["омск"] = (54.9924400, 73.3685900, 4) | |||
| coords["ростовнадону"] = (47.2313500, 39.7232800, 3) | |||
| coords["спб"] = (59.9386300, 30.3141300, 2) | |||
| coords["самара"] = (53.2000700, 50.1500000, 4) | |||
| coords["сергиев"] = (56.3000000, 38.1333300, 4) | |||
| coords["сургут"] = (61.2500000, 73.4166700, 4) | |||
| coords["томск"] = (56.4977100, 84.9743700, 4) | |||
| coords["тюмень"] = (57.1522200, 65.5272200, 4) | |||
| coords["уфа"] = (54.7430600, 55.9677900, 4) | |||
| coords["химки"] = (55.8970400, 37.4296900, 1) | |||
| coords["цифровой"] = (0, 0, 0) | |||
| coords["чехов"] = (55.1477000, 37.4772800, 4) | |||
| coords["ярославль"] = (57.6298700, 39.8736800, 2) | |||
| shops["city_coord_1"] = shops["city"].apply(lambda x: coords[x][0]) | |||
| shops["city_coord_2"] = shops["city"].apply(lambda x: coords[x][1]) | |||
| shops["country_part"] = shops["city"].apply(lambda x: coords[x][2]) | |||
| shops = shops[["shop_id", "city_code", "city_coord_1", "city_coord_2", "country_part"]] | |||
| df = pd.merge(df, shops, on=["shop_id"], how="left") | |||
| # process items category name | |||
| map_dict = { | |||
| "Чистые носители (штучные)": "Чистые носители", | |||
| "Чистые носители (шпиль)": "Чистые носители", | |||
| "PC ": "Аксессуары", | |||
| "Служебные": "Служебные ", | |||
| } | |||
| items = pd.merge(items, item_cats, on="item_category_id") | |||
| items["item_category"] = items["item_category_name"].apply(lambda x: x.split("-")[0]) | |||
| items["item_category"] = items["item_category"].apply(lambda x: map_dict[x] if x in map_dict.keys() else x) | |||
| items["item_category_common"] = LabelEncoder().fit_transform(items["item_category"]) | |||
| items["item_category_code"] = LabelEncoder().fit_transform(items["item_category_name"]) | |||
| items = items[["item_id", "item_category_common", "item_category_code"]] | |||
| df = pd.merge(df, items, on=["item_id"], how="left") | |||
| # Weekends count / number of days in a month | |||
| def count_days(date_block_num): | |||
| year = 2013 + date_block_num // 12 | |||
| month = 1 + date_block_num % 12 | |||
| weeknd_count = len([1 for i in calendar.monthcalendar(year, month) if i[6] != 0]) | |||
| days_in_month = calendar.monthrange(year, month)[1] | |||
| return weeknd_count, days_in_month, month | |||
| map_dict = {i: count_days(i) for i in range(35)} | |||
| df["weeknd_count"] = df["date_block_num"].apply(lambda x: map_dict[x][0]) | |||
| df["days_in_month"] = df["date_block_num"].apply(lambda x: map_dict[x][1]) | |||
| # Interation features: Item is new / Item was bought in this shop before | |||
| first_item_block = df.groupby(["item_id"])["date_block_num"].min().reset_index() | |||
| first_item_block["item_first_interaction"] = 1 | |||
| first_shop_item_buy_block = ( | |||
| df[df["date_block_num"] > 0].groupby(["shop_id", "item_id"])["date_block_num"].min().reset_index() | |||
| ) | |||
| first_shop_item_buy_block["first_date_block_num"] = first_shop_item_buy_block["date_block_num"] | |||
| df = pd.merge( | |||
| df, | |||
| first_item_block[["item_id", "date_block_num", "item_first_interaction"]], | |||
| on=["item_id", "date_block_num"], | |||
| how="left", | |||
| ) | |||
| df = pd.merge( | |||
| df, | |||
| first_shop_item_buy_block[["item_id", "shop_id", "first_date_block_num"]], | |||
| on=["item_id", "shop_id"], | |||
| how="left", | |||
| ) | |||
| df["first_date_block_num"].fillna(100, inplace=True) | |||
| df["shop_item_sold_before"] = (df["first_date_block_num"] < df["date_block_num"]).astype("int8") | |||
| df.drop(["first_date_block_num"], axis=1, inplace=True) | |||
| df["item_first_interaction"].fillna(0, inplace=True) | |||
| df["shop_item_sold_before"].fillna(0, inplace=True) | |||
| df["item_first_interaction"] = df["item_first_interaction"].astype("int8") | |||
| df["shop_item_sold_before"] = df["shop_item_sold_before"].astype("int8") | |||
| def lag_feature(df, lags, col): | |||
| tmp = df[["date_block_num", "shop_id", "item_id", col]] | |||
| for i in lags: | |||
| shifted = tmp.copy() | |||
| shifted.columns = ["date_block_num", "shop_id", "item_id", col + "_lag_" + str(i)] | |||
| shifted["date_block_num"] += i | |||
| df = pd.merge(df, shifted, on=["date_block_num", "shop_id", "item_id"], how="left") | |||
| lag_name = col + "_lag_" + str(i) | |||
| df[lag_name] = df[lag_name].astype("float32") | |||
| return df | |||
| df = lag_feature(df, [1, 2, 3], "item_cnt_month") | |||
| index_cols = ["shop_id", "item_id", "date_block_num"] | |||
| group = ( | |||
| train.groupby(index_cols)["item_price"] | |||
| .mean() | |||
| .reset_index() | |||
| .rename(columns={"item_price": "avg_shop_price"}, errors="raise") | |||
| ) | |||
| df = pd.merge(df, group, on=index_cols, how="left") | |||
| df["avg_shop_price"] = df["avg_shop_price"].fillna(0).astype(np.float32) | |||
| index_cols = ["item_id", "date_block_num"] | |||
| group = ( | |||
| train.groupby(["date_block_num", "item_id"])["item_price"] | |||
| .mean() | |||
| .reset_index() | |||
| .rename(columns={"item_price": "avg_item_price"}, errors="raise") | |||
| ) | |||
| df = pd.merge(df, group, on=index_cols, how="left") | |||
| df["avg_item_price"] = df["avg_item_price"].fillna(0).astype(np.float32) | |||
| df["item_shop_price_avg"] = (df["avg_shop_price"] - df["avg_item_price"]) / df["avg_item_price"] | |||
| df["item_shop_price_avg"].fillna(0, inplace=True) | |||
| df = lag_feature(df, [1, 2, 3], "item_shop_price_avg") | |||
| df.drop(["avg_shop_price", "avg_item_price", "item_shop_price_avg"], axis=1, inplace=True) | |||
| item_id_target_mean = ( | |||
| df.groupby(["date_block_num", "item_id"])["item_cnt_month"] | |||
| .mean() | |||
| .reset_index() | |||
| .rename(columns={"item_cnt_month": "item_target_enc"}, errors="raise") | |||
| ) | |||
| df = pd.merge(df, item_id_target_mean, on=["date_block_num", "item_id"], how="left") | |||
| df["item_target_enc"] = df["item_target_enc"].fillna(0).astype(np.float32) | |||
| df = lag_feature(df, [1, 2, 3], "item_target_enc") | |||
| df.drop(["item_target_enc"], axis=1, inplace=True) | |||
| item_id_target_mean = ( | |||
| df.groupby(["date_block_num", "item_id", "city_code"])["item_cnt_month"] | |||
| .mean() | |||
| .reset_index() | |||
| .rename(columns={"item_cnt_month": "item_loc_target_enc"}, errors="raise") | |||
| ) | |||
| df = pd.merge(df, item_id_target_mean, on=["date_block_num", "item_id", "city_code"], how="left") | |||
| df["item_loc_target_enc"] = df["item_loc_target_enc"].fillna(0).astype(np.float32) | |||
| df = lag_feature(df, [1, 2, 3], "item_loc_target_enc") | |||
| df.drop(["item_loc_target_enc"], axis=1, inplace=True) | |||
| item_id_target_mean = ( | |||
| df.groupby(["date_block_num", "item_id", "shop_id"])["item_cnt_month"] | |||
| .mean() | |||
| .reset_index() | |||
| .rename(columns={"item_cnt_month": "item_shop_target_enc"}, errors="raise") | |||
| ) | |||
| df = pd.merge(df, item_id_target_mean, on=["date_block_num", "item_id", "shop_id"], how="left") | |||
| df["item_shop_target_enc"] = df["item_shop_target_enc"].fillna(0).astype(np.float32) | |||
| df = lag_feature(df, [1, 2, 3], "item_shop_target_enc") | |||
| df.drop(["item_shop_target_enc"], axis=1, inplace=True) | |||
| item_id_target_mean = ( | |||
| df[df["item_first_interaction"] == 1] | |||
| .groupby(["date_block_num", "item_category_code"])["item_cnt_month"] | |||
| .mean() | |||
| .reset_index() | |||
| .rename(columns={"item_cnt_month": "new_item_cat_avg"}, errors="raise") | |||
| ) | |||
| df = pd.merge(df, item_id_target_mean, on=["date_block_num", "item_category_code"], how="left") | |||
| df["new_item_cat_avg"] = df["new_item_cat_avg"].fillna(0).astype(np.float32) | |||
| df = lag_feature(df, [1, 2, 3], "new_item_cat_avg") | |||
| df.drop(["new_item_cat_avg"], axis=1, inplace=True) | |||
| # For new items add avg category sales in a separate store for last 3 months | |||
| item_id_target_mean = ( | |||
| df[df["item_first_interaction"] == 1] | |||
| .groupby(["date_block_num", "item_category_code", "shop_id"])["item_cnt_month"] | |||
| .mean() | |||
| .reset_index() | |||
| .rename(columns={"item_cnt_month": "new_item_shop_cat_avg"}, errors="raise") | |||
| ) | |||
| df = pd.merge(df, item_id_target_mean, on=["date_block_num", "item_category_code", "shop_id"], how="left") | |||
| df["new_item_shop_cat_avg"] = df["new_item_shop_cat_avg"].fillna(0).astype(np.float32) | |||
| df = lag_feature(df, [1, 2, 3], "new_item_shop_cat_avg") | |||
| df.drop(["new_item_shop_cat_avg"], axis=1, inplace=True) | |||
| def lag_feature_adv(df, lags, col): | |||
| tmp = df[["date_block_num", "shop_id", "item_id", col]] | |||
| for i in lags: | |||
| shifted = tmp.copy() | |||
| shifted.columns = ["date_block_num", "shop_id", "item_id", col + "_lag_" + str(i) + "_adv"] | |||
| shifted["date_block_num"] += i | |||
| shifted["item_id"] -= 1 | |||
| df = pd.merge(df, shifted, on=["date_block_num", "shop_id", "item_id"], how="left") | |||
| lag_name = col + "_lag_" + str(i) + "_adv" | |||
| df[lag_name] = df[lag_name].astype("float32") | |||
| return df | |||
| df = lag_feature_adv(df, [1, 2, 3], "item_cnt_month") | |||
| # df.fillna(0, inplace=True) | |||
| df = df[(df["date_block_num"] > 2)] | |||
| df.drop(["ID"], axis=1, inplace=True, errors="ignore") | |||
| print(df.shape) | |||
| print(df.columns) | |||
| print(df.head(10)) | |||
| fill_dict = {} | |||
| for col in df.columns: | |||
| fill_dict[col] = df[col].mean() | |||
| group_df = df.groupby(["shop_id"]) | |||
| for shop_id, shop_df in group_df: | |||
| # remove data of data_block_num=34, i.e., 2015.11 | |||
| # this is test set in competition | |||
| shop_df = shop_df[shop_df.date_block_num <= 33] | |||
| # fill the null | |||
| cols = shop_df.isnull().any() | |||
| idx = list(cols[cols.values].index) | |||
| shop_df[idx] = shop_df.groupby("item_id", sort=False)[idx].apply( | |||
| lambda x: x.fillna(method="ffill").fillna(method="bfill") | |||
| ) | |||
| shop_df[idx] = shop_df[idx].fillna(shop_df[idx].mean()) | |||
| for col in idx: | |||
| shop_df[col] = shop_df[col].fillna(fill_dict[col]) | |||
| # min-max scale | |||
| drop_fea_list = [ | |||
| "shop_id", | |||
| "city_code", | |||
| "city_coord_1", | |||
| "city_coord_2", | |||
| "country_part", | |||
| "item_cnt_month", | |||
| "date_block_num", | |||
| ] | |||
| fea_list = [col for col in shop_df.columns if col not in drop_fea_list] | |||
| mms = MinMaxScaler() | |||
| shop_df[fea_list] = mms.fit_transform(shop_df[fea_list]) | |||
| shop_df = shop_df[fea_list + ["item_cnt_month", "date_block_num"]] | |||
| date_split = 29 | |||
| split = False | |||
| while split is False: | |||
| df1 = shop_df[shop_df["date_block_num"] <= date_split] | |||
| df2 = shop_df[shop_df["date_block_num"] > date_split] | |||
| if df2.shape[0] > 0 and df1.shape[0] > 0: | |||
| split = True | |||
| else: | |||
| date_split -= 1 | |||
| if date_split < 0: | |||
| break | |||
| if split is True: | |||
| print("ShopID:{}, split block:{}".format(shop_id, date_split)) | |||
| print(df1.shape, df2.shape) | |||
| # save train csv | |||
| fpath = os.path.join(pfs_split_dir, "Shop{:0>2d}-train.csv".format(shop_id)) | |||
| df1.to_csv(fpath, index=False) | |||
| # save val csv | |||
| fpath = os.path.join(pfs_split_dir, "Shop{:0>2d}-val.csv".format(shop_id)) | |||
| df2.to_csv(fpath, index=False) | |||
| @@ -1,90 +0,0 @@ | |||
| import hashlib | |||
| import requests | |||
| import os | |||
| import random | |||
| import json | |||
| import time | |||
| from tqdm import tqdm | |||
| email = "liujd@lamda.nju.edu.cn" | |||
| password = hashlib.md5(b"liujdlamda").hexdigest() | |||
| login_url = "http://210.28.134.201:8089/auth/login" | |||
| submit_url = "http://210.28.134.201:8089/user/add_learnware" | |||
| all_data_type = ["Table", "Image", "Video", "Text", "Audio"] | |||
| all_task_type = [ | |||
| "Classification", | |||
| "Regression", | |||
| "Clustering", | |||
| "Feature Extraction", | |||
| "Generation", | |||
| "Segmentation", | |||
| "Object Detection", | |||
| ] | |||
| all_device_type = ["CPU", "GPU"] | |||
| all_scenario = [ | |||
| "Business", | |||
| "Financial", | |||
| "Health", | |||
| "Politics", | |||
| "Computer", | |||
| "Internet", | |||
| "Traffic", | |||
| "Nature", | |||
| "Fashion", | |||
| "Industry", | |||
| "Agriculture", | |||
| "Education", | |||
| "Entertainment", | |||
| "Architecture", | |||
| ] | |||
| # ############### | |||
| # 以上部分无需修改 # | |||
| # ############### | |||
| def main(): | |||
| session = requests.Session() | |||
| res = session.post(login_url, json={"email": email, "password": password}) | |||
| # /path/to/learnware/folder 修改为学件文件夹地址 | |||
| learnware_pool = os.listdir(os.path.join(os.path.abspath("."), "learnware_pool")) | |||
| for learnware in learnware_pool: | |||
| # 修改相应的语义规约 | |||
| name = "PFS_Shop" + "%02d" % int(learnware.split(".")[0].split("_")[1]) | |||
| name = name + "_" + time.strftime("%Y%m%d%H%M%S", time.localtime()) | |||
| description = f"This is a description of learnware {name}" | |||
| data = random.choice(all_data_type) | |||
| task = random.choice(all_task_type) | |||
| device = list(set(random.choices(all_device_type, k=2))) | |||
| scenario = list(set(random.choices(all_scenario, k=5))) | |||
| semantic_specification = { | |||
| "Data": {"Values": ["Table"], "Type": "Class"}, | |||
| "Library": {"Values": ["Scikit-learn"], "Type": "Class"}, | |||
| "Task": {"Values": ["Regression"], "Type": "Class"}, | |||
| "Scenario": {"Values": ["Business"], "Type": "Tag"}, | |||
| "Description": { | |||
| "Values": "A sales-forecasting model from Predict Future Sales Competition on Kaggle", | |||
| "Type": "String", | |||
| }, | |||
| "Name": {"Values": name, "Type": "String"}, | |||
| "License": {"Values": ["MIT"], "Type": "Class"}, | |||
| } | |||
| res = session.post( | |||
| submit_url, | |||
| data={ | |||
| "semantic_specification": json.dumps(semantic_specification), | |||
| }, | |||
| files={ | |||
| "learnware_file": open( | |||
| os.path.join(os.path.abspath("."), "learnware_pool", learnware), | |||
| "rb", | |||
| ) | |||
| }, | |||
| ) | |||
| assert json.loads(res.text)["code"] == 0, "Upload error" | |||
| if __name__ == "__main__": | |||
| main() | |||
| @@ -64,7 +64,7 @@ class TextDatasetWorkflow: | |||
| plt.xlabel("Amout of Labeled User Data", fontsize=14) | |||
| plt.ylabel("1 - Accuracy", fontsize=14) | |||
| plt.title(f"Results on Text Experimental Scenario", fontsize=16) | |||
| plt.title("Results on Text Experimental Scenario", fontsize=16) | |||
| plt.legend(fontsize=14) | |||
| plt.tight_layout() | |||
| plt.savefig(os.path.join(self.fig_path, "text_labeled_curves.svg"), bbox_inches="tight", dpi=700) | |||
| @@ -76,7 +76,7 @@ class TextDatasetWorkflow: | |||
| self.user_semantic = client.get_semantic_specification(self.text_benchmark.learnware_ids[0]) | |||
| self.user_semantic["Name"]["Values"] = "" | |||
| if len(self.text_market) == 0 or rebuild == True: | |||
| if len(self.text_market) == 0 or rebuild is True: | |||
| for learnware_id in self.text_benchmark.learnware_ids: | |||
| with tempfile.TemporaryDirectory(prefix="text_benchmark_") as tempdir: | |||
| zip_path = os.path.join(tempdir, f"{learnware_id}.zip") | |||
| @@ -86,7 +86,7 @@ class TextDatasetWorkflow: | |||
| client.download_learnware(learnware_id, zip_path) | |||
| self.text_market.add_learnware(zip_path, semantic_spec) | |||
| break | |||
| except: | |||
| except Exception: | |||
| time.sleep(1) | |||
| continue | |||
| @@ -36,12 +36,12 @@ def init(verbose=True, **kwargs): | |||
| with open(config_file, "r") as fin_config: | |||
| C.update(**dict(json.load(fin_config))) | |||
| ## random seed | |||
| # random seed | |||
| deterministic = kwargs.get("deterministic", True) | |||
| if deterministic: | |||
| setup_seed(C.random_seed) | |||
| ## make dirs | |||
| # make dirs | |||
| mkdir = kwargs.get("mkdir", True) | |||
| if mkdir: | |||
| os.makedirs(C.root_path, exist_ok=True) | |||
| @@ -49,7 +49,7 @@ def init(verbose=True, **kwargs): | |||
| os.makedirs(C.stdout_path, exist_ok=True) | |||
| os.makedirs(C.cache_path, exist_ok=True) | |||
| ## ignore tensorflow warning | |||
| # ignore tensorflow warning | |||
| tf_loglevel = kwargs.get("tf_loglevel", "2") | |||
| os.environ["TF_CPP_MIN_LOG_LEVEL"] = tf_loglevel | |||
| @@ -86,7 +86,7 @@ def filter_nonexist_pip_packages(packages: list) -> Tuple[List[str], List[str]]: | |||
| pass | |||
| except Exception as err: | |||
| logger.error(err) | |||
| return None | |||
| exist_packages = [] | |||
| @@ -101,7 +101,7 @@ def filter_nonexist_pip_packages(packages: list) -> Tuple[List[str], List[str]]: | |||
| exist_packages.append(result) | |||
| else: | |||
| nonexist_packages.append(package) | |||
| if len(nonexist_packages) > 0: | |||
| logger.info(f"Filtered out {len(nonexist_packages)} non-exist pip packages.") | |||
| return exist_packages, nonexist_packages | |||
| @@ -13,7 +13,9 @@ from ..utils import read_yaml_to_dict | |||
| logger = get_module_logger("learnware.learnware") | |||
| def get_learnware_from_dirpath(id: str, semantic_spec: dict, learnware_dirpath, ignore_error=True) -> Optional[Learnware]: | |||
| def get_learnware_from_dirpath( | |||
| id: str, semantic_spec: dict, learnware_dirpath, ignore_error=True | |||
| ) -> Optional[Learnware]: | |||
| """Get the learnware object from dirpath, and provide the manage interface tor Learnware class | |||
| Parameters | |||
| @@ -46,11 +48,11 @@ def get_learnware_from_dirpath(id: str, semantic_spec: dict, learnware_dirpath, | |||
| } | |||
| try: | |||
| learnware_yaml_path = os.path.join(learnware_dirpath, C.learnware_folder_config["yaml_file"]) | |||
| assert os.path.exists(learnware_yaml_path), f"learnware.yaml is not found for learnware_{id}, please check the learnware folder or zipfile." | |||
| assert os.path.exists( | |||
| learnware_yaml_path | |||
| ), f"learnware.yaml is not found for learnware_{id}, please check the learnware folder or zipfile." | |||
| yaml_config = read_yaml_to_dict(learnware_yaml_path) | |||
| if "name" in yaml_config: | |||
| @@ -67,8 +69,10 @@ def get_learnware_from_dirpath(id: str, semantic_spec: dict, learnware_dirpath, | |||
| for _stat_spec in learnware_config["stat_specifications"]: | |||
| stat_spec = _stat_spec.copy() | |||
| stat_spec_path = os.path.join(learnware_dirpath, stat_spec["file_name"]) | |||
| assert os.path.exists(stat_spec_path), f"statistical specification file {stat_spec['file_name']} is not found for learnware_{id}, please check the learnware folder or zipfile." | |||
| assert os.path.exists( | |||
| stat_spec_path | |||
| ), f"statistical specification file {stat_spec['file_name']} is not found for learnware_{id}, please check the learnware folder or zipfile." | |||
| stat_spec["file_name"] = stat_spec_path | |||
| stat_spec_inst = get_stat_spec_from_config(stat_spec) | |||
| learnware_spec.update_stat_spec(**{stat_spec_inst.type: stat_spec_inst}) | |||
| @@ -1,6 +1,6 @@ | |||
| import os | |||
| import sys | |||
| from typing import List, Union | |||
| from typing import Union | |||
| import numpy as np | |||
| @@ -13,8 +13,7 @@ logger = get_module_logger("Learnware") | |||
| class Learnware: | |||
| """The learnware class, which is the basic components in learnware market | |||
| """ | |||
| """The learnware class, which is the basic components in learnware market""" | |||
| def __init__(self, id: str, model: Union[BaseModel, dict], specification: Specification, learnware_dirpath: str): | |||
| """The initialization method for learnware. | |||
| @@ -41,7 +40,7 @@ class Learnware: | |||
| dirpath: str | |||
| The path of the learnware directory | |||
| """ | |||
| self.id = id | |||
| self.model = model | |||
| self.specification = specification | |||
| @@ -1,4 +1,3 @@ | |||
| import copy | |||
| from typing import Union | |||
| from ..model import BaseModel | |||
| @@ -45,5 +44,5 @@ def get_stat_spec_from_config(stat_spec: dict) -> BaseStatSpecification: | |||
| f"Statistic specification must be type of BaseStatSpecification, not {BaseStatSpecification.__class__.__name__}" | |||
| ) | |||
| stat_spec_inst.load(stat_spec["file_name"]) | |||
| return stat_spec_inst | |||
| @@ -1,5 +1,5 @@ | |||
| import logging | |||
| from logging import Logger, handlers | |||
| from logging import Logger | |||
| from .config import C | |||
| @@ -1,9 +1,7 @@ | |||
| from .anchor import AnchoredOrganizer, AnchoredSearcher, AnchoredUserInfo | |||
| from .base import (BaseChecker, BaseOrganizer, BaseSearcher, BaseUserInfo, | |||
| LearnwareMarket) | |||
| from .base import BaseChecker, BaseOrganizer, BaseSearcher, BaseUserInfo, LearnwareMarket | |||
| from .classes import CondaChecker | |||
| from .easy import (EasyOrganizer, EasySearcher, EasySemanticChecker, | |||
| EasyStatChecker) | |||
| from .easy import EasyOrganizer, EasySearcher, EasySemanticChecker, EasyStatChecker | |||
| from .evolve import EvolvedOrganizer | |||
| from .evolve_anchor import EvolvedAnchoredOrganizer | |||
| from .heterogeneous import HeteroMapTableOrganizer, HeteroSearcher | |||
| @@ -44,7 +44,7 @@ class AnchoredOrganizer(EasyOrganizer): | |||
| Exception | |||
| Raise an excpetion when given anchor_id is NOT found in anchor_learnware_list | |||
| """ | |||
| if not anchor_id in self.anchor_learnware_list: | |||
| if anchor_id not in self.anchor_learnware_list: | |||
| raise Exception("Anchor learnware id:{} NOT Found!".format(anchor_id)) | |||
| self.anchor_learnware_list.pop(anchor_id) | |||
| @@ -11,5 +11,4 @@ if not is_torch_available(verbose=False): | |||
| logger.error("EasySeacher and EasyChecker are not available because 'torch' is not installed!") | |||
| else: | |||
| from .checker import EasySemanticChecker, EasyStatChecker | |||
| from .searcher import (EasyExactSemanticSearcher, EasyFuzzSemanticSearcher, | |||
| EasySearcher, EasyStatSearcher) | |||
| from .searcher import EasyExactSemanticSearcher, EasyFuzzSemanticSearcher, EasySearcher, EasyStatSearcher | |||
| @@ -94,12 +94,12 @@ class EasyOrganizer(BaseOrganizer): | |||
| new_learnware = get_learnware_from_dirpath( | |||
| id=learnware_id, semantic_spec=semantic_spec, learnware_dirpath=target_folder_dir | |||
| ) | |||
| except: | |||
| except Exception: | |||
| logger.warning("New learnware is not properly added!") | |||
| try: | |||
| os.remove(target_zip_dir) | |||
| rmtree(target_folder_dir) | |||
| except: | |||
| except Exception: | |||
| pass | |||
| return None, BaseChecker.INVALID_LEARNWARE | |||
| @@ -137,7 +137,7 @@ class EasyOrganizer(BaseOrganizer): | |||
| True for successful operation. | |||
| False for id not found. | |||
| """ | |||
| if not id in self.learnware_list: | |||
| if id not in self.learnware_list: | |||
| logger.warning("Learnware id:'{}' NOT Found!".format(id)) | |||
| return False | |||
| @@ -253,7 +253,7 @@ class EasyOrganizer(BaseOrganizer): | |||
| else: | |||
| try: | |||
| return self.learnware_list[ids] | |||
| except: | |||
| except Exception: | |||
| logger.warning("Learnware ID '%s' NOT Found!" % (ids)) | |||
| return None | |||
| @@ -285,7 +285,7 @@ class EasyOrganizer(BaseOrganizer): | |||
| else: | |||
| try: | |||
| return self.learnware_zip_list[ids] | |||
| except: | |||
| except Exception: | |||
| logger.warning("Learnware ID '%s' NOT Found!" % (ids)) | |||
| return None | |||
| @@ -317,7 +317,7 @@ class EasyOrganizer(BaseOrganizer): | |||
| else: | |||
| try: | |||
| return self.learnware_folder_list[ids] | |||
| except: | |||
| except Exception: | |||
| logger.warning("Learnware ID '%s' NOT Found!" % (ids)) | |||
| return None | |||
| @@ -6,13 +6,11 @@ import torch | |||
| from rapidfuzz import fuzz | |||
| from .organizer import EasyOrganizer | |||
| from ..base import (BaseSearcher, BaseUserInfo, MultipleSearchItem, | |||
| SearchResults, SingleSearchItem) | |||
| from ..base import BaseSearcher, BaseUserInfo, MultipleSearchItem, SearchResults, SingleSearchItem | |||
| from ..utils import parse_specification_type | |||
| from ...learnware import Learnware | |||
| from ...logger import get_module_logger | |||
| from ...specification import (RKMEImageSpecification, RKMETableSpecification, | |||
| RKMETextSpecification, rkme_solve_qp) | |||
| from ...specification import RKMEImageSpecification, RKMETableSpecification, RKMETextSpecification, rkme_solve_qp | |||
| logger = get_module_logger("easy_seacher") | |||
| @@ -281,7 +279,7 @@ class EasyStatSearcher(BaseSearcher): | |||
| learnware_num = len(learnware_list) | |||
| RKME_list = [learnware.specification.get_stat_spec_by_name(self.stat_spec_type) for learnware in learnware_list] | |||
| if type(intermediate_K) == np.ndarray: | |||
| if isinstance(intermediate_K, np.ndarray): | |||
| K = intermediate_K | |||
| else: | |||
| K = np.zeros((learnware_num, learnware_num)) | |||
| @@ -290,7 +288,7 @@ class EasyStatSearcher(BaseSearcher): | |||
| for j in range(i + 1, K.shape[0]): | |||
| K[i, j] = K[j, i] = RKME_list[i].inner_prod(RKME_list[j]) | |||
| if type(intermediate_C) == np.ndarray: | |||
| if isinstance(intermediate_C, np.ndarray): | |||
| C = intermediate_C | |||
| else: | |||
| C = np.zeros((learnware_num, 1)) | |||
| @@ -42,7 +42,7 @@ class HeteroMapTableOrganizer(EasyOrganizer): | |||
| for hetero_id in hetero_ids: | |||
| self._reload_learnware_hetero_spec(hetero_id) | |||
| else: | |||
| logger.warning(f"No market mapping to reload!") | |||
| logger.warning("No market mapping to reload!") | |||
| self.market_mapping = HeteroMap() | |||
| def reset(self, market_id, rebuild=False, auto_update=False, auto_update_limit=100, **training_args): | |||
| @@ -8,8 +8,7 @@ from torch import nn | |||
| from .feature_extractor import CLSToken, FeatureProcessor, FeatureTokenizer | |||
| from .trainer import Trainer, TransTabCollatorForCL | |||
| from .....specification import (HeteroMapTableSpecification, | |||
| RKMETableSpecification) | |||
| from .....specification import HeteroMapTableSpecification, RKMETableSpecification | |||
| from .....utils import allocate_cuda_idx, choose_device | |||
| @@ -288,7 +287,7 @@ class HeteroMap(nn.Module): | |||
| # go through transformers, get the first cls embedding | |||
| encoder_output = self.encoder(**outputs) # bs, seqlen+1, hidden_dim | |||
| output_features = encoder_output[:, 0, :] | |||
| del inputs, outputs, encoder_output | |||
| torch.cuda.empty_cache() | |||
| @@ -1,5 +1,4 @@ | |||
| import math | |||
| import os | |||
| import time | |||
| from typing import Any, Callable, Dict, List | |||
| @@ -11,7 +11,11 @@ logger = get_module_logger("hetero_searcher") | |||
| class HeteroSearcher(EasySearcher): | |||
| def __call__( | |||
| self, user_info: BaseUserInfo, check_status: Optional[int] = None, max_search_num: int = 5, search_method: str = "greedy" | |||
| self, | |||
| user_info: BaseUserInfo, | |||
| check_status: Optional[int] = None, | |||
| max_search_num: int = 5, | |||
| search_method: str = "greedy", | |||
| ) -> SearchResults: | |||
| """Search learnwares based on user_info from learnwares with check_status. | |||
| Employs heterogeneous learnware search if specific requirements are met, otherwise resorts to homogeneous search methods. | |||
| @@ -1,5 +1,3 @@ | |||
| import traceback | |||
| from ...logger import get_module_logger | |||
| logger = get_module_logger("hetero_utils") | |||
| @@ -49,5 +47,5 @@ def is_hetero(stat_specs: dict, semantic_spec: dict, verbose=True) -> bool: | |||
| return True | |||
| except Exception as err: | |||
| if verbose: | |||
| logger.warning(f"Invalid heterogeneous search information provided.") | |||
| logger.warning("Invalid heterogeneous search information provided.") | |||
| return False | |||
| @@ -1,11 +1,12 @@ | |||
| from .base import LearnwareMarket | |||
| from .classes import CondaChecker | |||
| from .easy import (EasyOrganizer, EasySearcher, EasySemanticChecker, | |||
| EasyStatChecker) | |||
| from .easy import EasyOrganizer, EasySearcher, EasySemanticChecker, EasyStatChecker | |||
| from .heterogeneous import HeteroMapTableOrganizer, HeteroSearcher | |||
| def get_market_component(name, market_id, rebuild, organizer_kwargs=None, searcher_kwargs=None, checker_kwargs=None, conda_checker=False): | |||
| def get_market_component( | |||
| name, market_id, rebuild, organizer_kwargs=None, searcher_kwargs=None, checker_kwargs=None, conda_checker=False | |||
| ): | |||
| organizer_kwargs = {} if organizer_kwargs is None else organizer_kwargs | |||
| searcher_kwargs = {} if searcher_kwargs is None else searcher_kwargs | |||
| checker_kwargs = {} if checker_kwargs is None else checker_kwargs | |||
| @@ -13,7 +14,10 @@ def get_market_component(name, market_id, rebuild, organizer_kwargs=None, search | |||
| if name == "easy": | |||
| easy_organizer = EasyOrganizer(market_id=market_id, rebuild=rebuild) | |||
| easy_searcher = EasySearcher(organizer=easy_organizer) | |||
| easy_checker_list = [EasySemanticChecker(), EasyStatChecker() if conda_checker is False else CondaChecker(EasyStatChecker())] | |||
| easy_checker_list = [ | |||
| EasySemanticChecker(), | |||
| EasyStatChecker() if conda_checker is False else CondaChecker(EasyStatChecker()), | |||
| ] | |||
| market_component = { | |||
| "organizer": easy_organizer, | |||
| "searcher": easy_searcher, | |||
| @@ -22,7 +26,10 @@ def get_market_component(name, market_id, rebuild, organizer_kwargs=None, search | |||
| elif name == "hetero": | |||
| hetero_organizer = HeteroMapTableOrganizer(market_id=market_id, rebuild=rebuild, **organizer_kwargs) | |||
| hetero_searcher = HeteroSearcher(organizer=hetero_organizer) | |||
| hetero_checker_list = [EasySemanticChecker(), EasyStatChecker() if conda_checker is False else CondaChecker(EasyStatChecker())] | |||
| hetero_checker_list = [ | |||
| EasySemanticChecker(), | |||
| EasyStatChecker() if conda_checker is False else CondaChecker(EasyStatChecker()), | |||
| ] | |||
| market_component = { | |||
| "organizer": hetero_organizer, | |||
| @@ -45,7 +52,9 @@ def instantiate_learnware_market( | |||
| conda_checker: bool = False, | |||
| **kwargs, | |||
| ): | |||
| market_componets = get_market_component(name, market_id, rebuild, organizer_kwargs, searcher_kwargs, checker_kwargs, conda_checker) | |||
| market_componets = get_market_component( | |||
| name, market_id, rebuild, organizer_kwargs, searcher_kwargs, checker_kwargs, conda_checker | |||
| ) | |||
| return LearnwareMarket( | |||
| organizer=market_componets["organizer"], | |||
| searcher=market_componets["searcher"], | |||
| @@ -1,6 +1,3 @@ | |||
| from ..specification import Specification | |||
| def parse_specification_type( | |||
| stat_specs: dict, | |||
| spec_list=[ | |||
| @@ -1,5 +1,3 @@ | |||
| from typing import Union | |||
| import numpy as np | |||
| @@ -20,4 +20,4 @@ else: | |||
| from .ensemble_pruning import EnsemblePruningReuser | |||
| from .feature_augment import FeatureAugmentReuser | |||
| from .hetero import FeatureAlignLearnware, HeteroMapAlignLearnware | |||
| from .job_selector import JobSelectorReuser | |||
| from .job_selector import JobSelectorReuser | |||
| @@ -1,4 +1,4 @@ | |||
| from typing import List, Union | |||
| from typing import List | |||
| import numpy as np | |||
| import torch | |||
| @@ -50,7 +50,7 @@ class AveragingReuser(BaseReuser): | |||
| if isinstance(pred_y, torch.Tensor): | |||
| pred_y = pred_y.detach().cpu().numpy() | |||
| if not isinstance(pred_y, np.ndarray): | |||
| raise TypeError(f"Model output must be np.ndarray or torch.Tensor") | |||
| raise TypeError("Model output must be np.ndarray or torch.Tensor") | |||
| if len(pred_y.shape) == 1: | |||
| pred_y = pred_y.reshape(-1, 1) | |||
| @@ -54,13 +54,14 @@ class EnsemblePruningReuser(BaseReuser): | |||
| np.ndarray | |||
| Binary one-dimensional vector, 1 indicates that the corresponding model is selected. | |||
| """ | |||
| try: | |||
| import geatpy as ea | |||
| except ModuleNotFoundError: | |||
| raise ModuleNotFoundError(f"EnsemblePruningReuser is not available because 'geatpy' is not installed! Please install it manually (only support python_version<3.11).") | |||
| raise ModuleNotFoundError( | |||
| "EnsemblePruningReuser is not available because 'geatpy' is not installed! Please install it manually (only support python_version<3.11)." | |||
| ) | |||
| model_num = v_predict.shape[1] | |||
| @ea.Problem.single | |||
| @@ -148,7 +149,9 @@ class EnsemblePruningReuser(BaseReuser): | |||
| try: | |||
| import geatpy as ea | |||
| except ModuleNotFoundError: | |||
| raise ModuleNotFoundError(f"EnsemblePruningReuser is not available because 'geatpy' is not installed! Please install it manually (only support python_version<3.11).") | |||
| raise ModuleNotFoundError( | |||
| "EnsemblePruningReuser is not available because 'geatpy' is not installed! Please install it manually (only support python_version<3.11)." | |||
| ) | |||
| if torch.is_tensor(v_true): | |||
| v_true = v_true.detach().cpu().numpy() | |||
| @@ -270,8 +273,10 @@ class EnsemblePruningReuser(BaseReuser): | |||
| try: | |||
| import geatpy as ea | |||
| except ModuleNotFoundError: | |||
| raise ModuleNotFoundError(f"EnsemblePruningReuser is not available because 'geatpy' is not installed! Please install it manually (only support python_version<3.11).") | |||
| raise ModuleNotFoundError( | |||
| "EnsemblePruningReuser is not available because 'geatpy' is not installed! Please install it manually (only support python_version<3.11)." | |||
| ) | |||
| model_num = v_predict.shape[1] | |||
| v_predict[v_predict == 0.0] = -1 | |||
| v_true[v_true == 0.0] = -1 | |||
| @@ -372,7 +377,7 @@ class EnsemblePruningReuser(BaseReuser): | |||
| if isinstance(pred_y, torch.Tensor): | |||
| pred_y = pred_y.detach().cpu().numpy() | |||
| if not isinstance(pred_y, np.ndarray): | |||
| raise TypeError(f"Model output must be np.ndarray or torch.Tensor") | |||
| raise TypeError("Model output must be np.ndarray or torch.Tensor") | |||
| if len(pred_y.shape) == 1: | |||
| pred_y = pred_y.reshape(-1, 1) | |||
| @@ -103,7 +103,7 @@ class FeatureAugmentReuser(BaseReuser): | |||
| if isinstance(y_pred, torch.Tensor): | |||
| y_pred = y_pred.detach().cpu().numpy() | |||
| if not isinstance(y_pred, np.ndarray): | |||
| raise TypeError(f"Model output must be np.ndarray or torch.Tensor") | |||
| raise TypeError("Model output must be np.ndarray or torch.Tensor") | |||
| if len(y_pred.shape) == 1: | |||
| y_pred = y_pred.reshape(-1, 1) | |||
| y_preds.append(y_pred) | |||
| @@ -8,8 +8,7 @@ from .base import BaseReuser | |||
| from ..learnware import Learnware | |||
| from ..logger import get_module_logger | |||
| from ..market.utils import parse_specification_type | |||
| from ..specification import (RKMETableSpecification, RKMETextSpecification, | |||
| generate_rkme_table_spec, rkme_solve_qp) | |||
| from ..specification import RKMETableSpecification, RKMETextSpecification, generate_rkme_table_spec, rkme_solve_qp | |||
| logger = get_module_logger("job_selector_reuse") | |||
| @@ -70,7 +69,7 @@ class JobSelectorReuser(BaseReuser): | |||
| # pred_y = pred_y.numpy() | |||
| if not isinstance(pred_y, np.ndarray): | |||
| raise TypeError(f"Model output must be np.ndarray or torch.Tensor") | |||
| raise TypeError("Model output must be np.ndarray or torch.Tensor") | |||
| pred_y_list.append(pred_y) | |||
| data_idxs_list.append(data_idx_list) | |||
| @@ -230,7 +229,7 @@ class JobSelectorReuser(BaseReuser): | |||
| from lightgbm import LGBMClassifier, early_stopping | |||
| except ModuleNotFoundError: | |||
| raise ModuleNotFoundError( | |||
| f"JobSelectorReuser is not available because 'lightgbm' is not installed! Please install it manually." | |||
| "JobSelectorReuser is not available because 'lightgbm' is not installed! Please install it manually." | |||
| ) | |||
| score_best = -1 | |||
| @@ -4,6 +4,7 @@ from ..logger import get_module_logger | |||
| logger = get_module_logger("reuse_utils") | |||
| def fill_data_with_mean(X: np.ndarray) -> np.ndarray: | |||
| """ | |||
| Fill missing data (NaN, Inf) in the input array with the mean of the column. | |||
| @@ -1,7 +1,12 @@ | |||
| from .base import BaseStatSpecification, Specification | |||
| from .regular import (RegularStatSpecification, RKMEImageSpecification, | |||
| RKMEStatSpecification, RKMETableSpecification, | |||
| RKMETextSpecification, rkme_solve_qp) | |||
| from .regular import ( | |||
| RegularStatSpecification, | |||
| RKMEImageSpecification, | |||
| RKMEStatSpecification, | |||
| RKMETableSpecification, | |||
| RKMETextSpecification, | |||
| rkme_solve_qp, | |||
| ) | |||
| from .system import HeteroMapTableSpecification | |||
| from ..utils import is_torch_available | |||
| @@ -12,6 +17,10 @@ if not is_torch_available(verbose=False): | |||
| generate_rkme_text_spec = None | |||
| generate_semantic_spec = None | |||
| else: | |||
| from .module import (generate_rkme_image_spec, generate_rkme_table_spec, | |||
| generate_rkme_text_spec, generate_semantic_spec, | |||
| generate_stat_spec) | |||
| from .module import ( | |||
| generate_rkme_image_spec, | |||
| generate_rkme_table_spec, | |||
| generate_rkme_text_spec, | |||
| generate_semantic_spec, | |||
| generate_stat_spec, | |||
| ) | |||
| @@ -1,10 +1,7 @@ | |||
| from __future__ import annotations | |||
| import copy | |||
| from typing import Dict | |||
| import numpy as np | |||
| class BaseStatSpecification: | |||
| """The Statistical Specification Interface, which provide save and load method""" | |||
| @@ -27,7 +24,7 @@ class BaseStatSpecification: | |||
| def dist(self, stat_spec: BaseStatSpecification): | |||
| raise NotImplementedError("dist is not implemented") | |||
| def save(self, filepath: str): | |||
| """Save the statistical specification into file in filepath | |||
| @@ -4,9 +4,7 @@ import numpy as np | |||
| import pandas as pd | |||
| import torch | |||
| from .base import BaseStatSpecification | |||
| from .regular import (RKMEImageSpecification, RKMETableSpecification, | |||
| RKMETextSpecification) | |||
| from .regular import RKMEImageSpecification, RKMETableSpecification, RKMETextSpecification | |||
| from .utils import convert_to_numpy | |||
| from ..config import C | |||
| @@ -5,6 +5,6 @@ logger = get_module_logger("regular_image_spec") | |||
| if not is_torch_available(verbose=False): | |||
| RKMEImageSpecification = None | |||
| logger.error(f"RKMEImageSpecification is not available because 'torch' is not installed!") | |||
| logger.error("RKMEImageSpecification is not available because 'torch' is not installed!") | |||
| else: | |||
| from .rkme import RKMEImageSpecification | |||
| @@ -9,7 +9,7 @@ __all__ = ("NNGPKernel", "Conv2d", "ReLU", "Sequential", "ConvKP", "NonlinKP") | |||
| """ | |||
| With this package, we are able to accurately and efficiently compute the kernel matrix corresponding to the NNGP during the search phase. | |||
| Github Repository: https://github.com/cambridge-mlg/cnn-gp | |||
| Github Repository: https://github.com/cambridge-mlg/cnn-gp | |||
| References: [1] A. Garriga-Alonso, L. Aitchison, and C. E. Rasmussen. Deep Convolutional Networks as shallow Gaussian Processes. In: International Conference on Learning Representations (ICLR'19), 2019. | |||
| """ | |||
| @@ -17,7 +17,6 @@ from tqdm import tqdm | |||
| from . import cnn_gp | |||
| from ..base import RegularStatSpecification | |||
| from ..table.rkme import rkme_solve_qp | |||
| from .... import setup_seed | |||
| from ....logger import get_module_logger | |||
| from ....utils import allocate_cuda_idx, choose_device | |||
| @@ -136,7 +135,7 @@ class RKMEImageSpecification(RegularStatSpecification): | |||
| from torchvision.transforms import Resize | |||
| except ModuleNotFoundError: | |||
| raise ModuleNotFoundError( | |||
| f"RKMEImageSpecification is not available because 'torchvision' is not installed! Please install it manually." | |||
| "RKMEImageSpecification is not available because 'torchvision' is not installed! Please install it manually." | |||
| ) | |||
| if X.shape[2] != RKMEImageSpecification.IMAGE_WIDTH or X.shape[3] != RKMEImageSpecification.IMAGE_WIDTH: | |||
| @@ -168,7 +167,7 @@ class RKMEImageSpecification(RegularStatSpecification): | |||
| import torch_optimizer | |||
| except ModuleNotFoundError: | |||
| raise ModuleNotFoundError( | |||
| f"RKMEImageSpecification is not available because 'torch-optimizer' is not installed! Please install it manually." | |||
| "RKMEImageSpecification is not available because 'torch-optimizer' is not installed! Please install it manually." | |||
| ) | |||
| # Cross-platform by default, unless the spec is specified to be generated specifically for local experiments. | |||
| @@ -8,8 +8,7 @@ if not is_torch_available(verbose=False): | |||
| RKMEStatSpecification = None | |||
| rkme_solve_qp = None | |||
| logger.error( | |||
| f"RKMETableSpecification, RKMEStatSpecification and rkme_solve_qp are not available because 'torch' is not installed!" | |||
| "RKMETableSpecification, RKMEStatSpecification and rkme_solve_qp are not available because 'torch' is not installed!" | |||
| ) | |||
| else: | |||
| from .rkme import (RKMEStatSpecification, RKMETableSpecification, | |||
| rkme_solve_qp) | |||
| from .rkme import RKMEStatSpecification, RKMETableSpecification, rkme_solve_qp | |||
| @@ -148,7 +148,7 @@ class RKMETableSpecification(RegularStatSpecification): | |||
| from fast_pytorch_kmeans import KMeans | |||
| except ModuleNotFoundError: | |||
| raise ModuleNotFoundError( | |||
| f"RKMETableSpecification is not available because 'fast_pytorch_kmeans' is not installed! Please install it manually." | |||
| "RKMETableSpecification is not available because 'fast_pytorch_kmeans' is not installed! Please install it manually." | |||
| ) | |||
| kmeans = KMeans(n_clusters=K, mode="euclidean", max_iter=100, verbose=0) | |||
| @@ -5,6 +5,6 @@ logger = get_module_logger("regular_text_spec") | |||
| if not is_torch_available(verbose=False): | |||
| RKMETextSpecification = None | |||
| logger.error(f"RKMETextSpecification is not available because 'torch' is not installed!") | |||
| logger.error("RKMETextSpecification is not available because 'torch' is not installed!") | |||
| else: | |||
| from .rkme import RKMETextSpecification | |||
| @@ -63,9 +63,8 @@ class RKMETextSpecification(RKMETableSpecification): | |||
| def get_language_ids(X): | |||
| try: | |||
| text = " ".join(X) | |||
| lang = langdetect.detect(text) | |||
| langs = langdetect.detect_langs(text) | |||
| return [l.lang for l in langs] | |||
| return [item.lang for item in langs] | |||
| except Exception as e: | |||
| logger.warning("Language detection failed.") | |||
| return [] | |||
| @@ -87,12 +86,14 @@ class RKMETextSpecification(RKMETableSpecification): | |||
| return np.array(miniLM_learnware.predict(X)) | |||
| logger.info("Load the necessary feature extractor for RKMETextSpecification.") | |||
| try: | |||
| from sentence_transformers import SentenceTransformer | |||
| except ModuleNotFoundError: | |||
| raise ModuleNotFoundError(f"RKMETextSpecification is not available because 'sentence_transformers' is not installed! Please install it manually.") | |||
| raise ModuleNotFoundError( | |||
| "RKMETextSpecification is not available because 'sentence_transformers' is not installed! Please install it manually." | |||
| ) | |||
| if os.path.exists(zip_path): | |||
| X = _get_from_client(zip_path, X) | |||
| else: | |||
| @@ -101,7 +102,7 @@ class RKMETextSpecification(RKMETableSpecification): | |||
| "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", cache_folder=cache_dir | |||
| ) | |||
| X = np.array(model.encode(X)) | |||
| except: | |||
| except Exception: | |||
| X = _get_from_client(zip_path, X) | |||
| return X | |||
| @@ -137,7 +137,9 @@ class HeteroMapTableSpecification(SystemStatSpecification): | |||
| for d in self.get_states(): | |||
| if d in embedding_load.keys(): | |||
| if d == "type" and embedding_load[d] != self.type: | |||
| raise TypeError(f"The type of loaded RKME ({embedding_load[d]}) is different from the expected type ({self.type})!") | |||
| raise TypeError( | |||
| f"The type of loaded RKME ({embedding_load[d]}) is different from the expected type ({self.type})!" | |||
| ) | |||
| setattr(self, d, embedding_load[d]) | |||
| def save(self, filepath: str) -> bool: | |||
| @@ -1 +1 @@ | |||
| from .utils import parametrize | |||
| from .utils import parametrize | |||
| @@ -13,10 +13,13 @@ class ModelTemplate: | |||
| class_name: str = field(init=False) | |||
| template_path: str = field(init=False) | |||
| model_kwargs: dict = field(init=False) | |||
| @dataclass | |||
| class PickleModelTemplate(ModelTemplate): | |||
| model_kwargs: dict | |||
| pickle_filepath: str | |||
| def __post_init__(self): | |||
| self.class_name = "PickleLoadedModel" | |||
| self.template_path = os.path.join(C.package_path, "tests", "templates", "pickle_model.py") | |||
| @@ -29,13 +32,14 @@ class PickleModelTemplate(ModelTemplate): | |||
| default_model_kwargs.update(self.model_kwargs) | |||
| self.model_kwargs = default_model_kwargs | |||
| @dataclass | |||
| class StatSpecTemplate: | |||
| filepath: str | |||
| type: str = field(default="RKMETableSpecification") | |||
| class LearnwareTemplate: | |||
| class LearnwareTemplate: | |||
| @staticmethod | |||
| def generate_requirements(filepath, requirements: Optional[List[Union[Tuple[str, str, str], str]]] = None): | |||
| requirements = [] if requirements is None else requirements | |||
| @@ -45,18 +49,20 @@ class LearnwareTemplate: | |||
| if isinstance(requirement, str): | |||
| line_str = requirement.strip() + "\n" | |||
| elif isinstance(requirement, tuple): | |||
| assert requirement[1] in operators, f"The operator of requirements is not supported." | |||
| assert requirement[1] in operators, "The operator of requirements is not supported." | |||
| line_str = requirement[0].strip() + requirement[1].strip() + requirement[2].strip() + "\n" | |||
| else: | |||
| raise TypeError(f"requirement must be type str/tuple, rather than {type(requirement)}") | |||
| requirements_str += line_str | |||
| with open(filepath, "w") as fdout: | |||
| fdout.write(requirements_str) | |||
| @staticmethod | |||
| def generate_learnware_yaml(filepath, model_config: Optional[dict] = None, stat_spec_config: Optional[List[dict]] = None): | |||
| def generate_learnware_yaml( | |||
| filepath, model_config: Optional[dict] = None, stat_spec_config: Optional[List[dict]] = None | |||
| ): | |||
| learnware_config = {} | |||
| if model_config is not None: | |||
| learnware_config["model"] = model_config | |||
| @@ -64,7 +70,7 @@ class LearnwareTemplate: | |||
| learnware_config["stat_specifications"] = stat_spec_config | |||
| save_dict_to_yaml(learnware_config, filepath) | |||
| @staticmethod | |||
| def generate_learnware_zipfile( | |||
| learnware_zippath: str, | |||
| @@ -75,27 +81,29 @@ class LearnwareTemplate: | |||
| with tempfile.TemporaryDirectory(suffix="learnware_template") as tempdir: | |||
| requirement_filepath = os.path.join(tempdir, "requirements.txt") | |||
| LearnwareTemplate.generate_requirements(requirement_filepath, requirements) | |||
| model_filepath = os.path.join(tempdir, "__init__.py") | |||
| model_filepath = os.path.join(tempdir, "__init__.py") | |||
| copyfile(model_template.template_path, model_filepath) | |||
| learnware_yaml_filepath = os.path.join(tempdir, "learnware.yaml") | |||
| model_config = { | |||
| "class_name": model_template.class_name, | |||
| "kwargs": model_template.model_kwargs, | |||
| } | |||
| stat_spec_config = { | |||
| "module_path": "learnware.specification", | |||
| "class_name": stat_spec_template.type, | |||
| "file_name": "stat_spec.json", | |||
| "kwargs": {} | |||
| "kwargs": {}, | |||
| } | |||
| copyfile(stat_spec_template.filepath, os.path.join(tempdir, stat_spec_config["file_name"])) | |||
| LearnwareTemplate.generate_learnware_yaml(learnware_yaml_filepath, model_config, stat_spec_config=[stat_spec_config]) | |||
| LearnwareTemplate.generate_learnware_yaml( | |||
| learnware_yaml_filepath, model_config, stat_spec_config=[stat_spec_config] | |||
| ) | |||
| if isinstance(model_template, PickleModelTemplate): | |||
| pickle_filepath = os.path.join(tempdir, model_template.model_kwargs["pickle_filename"]) | |||
| copyfile(model_template.pickle_filepath, pickle_filepath) | |||
| convert_folder_to_zipfile(tempdir, learnware_zippath) | |||
| @@ -7,7 +7,6 @@ from learnware.model.base import BaseModel | |||
| class PickleLoadedModel(BaseModel): | |||
| def __init__( | |||
| self, | |||
| input_shape, | |||
| @@ -25,10 +24,10 @@ class PickleLoadedModel(BaseModel): | |||
| self.predict_method = predict_method | |||
| self.fit_method = fit_method | |||
| self.finetune_method = finetune_method | |||
| def predict(self, X: np.ndarray) -> np.ndarray: | |||
| return getattr(self.model, self.predict_method)(X) | |||
| def fit(self, X: np.ndarray, y: np.ndarray): | |||
| getattr(self.model, self.fit_method)(X, y) | |||
| @@ -7,4 +7,4 @@ def parametrize(test_class, **kwargs): | |||
| _suite = unittest.TestSuite() | |||
| for name in test_names: | |||
| _suite.addTest(test_class(name, **kwargs)) | |||
| return _suite | |||
| return _suite | |||
| @@ -1,8 +1,7 @@ | |||
| import os | |||
| import zipfile | |||
| from .file import (convert_folder_to_zipfile, read_yaml_to_dict, | |||
| save_dict_to_yaml) | |||
| from .file import convert_folder_to_zipfile, read_yaml_to_dict, save_dict_to_yaml | |||
| from .gpu import allocate_cuda_idx, choose_device, setup_seed | |||
| from .import_utils import is_torch_available | |||
| from .module import get_module_by_module_path | |||
| @@ -16,6 +16,7 @@ def read_yaml_to_dict(yaml_path: str): | |||
| dict_value = yaml.load(file.read(), Loader=yaml.FullLoader) | |||
| return dict_value | |||
| def convert_folder_to_zipfile(folder_path, zip_path): | |||
| with zipfile.ZipFile(zip_path, "w") as zip_obj: | |||
| for foldername, subfolders, filenames in os.walk(folder_path): | |||
| @@ -17,6 +17,7 @@ def setup_seed(seed): | |||
| random.seed(seed) | |||
| if is_torch_available(verbose=False): | |||
| import torch | |||
| torch.manual_seed(seed) | |||
| torch.cuda.manual_seed_all(seed) | |||
| torch.backends.cudnn.deterministic = True | |||
| @@ -1,6 +1,6 @@ | |||
| import os | |||
| from setuptools import find_packages, setup | |||
| from enum import Enum | |||
| def read(rel_path: str) -> str: | |||
| here = os.path.abspath(os.path.dirname(__file__)) | |||
| @@ -100,7 +100,7 @@ if __name__ == "__main__": | |||
| install_requires=REQUIRED, | |||
| extras_require={ | |||
| "dev": DEV_REQUIRED, | |||
| "full": FULL_REQUIRED, | |||
| "full": FULL_REQUIRED, | |||
| }, | |||
| classifiers=[ | |||
| "Intended Audience :: Science/Research", | |||
| @@ -4,13 +4,11 @@ import tempfile | |||
| import logging | |||
| import learnware | |||
| learnware.init(logging_level=logging.WARNING) | |||
| from learnware.learnware import Learnware | |||
| from learnware.client import LearnwareClient | |||
| from learnware.market import instantiate_learnware_market, BaseUserInfo, EasySemanticChecker | |||
| from learnware.config import C | |||
| from learnware.market import instantiate_learnware_market, BaseUserInfo | |||
| learnware.init(logging_level=logging.WARNING) | |||
| class TestSearch(unittest.TestCase): | |||
| @@ -1,9 +1,7 @@ | |||
| import os | |||
| import json | |||
| import zipfile | |||
| import unittest | |||
| import tempfile | |||
| import argparse | |||
| from learnware.client import LearnwareClient | |||
| from learnware.specification import generate_semantic_spec | |||
| @@ -58,7 +56,7 @@ class TestAllLearnware(unittest.TestCase): | |||
| semantic_spec = self.client.get_semantic_specification(idx) | |||
| LearnwareClient.check_learnware(zip_path, semantic_spec) | |||
| print(f"check learnware {idx} succeed") | |||
| except: | |||
| except Exception: | |||
| failed_ids.append(idx) | |||
| print(f"check learnware {idx} failed!!!") | |||
| @@ -1,6 +1,4 @@ | |||
| import os | |||
| import json | |||
| import zipfile | |||
| import unittest | |||
| import tempfile | |||
| @@ -4,15 +4,16 @@ import numpy as np | |||
| from learnware.client import LearnwareClient | |||
| from learnware.client.container import LearnwaresContainer | |||
| class TestContainer(unittest.TestCase): | |||
| def __init__(self, method_name='runTest', mode="all"): | |||
| def __init__(self, method_name="runTest", mode="all"): | |||
| super(TestContainer, self).__init__(method_name) | |||
| self.modes = [] | |||
| if mode in {"all", "conda"}: | |||
| self.modes.append("conda") | |||
| if mode in {"all", "docker"}: | |||
| self.modes.append("docker") | |||
| def setUp(self): | |||
| self.client = LearnwareClient() | |||
| @@ -35,17 +36,19 @@ class TestContainer(unittest.TestCase): | |||
| def test_container_with_pip(self): | |||
| for mode in self.modes: | |||
| self._test_container_with_pip(mode=mode) | |||
| def test_container_with_conda(self): | |||
| for mode in self.modes: | |||
| self._test_container_with_conda(mode=mode) | |||
| def suite(): | |||
| _suite = unittest.TestSuite() | |||
| _suite.addTest(TestContainer("test_container_with_pip", mode="all")) | |||
| _suite.addTest(TestContainer("test_container_with_conda", mode="all")) | |||
| return _suite | |||
| if __name__ == "__main__": | |||
| runner = unittest.TextTestRunner() | |||
| runner.run(suite()) | |||
| runner.run(suite()) | |||
| @@ -5,8 +5,9 @@ import numpy as np | |||
| from learnware.client import LearnwareClient | |||
| from learnware.reuse import AveragingReuser | |||
| class TestLearnwareLoad(unittest.TestCase): | |||
| def __init__(self, method_name='runTest', mode="all"): | |||
| def __init__(self, method_name="runTest", mode="all"): | |||
| super(TestLearnwareLoad, self).__init__(method_name) | |||
| self.runnable_options = [] | |||
| if mode in {"all", "conda"}: | |||
| @@ -31,7 +32,6 @@ class TestLearnwareLoad(unittest.TestCase): | |||
| for learnware in learnware_list: | |||
| print(learnware.id, learnware.predict(input_array)) | |||
| def _test_load_learnware_by_id(self, runnable_option): | |||
| learnware_list = self.client.load_learnware(learnware_id=self.learnware_ids, runnable_option=runnable_option) | |||
| reuser = AveragingReuser(learnware_list, mode="vote_by_label") | |||
| @@ -44,11 +44,11 @@ class TestLearnwareLoad(unittest.TestCase): | |||
| def test_load_learnware_by_zippath(self): | |||
| for runnable_option in self.runnable_options: | |||
| self._test_load_learnware_by_zippath(runnable_option=runnable_option) | |||
| def test_load_learnware_by_id(self): | |||
| for runnable_option in self.runnable_options: | |||
| self._test_load_learnware_by_id(runnable_option=runnable_option) | |||
| def suite(): | |||
| _suite = unittest.TestSuite() | |||
| @@ -56,6 +56,7 @@ def suite(): | |||
| _suite.addTest(TestLearnwareLoad("test_load_learnware_by_id", mode="all")) | |||
| return _suite | |||
| if __name__ == "__main__": | |||
| runner = unittest.TextTestRunner() | |||
| runner.run(suite()) | |||
| runner.run(suite()) | |||
| @@ -60,7 +60,7 @@ class TestUpload(unittest.TestCase): | |||
| download_learnware_id = "00000084" | |||
| with tempfile.TemporaryDirectory(prefix="learnware_") as tempdir: | |||
| zip_path = os.path.join(tempdir, f"test.zip") | |||
| zip_path = os.path.join(tempdir, "test.zip") | |||
| self.client.download_learnware(download_learnware_id, zip_path) | |||
| learnware_id = self.client.upload_learnware( | |||
| learnware_zip_path=zip_path, semantic_specification=semantic_spec | |||
| @@ -1,8 +1,5 @@ | |||
| import os | |||
| import json | |||
| import string | |||
| import random | |||
| import torch | |||
| import unittest | |||
| import tempfile | |||
| import numpy as np | |||
| @@ -11,11 +8,11 @@ from learnware.specification import RKMETableSpecification, HeteroMapTableSpecif | |||
| from learnware.specification import generate_stat_spec | |||
| from learnware.market.heterogeneous.organizer import HeteroMap | |||
| class TestTableRKME(unittest.TestCase): | |||
| def setUp(self): | |||
| self.hetero_map = HeteroMap() | |||
| def _test_hetero_spec(self, X): | |||
| rkme: RKMETableSpecification = generate_stat_spec(type="table", X=X) | |||
| hetero_spec = self.hetero_map.hetero_mapping(rkme_spec=rkme, features=dict()) | |||
| @@ -30,14 +27,14 @@ class TestTableRKME(unittest.TestCase): | |||
| rkme2 = HeteroMapTableSpecification() | |||
| rkme2.load(rkme_path) | |||
| assert rkme2.type == "HeteroMapTableSpecification" | |||
| def test_hetero_rkme(self): | |||
| self._test_hetero_spec(np.random.uniform(-10000, 10000, size=(5000, 200))) | |||
| self._test_hetero_spec(np.random.uniform(-10000, 10000, size=(10000, 100))) | |||
| self._test_hetero_spec(np.random.uniform(-10000, 10000, size=(5, 20))) | |||
| self._test_hetero_spec(np.random.uniform(-10000, 10000, size=(1, 50))) | |||
| self._test_hetero_spec(np.random.uniform(-10000, 10000, size=(100, 150))) | |||
| if __name__ == "__main__": | |||
| unittest.main() | |||
| @@ -25,7 +25,7 @@ class TestImageRKME(unittest.TestCase): | |||
| rkme2 = RKMEImageSpecification() | |||
| rkme2.load(rkme_path) | |||
| assert rkme2.type == "RKMEImageSpecification" | |||
| def test_image_rkme(self): | |||
| self._test_image_rkme(np.random.randint(0, 255, size=(2000, 3, 32, 32))) | |||
| self._test_image_rkme(np.random.randint(0, 255, size=(100, 1, 128, 128))) | |||
| @@ -34,5 +34,6 @@ class TestImageRKME(unittest.TestCase): | |||
| self._test_image_rkme(torch.randint(0, 255, (20, 3, 128, 128))) | |||
| self._test_image_rkme(torch.randint(0, 255, (1, 1, 128, 128)) / 255) | |||
| if __name__ == "__main__": | |||
| unittest.main() | |||
| @@ -4,7 +4,7 @@ import unittest | |||
| import tempfile | |||
| import numpy as np | |||
| from learnware.specification import RKMETableSpecification, RKMEImageSpecification, RKMETextSpecification | |||
| from learnware.specification import RKMETableSpecification | |||
| from learnware.specification import generate_stat_spec | |||
| @@ -24,7 +24,7 @@ class TestTableRKME(unittest.TestCase): | |||
| rkme2 = RKMETableSpecification() | |||
| rkme2.load(rkme_path) | |||
| assert rkme2.type == "RKMETableSpecification" | |||
| def test_table_rkme(self): | |||
| self._test_table_rkme(np.random.uniform(-10000, 10000, size=(5000, 200))) | |||
| self._test_table_rkme(np.random.uniform(-10000, 10000, size=(10000, 100))) | |||
| @@ -32,5 +32,6 @@ class TestTableRKME(unittest.TestCase): | |||
| self._test_table_rkme(np.random.uniform(-10000, 10000, size=(1, 50))) | |||
| self._test_table_rkme(np.random.uniform(-10000, 10000, size=(100, 150))) | |||
| if __name__ == "__main__": | |||
| unittest.main() | |||
| @@ -12,19 +12,19 @@ from learnware.specification import generate_stat_spec | |||
| class TestTextRKME(unittest.TestCase): | |||
| @staticmethod | |||
| def generate_random_text_list(num, text_type="en", min_len=10, max_len=1000): | |||
| text_list = [] | |||
| for i in range(num): | |||
| length = random.randint(min_len, max_len) | |||
| if text_type == "en": | |||
| characters = string.ascii_letters + string.digits + string.punctuation | |||
| result_str = "".join(random.choice(characters) for i in range(length)) | |||
| text_list.append(result_str) | |||
| elif text_type == "zh": | |||
| result_str = "".join(chr(random.randint(0x4E00, 0x9FFF)) for i in range(length)) | |||
| text_list.append(result_str) | |||
| else: | |||
| raise ValueError("Type should be en or zh") | |||
| return text_list | |||
| text_list = [] | |||
| for i in range(num): | |||
| length = random.randint(min_len, max_len) | |||
| if text_type == "en": | |||
| characters = string.ascii_letters + string.digits + string.punctuation | |||
| result_str = "".join(random.choice(characters) for i in range(length)) | |||
| text_list.append(result_str) | |||
| elif text_type == "zh": | |||
| result_str = "".join(chr(random.randint(0x4E00, 0x9FFF)) for i in range(length)) | |||
| text_list.append(result_str) | |||
| else: | |||
| raise ValueError("Type should be en or zh") | |||
| return text_list | |||
| @staticmethod | |||
| def _test_text_rkme(X): | |||
| @@ -7,12 +7,9 @@ import tempfile | |||
| import zipfile | |||
| from sklearn.linear_model import Ridge | |||
| from sklearn.datasets import make_regression | |||
| from shutil import copyfile, rmtree | |||
| from sklearn.metrics import mean_squared_error | |||
| import learnware | |||
| learnware.init(logging_level=logging.WARNING) | |||
| from learnware.market import instantiate_learnware_market, BaseUserInfo | |||
| from learnware.specification import RKMETableSpecification, generate_rkme_table_spec, generate_semantic_spec | |||
| from learnware.reuse import HeteroMapAlignLearnware, AveragingReuser, EnsemblePruningReuser | |||
| @@ -20,9 +17,10 @@ from learnware.tests.templates import LearnwareTemplate, PickleModelTemplate, St | |||
| from hetero_config import input_shape_list, input_description_list, output_description_list, user_description_list | |||
| learnware.init(logging_level=logging.WARNING) | |||
| curr_root = os.path.dirname(os.path.abspath(__file__)) | |||
| class TestHeteroWorkflow(unittest.TestCase): | |||
| universal_semantic_config = { | |||
| "data_type": "Table", | |||
| @@ -46,10 +44,12 @@ class TestHeteroWorkflow(unittest.TestCase): | |||
| learnware_pool_dirpath = os.path.join(curr_root, "learnware_pool_hetero") | |||
| os.makedirs(learnware_pool_dirpath, exist_ok=True) | |||
| learnware_zippath = os.path.join(learnware_pool_dirpath, "ridge_%d.zip" % (i)) | |||
| print("Preparing Learnware: %d" % (i)) | |||
| X, y = make_regression(n_samples=5000, n_informative=15, n_features=input_shape_list[i % 2], noise=0.1, random_state=42) | |||
| X, y = make_regression( | |||
| n_samples=5000, n_informative=15, n_features=input_shape_list[i % 2], noise=0.1, random_state=42 | |||
| ) | |||
| clf = Ridge(alpha=1.0) | |||
| clf.fit(X, y) | |||
| pickle_filepath = os.path.join(learnware_pool_dirpath, "ridge.pkl") | |||
| @@ -62,20 +62,22 @@ class TestHeteroWorkflow(unittest.TestCase): | |||
| LearnwareTemplate.generate_learnware_zipfile( | |||
| learnware_zippath=learnware_zippath, | |||
| model_template=PickleModelTemplate(pickle_filepath=pickle_filepath, model_kwargs={"input_shape":(input_shape_list[i % 2],), "output_shape": (1,)}), | |||
| model_template=PickleModelTemplate( | |||
| pickle_filepath=pickle_filepath, | |||
| model_kwargs={"input_shape": (input_shape_list[i % 2],), "output_shape": (1,)}, | |||
| ), | |||
| stat_spec_template=StatSpecTemplate(filepath=spec_filepath, type="RKMETableSpecification"), | |||
| requirements=["scikit-learn==0.22"], | |||
| ) | |||
| self.zip_path_list.append(learnware_zippath) | |||
| def _upload_delete_learnware(self, hetero_market, learnware_num, delete): | |||
| self.test_prepare_learnware_randomly(learnware_num) | |||
| self.learnware_num = learnware_num | |||
| print("Total Item:", len(hetero_market)) | |||
| assert len(hetero_market) == 0, f"The market should be empty!" | |||
| assert len(hetero_market) == 0, "The market should be empty!" | |||
| for idx, zip_path in enumerate(self.zip_path_list): | |||
| semantic_spec = generate_semantic_spec( | |||
| @@ -83,7 +85,7 @@ class TestHeteroWorkflow(unittest.TestCase): | |||
| description=f"test_learnware_number_{idx}", | |||
| input_description=input_description_list[idx % 2], | |||
| output_description=output_description_list[idx % 2], | |||
| **self.universal_semantic_config | |||
| **self.universal_semantic_config, | |||
| ) | |||
| hetero_market.add_learnware(zip_path, semantic_spec) | |||
| @@ -103,10 +105,10 @@ class TestHeteroWorkflow(unittest.TestCase): | |||
| curr_inds = hetero_market.get_learnware_ids() | |||
| print("Available ids After Deleting Learnwares:", curr_inds) | |||
| assert len(curr_inds) == 0, f"The market should be empty!" | |||
| assert len(curr_inds) == 0, "The market should be empty!" | |||
| return hetero_market | |||
| def test_upload_delete_learnware(self, learnware_num=5, delete=True): | |||
| hetero_market = self._init_learnware_market() | |||
| return self._upload_delete_learnware(hetero_market, learnware_num, delete) | |||
| @@ -129,32 +131,32 @@ class TestHeteroWorkflow(unittest.TestCase): | |||
| name=f"learnware_{learnware_num - 1}", | |||
| **self.universal_semantic_config, | |||
| ) | |||
| user_info = BaseUserInfo(semantic_spec=semantic_spec) | |||
| search_result = hetero_market.search_learnware(user_info) | |||
| single_result = search_result.get_single_results() | |||
| print(f"Search result1:") | |||
| assert len(single_result) == 1, f"Exact semantic search failed!" | |||
| print("Search result1:") | |||
| assert len(single_result) == 1, "Exact semantic search failed!" | |||
| for search_item in single_result: | |||
| semantic_spec1 = search_item.learnware.get_specification().get_semantic_spec() | |||
| print("Choose learnware:", search_item.learnware.id) | |||
| assert semantic_spec1["Name"]["Values"] == semantic_spec["Name"]["Values"], f"Exact semantic search failed!" | |||
| assert semantic_spec1["Name"]["Values"] == semantic_spec["Name"]["Values"], "Exact semantic search failed!" | |||
| semantic_spec["Name"]["Values"] = "laernwaer" | |||
| user_info = BaseUserInfo(semantic_spec=semantic_spec) | |||
| search_result = hetero_market.search_learnware(user_info) | |||
| single_result = search_result.get_single_results() | |||
| print(f"Search result2:") | |||
| assert len(single_result) == self.learnware_num, f"Fuzzy semantic search failed!" | |||
| print("Search result2:") | |||
| assert len(single_result) == self.learnware_num, "Fuzzy semantic search failed!" | |||
| for search_item in single_result: | |||
| print("Choose learnware:", search_item.learnware.id) | |||
| def test_hetero_stat_search(self, learnware_num=5): | |||
| hetero_market = self.test_train_market_model(learnware_num, delete=False) | |||
| print("Total Item:", len(hetero_market)) | |||
| user_dim = 15 | |||
| with tempfile.TemporaryDirectory(prefix="learnware_test_hetero") as test_folder: | |||
| @@ -174,7 +176,10 @@ class TestHeteroWorkflow(unittest.TestCase): | |||
| semantic_spec = generate_semantic_spec( | |||
| input_description={ | |||
| "Dimension": user_dim, | |||
| "Description": {str(key): input_description_list[idx % 2]["Description"][str(key)] for key in range(user_dim)}, | |||
| "Description": { | |||
| str(key): input_description_list[idx % 2]["Description"][str(key)] | |||
| for key in range(user_dim) | |||
| }, | |||
| }, | |||
| **self.universal_semantic_config, | |||
| ) | |||
| @@ -182,7 +187,7 @@ class TestHeteroWorkflow(unittest.TestCase): | |||
| search_result = hetero_market.search_learnware(user_info) | |||
| single_result = search_result.get_single_results() | |||
| multiple_result = search_result.get_multiple_results() | |||
| print(f"search result of user{idx}:") | |||
| for single_item in single_result: | |||
| print(f"score: {single_item.score}, learnware_id: {single_item.learnware.id}") | |||
| @@ -199,7 +204,7 @@ class TestHeteroWorkflow(unittest.TestCase): | |||
| search_result = hetero_market.search_learnware(user_info) | |||
| single_result = search_result.get_single_results() | |||
| assert len(single_result) == 0, f"Statistical search failed!" | |||
| assert len(single_result) == 0, "Statistical search failed!" | |||
| # delete key "Task" in semantic_spec, use homo search and print WARNING INFO with "User doesn't provide correct task type" | |||
| print(">> delele key 'Task' test:") | |||
| @@ -208,14 +213,17 @@ class TestHeteroWorkflow(unittest.TestCase): | |||
| search_result = hetero_market.search_learnware(user_info) | |||
| single_result = search_result.get_single_results() | |||
| assert len(single_result) == 0, f"Statistical search failed!" | |||
| assert len(single_result) == 0, "Statistical search failed!" | |||
| # modify semantic info with mismatch dim, use homo search and print "User data feature dimensions mismatch with semantic specification." | |||
| print(">> mismatch dim test") | |||
| semantic_spec = generate_semantic_spec( | |||
| input_description={ | |||
| "Dimension": user_dim - 2, | |||
| "Description": {str(key): input_description_list[idx % 2]["Description"][str(key)] for key in range(user_dim)}, | |||
| "Description": { | |||
| str(key): input_description_list[idx % 2]["Description"][str(key)] | |||
| for key in range(user_dim) | |||
| }, | |||
| }, | |||
| **self.universal_semantic_config, | |||
| ) | |||
| @@ -223,12 +231,12 @@ class TestHeteroWorkflow(unittest.TestCase): | |||
| search_result = hetero_market.search_learnware(user_info) | |||
| single_result = search_result.get_single_results() | |||
| assert len(single_result) == 0, f"Statistical search failed!" | |||
| assert len(single_result) == 0, "Statistical search failed!" | |||
| def test_homo_stat_search(self, learnware_num=5): | |||
| hetero_market = self.test_train_market_model(learnware_num, delete=False) | |||
| print("Total Item:", len(hetero_market)) | |||
| with tempfile.TemporaryDirectory(prefix="learnware_test_hetero") as test_folder: | |||
| for idx, zip_path in enumerate(self.zip_path_list): | |||
| with zipfile.ZipFile(zip_path, "r") as zip_obj: | |||
| @@ -242,7 +250,7 @@ class TestHeteroWorkflow(unittest.TestCase): | |||
| single_result = search_result.get_single_results() | |||
| multiple_result = search_result.get_multiple_results() | |||
| assert len(single_result) >= 1, f"Statistical search failed!" | |||
| assert len(single_result) >= 1, "Statistical search failed!" | |||
| print(f"search result of user{idx}:") | |||
| for single_item in single_result: | |||
| print(f"score: {single_item.score}, learnware_id: {single_item.learnware.id}") | |||
| @@ -260,7 +268,9 @@ class TestHeteroWorkflow(unittest.TestCase): | |||
| user_spec = generate_rkme_table_spec(X=X, gamma=0.1, cuda_idx=0) | |||
| # generate specification | |||
| semantic_spec = generate_semantic_spec(input_description=user_description_list[0], **self.universal_semantic_config) | |||
| semantic_spec = generate_semantic_spec( | |||
| input_description=user_description_list[0], **self.universal_semantic_config | |||
| ) | |||
| user_info = BaseUserInfo(semantic_spec=semantic_spec, stat_info={"RKMETableSpecification": user_spec}) | |||
| # learnware market search | |||
| @@ -268,7 +278,7 @@ class TestHeteroWorkflow(unittest.TestCase): | |||
| search_result = hetero_market.search_learnware(user_info) | |||
| single_result = search_result.get_single_results() | |||
| multiple_result = search_result.get_multiple_results() | |||
| # print search results | |||
| for single_item in single_result: | |||
| print(f"score: {single_item.score}, learnware_id: {single_item.learnware.id}") | |||
| @@ -285,8 +295,8 @@ class TestHeteroWorkflow(unittest.TestCase): | |||
| # multi model reuse | |||
| hetero_learnware_list = [] | |||
| for learnware in multiple_result[0].learnwares: | |||
| hetero_learnware = HeteroMapAlignLearnware(learnware, mode="regression") | |||
| for org_learnware in multiple_result[0].learnwares: | |||
| hetero_learnware = HeteroMapAlignLearnware(org_learnware, mode="regression") | |||
| hetero_learnware.align(user_spec, X[:100], y[:100]) | |||
| hetero_learnware_list.append(hetero_learnware) | |||
| @@ -306,9 +316,9 @@ class TestHeteroWorkflow(unittest.TestCase): | |||
| def suite(): | |||
| _suite = unittest.TestSuite() | |||
| #_suite.addTest(TestHeteroWorkflow("test_prepare_learnware_randomly")) | |||
| #_suite.addTest(TestHeteroWorkflow("test_upload_delete_learnware")) | |||
| #_suite.addTest(TestHeteroWorkflow("test_train_market_model")) | |||
| # _suite.addTest(TestHeteroWorkflow("test_prepare_learnware_randomly")) | |||
| # _suite.addTest(TestHeteroWorkflow("test_upload_delete_learnware")) | |||
| # _suite.addTest(TestHeteroWorkflow("test_train_market_model")) | |||
| _suite.addTest(TestHeteroWorkflow("test_search_semantics")) | |||
| _suite.addTest(TestHeteroWorkflow("test_hetero_stat_search")) | |||
| _suite.addTest(TestHeteroWorkflow("test_homo_stat_search")) | |||
| @@ -10,17 +10,16 @@ from sklearn.datasets import load_digits | |||
| from sklearn.model_selection import train_test_split | |||
| import learnware | |||
| learnware.init(logging_level=logging.WARNING) | |||
| from learnware.market import instantiate_learnware_market, BaseUserInfo | |||
| from learnware.specification import RKMETableSpecification, generate_rkme_table_spec, generate_semantic_spec | |||
| from learnware.reuse import JobSelectorReuser, AveragingReuser, EnsemblePruningReuser, FeatureAugmentReuser | |||
| from learnware.tests.templates import LearnwareTemplate, PickleModelTemplate, StatSpecTemplate | |||
| learnware.init(logging_level=logging.WARNING) | |||
| curr_root = os.path.dirname(os.path.abspath(__file__)) | |||
| class TestWorkflow(unittest.TestCase): | |||
| universal_semantic_config = { | |||
| "data_type": "Table", | |||
| "task_type": "Classification", | |||
| @@ -28,7 +27,7 @@ class TestWorkflow(unittest.TestCase): | |||
| "scenarios": "Education", | |||
| "license": "MIT", | |||
| } | |||
| def _init_learnware_market(self): | |||
| """initialize learnware market""" | |||
| easy_market = instantiate_learnware_market(market_id="sklearn_digits_easy", name="easy", rebuild=True) | |||
| @@ -42,7 +41,7 @@ class TestWorkflow(unittest.TestCase): | |||
| learnware_pool_dirpath = os.path.join(curr_root, "learnware_pool") | |||
| os.makedirs(learnware_pool_dirpath, exist_ok=True) | |||
| learnware_zippath = os.path.join(learnware_pool_dirpath, "svm_%d.zip" % (i)) | |||
| print("Preparing Learnware: %d" % (i)) | |||
| data_X, _, data_y, _ = train_test_split(X, y, test_size=0.3, shuffle=True) | |||
| clf = svm.SVC(kernel="linear", probability=True) | |||
| @@ -54,14 +53,17 @@ class TestWorkflow(unittest.TestCase): | |||
| spec = generate_rkme_table_spec(X=data_X, gamma=0.1, cuda_idx=0) | |||
| spec_filepath = os.path.join(learnware_pool_dirpath, "stat_spec.json") | |||
| spec.save(spec_filepath) | |||
| LearnwareTemplate.generate_learnware_zipfile( | |||
| learnware_zippath=learnware_zippath, | |||
| model_template=PickleModelTemplate(pickle_filepath=pickle_filepath, model_kwargs={"input_shape":(64,), "output_shape": (10,), "predict_method": "predict_proba"}), | |||
| model_template=PickleModelTemplate( | |||
| pickle_filepath=pickle_filepath, | |||
| model_kwargs={"input_shape": (64,), "output_shape": (10,), "predict_method": "predict_proba"}, | |||
| ), | |||
| stat_spec_template=StatSpecTemplate(filepath=spec_filepath, type="RKMETableSpecification"), | |||
| requirements=["scikit-learn==0.22"], | |||
| ) | |||
| self.zip_path_list.append(learnware_zippath) | |||
| def test_upload_delete_learnware(self, learnware_num=5, delete=True): | |||
| @@ -70,7 +72,7 @@ class TestWorkflow(unittest.TestCase): | |||
| self.learnware_num = learnware_num | |||
| print("Total Item:", len(easy_market)) | |||
| assert len(easy_market) == 0, f"The market should be empty!" | |||
| assert len(easy_market) == 0, "The market should be empty!" | |||
| for idx, zip_path in enumerate(self.zip_path_list): | |||
| semantic_spec = generate_semantic_spec( | |||
| @@ -87,7 +89,7 @@ class TestWorkflow(unittest.TestCase): | |||
| "Dimension": 10, | |||
| "Description": {f"{i}": "The probability for each digit for 0 to 9." for i in range(10)}, | |||
| }, | |||
| **self.universal_semantic_config | |||
| **self.universal_semantic_config, | |||
| ) | |||
| easy_market.add_learnware(zip_path, semantic_spec) | |||
| @@ -105,7 +107,7 @@ class TestWorkflow(unittest.TestCase): | |||
| curr_inds = easy_market.get_learnware_ids() | |||
| print("Available ids After Deleting Learnwares:", curr_inds) | |||
| assert len(curr_inds) == 0, f"The market should be empty!" | |||
| assert len(curr_inds) == 0, "The market should be empty!" | |||
| return easy_market | |||
| @@ -113,7 +115,7 @@ class TestWorkflow(unittest.TestCase): | |||
| easy_market = self.test_upload_delete_learnware(learnware_num, delete=False) | |||
| print("Total Item:", len(easy_market)) | |||
| assert len(easy_market) == self.learnware_num, f"The number of learnwares must be {self.learnware_num}!" | |||
| with tempfile.TemporaryDirectory(prefix="learnware_test_workflow") as test_folder: | |||
| with zipfile.ZipFile(self.zip_path_list[0], "r") as zip_obj: | |||
| zip_obj.extractall(path=test_folder) | |||
| @@ -123,15 +125,15 @@ class TestWorkflow(unittest.TestCase): | |||
| description=f"test_learnware_number_{learnware_num - 1}", | |||
| **self.universal_semantic_config, | |||
| ) | |||
| user_info = BaseUserInfo(semantic_spec=semantic_spec) | |||
| search_result = easy_market.search_learnware(user_info) | |||
| single_result = search_result.get_single_results() | |||
| print(f"Search result:") | |||
| print("Search result:") | |||
| for search_item in single_result: | |||
| print("Choose learnware:",search_item.learnware.id) | |||
| print("Choose learnware:", search_item.learnware.id) | |||
| def test_stat_search(self, learnware_num=5): | |||
| easy_market = self.test_upload_delete_learnware(learnware_num, delete=False) | |||
| print("Total Item:", len(easy_market)) | |||
| @@ -150,7 +152,7 @@ class TestWorkflow(unittest.TestCase): | |||
| single_result = search_results.get_single_results() | |||
| multiple_result = search_results.get_multiple_results() | |||
| assert len(single_result) >= 1, f"Statistical search failed!" | |||
| assert len(single_result) >= 1, "Statistical search failed!" | |||
| print(f"search result of user{idx}:") | |||
| for search_item in single_result: | |||
| print(f"score: {search_item.score}, learnware_id: {search_item.learnware.id}") | |||