From f281e3efed2e85ef41906c010e237737bed17aae Mon Sep 17 00:00:00 2001 From: Asymptotez <201220101@smail.nju.edu.cn> Date: Wed, 6 Dec 2023 14:12:00 +0800 Subject: [PATCH] [MNT] add param search for original text benchmark and add a new text benchmark: 20newsgroups --- examples/dataset_text_workflow/main.py | 4 +- .../dataset_text_workflow/param_search.py | 115 +++++ examples/dataset_text_workflow/utils.py | 33 +- .../example_files/example_init.py | 30 ++ .../example_files/example_yaml.yaml | 8 + .../example_files/requirements.txt | 3 + examples/dataset_text_workflow2/get_data.py | 12 + examples/dataset_text_workflow2/main.py | 413 ++++++++++++++++++ examples/dataset_text_workflow2/utils.py | 145 ++++++ 9 files changed, 757 insertions(+), 6 deletions(-) create mode 100644 examples/dataset_text_workflow/param_search.py create mode 100644 examples/dataset_text_workflow2/example_files/example_init.py create mode 100644 examples/dataset_text_workflow2/example_files/example_yaml.yaml create mode 100644 examples/dataset_text_workflow2/example_files/requirements.txt create mode 100644 examples/dataset_text_workflow2/get_data.py create mode 100644 examples/dataset_text_workflow2/main.py create mode 100644 examples/dataset_text_workflow2/utils.py diff --git a/examples/dataset_text_workflow/main.py b/examples/dataset_text_workflow/main.py index 1153e7c..42fee03 100644 --- a/examples/dataset_text_workflow/main.py +++ b/examples/dataset_text_workflow/main.py @@ -402,10 +402,10 @@ class TextDatasetWorkflow: plt.xlabel("Labeled Data Size") plt.ylabel("Accuracy") - plt.title(f"User{name} Text Limited Labeled Data") + plt.title(f"{name} Text Limited Labeled Data") plt.legend() plt.tight_layout() - plt.savefig(os.path.join("figs", f"User{name}_text_labeled_curves.png"), bbox_inches="tight", dpi=700) + plt.savefig(os.path.join("figs", f"{name}_text_labeled_curves.png"), bbox_inches="tight", dpi=700) if __name__ == "__main__": diff --git a/examples/dataset_text_workflow/param_search.py b/examples/dataset_text_workflow/param_search.py new file mode 100644 index 0000000..db51fdd --- /dev/null +++ b/examples/dataset_text_workflow/param_search.py @@ -0,0 +1,115 @@ +import json +import pickle + +import joblib +import numpy as np +import os, warnings +import lightgbm as lgb +from sklearn.feature_extraction.text import TfidfVectorizer +from sklearn.metrics import mean_squared_error, accuracy_score +from sklearn.model_selection import train_test_split +from hyperopt import fmin, tpe, hp, Trials, space_eval, STATUS_OK + +warnings.filterwarnings("ignore") + + +def train_lgb(params, train_x, val_x, train_y, val_y, num_class, classification, model_path, save=False): + train_data = lgb.Dataset(train_x, label=train_y) + val_data = lgb.Dataset(val_x, label=val_y) + + if classification: + lgb_params = { + **params, + # "boosting": "dart", + "learning_rate": 0.1, + "importance_type": "gain", + # "class_weight": 'balanced', + "objective": "multiclass", + "num_class": num_class, + "n_estimators": 1000, + "early_stopping_round": 30, + 'max_bin': 512, + "verbose": -1, + } + else: + lgb_params = { + **params, + # "boosting": "dart", + "learning_rate": 0.1, + "importance_type": "gain", + "objective": "regression", + "n_estimators": 1000, + "early_stopping_round": 30, + 'max_bin': 512, + "verbose": -1, + } + gbm = lgb.train(lgb_params, train_data, valid_sets=[val_data]) + pred_val_y = gbm.predict(val_x, num_iteration=gbm.best_iteration) + if classification: + pred_val_y = np.argmax(pred_val_y, 1) + res = 1 - accuracy_score(val_y, pred_val_y) + else: + res = mean_squared_error(val_y, pred_val_y, squared=False) + + pred_train_y = gbm.predict(train_x, num_iteration=gbm.best_iteration) + if classification: + pred_train_y = np.argmax(pred_train_y, 1) + res_train = 1 - accuracy_score(train_y, pred_train_y) + else: + res_train = mean_squared_error(train_y, pred_train_y, squared=False) + + # print(params, res) + if save: + # gbm.save_model(model_path) + # with open(model_path, 'wb') as f: + # pickle.dump(gbm, f) + print(params, res_train, res) + return {'loss': res, 'status': STATUS_OK, 'param': params} + + +def grid_search(train_x, val_x, train_y, val_y, num_class, classification, model_path): + def objective(params): + return train_lgb(params, train_x, val_x, train_y, val_y, num_class, classification, model_path, save=False) + + if classification: + space = { + 'max_depth': hp.choice('max_depth', [3, 4, 5]), + 'num_leaves': hp.choice('num_leaves', [5, 6, 7, 12, 13, 14, 15, 28, 29, 30, 31]), + 'subsample': hp.choice('subsample', [0.6, 0.8, 0.9, 1.0]), + 'colsample_bytree': hp.choice('colsample_bytree', [0.6, 0.8, 0.9, 1.0]), + 'reg_alpha': hp.loguniform('reg_alpha', np.log(0.01), np.log(1000)), + 'reg_lambda': hp.loguniform('reg_lambda', np.log(0.01), np.log(1000)), + 'min_data_in_leaf': hp.choice('min_data_in_leaf', [5, 20, 50]), + } + else: + space = { + 'max_depth': hp.choice('max_depth', [3, 4, 5]), + 'num_leaves': hp.choice('num_leaves', [5, 6, 7, 12, 13, 14, 15, 28, 29, 30, 31]), + 'subsample': hp.choice('subsample', [0.6, 0.8, 0.9, 1.0]), + 'colsample_bytree': hp.choice('colsample_bytree', [0.6, 0.8, 0.9, 1.0]), + 'reg_alpha': hp.loguniform('reg_alpha', np.log(0.01), np.log(1000)), + 'reg_lambda': hp.loguniform('reg_lambda', np.log(0.01), np.log(1000)) + } + trials = Trials() + best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=30, trials=trials) + best_params = trials.best_trial['result']['param'] + res = train_lgb(best_params, train_x, val_x, train_y, val_y, num_class, classification, model_path, save=True)[ + 'param'] + return res + + +if __name__ == "__main__": + X_path = os.path.join("data/processed_data/ae/uploader/uploader_0_X.pkl") + y_path = os.path.join("data/processed_data/ae/uploader/uploader_0_y.pkl") + with open(X_path, "rb") as f: + X = pickle.load(f) + with open(y_path, "rb") as f: + y = pickle.load(f) + + X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42) + + vectorizer = TfidfVectorizer(stop_words="english") + X_trian_tfidf = vectorizer.fit_transform(X_train) + X_valid_tfidf = vectorizer.transform(X_valid) + + res = grid_search(X_trian_tfidf, X_valid_tfidf, y_train, y_valid, 3, True, "models/model.txt") diff --git a/examples/dataset_text_workflow/utils.py b/examples/dataset_text_workflow/utils.py index 0e30d5b..68d7c80 100644 --- a/examples/dataset_text_workflow/utils.py +++ b/examples/dataset_text_workflow/utils.py @@ -3,8 +3,9 @@ import pickle import numpy as np import pandas as pd -from lightgbm import LGBMClassifier +from lightgbm import LGBMClassifier, Booster from sklearn.feature_extraction.text import TfidfVectorizer +from sklearn.model_selection import train_test_split class TextDataLoader: @@ -43,7 +44,7 @@ def generate_uploader(data_x: pd.Series, data_y: pd.Series, n_uploaders=50, data for i in range(n_uploaders): indices = data_x["discourse_type"] == types[i] - selected_X = data_x[indices]["discourse_text"].to_list() + selected_X = (types[i] + ' ' + data_x[indices]["discourse_text"]).to_list() selected_y = data_y[indices].to_list() X_save_dir = os.path.join(data_save_root, "uploader_%d_X.pkl" % (i)) @@ -65,7 +66,7 @@ def generate_user(data_x, data_y, n_users=50, data_save_root=None): for i in range(n_users): indices = data_x["discourse_type"] == types[i] - selected_X = data_x[indices]["discourse_text"].to_list() + selected_X = (types[i] + ' ' + data_x[indices]["discourse_text"]).to_list() selected_y = data_y[indices].to_list() X_save_dir = os.path.join(data_save_root, "user_%d_X.pkl" % (i)) @@ -78,12 +79,36 @@ def generate_user(data_x, data_y, n_users=50, data_save_root=None): print("Saving to %s" % (X_save_dir)) +from param_search import grid_search + + # Train Uploaders' models def train(X, y, out_classes): vectorizer = TfidfVectorizer(stop_words="english") X_tfidf = vectorizer.fit_transform(X) - lgbm = LGBMClassifier(boosting_type="dart", n_estimators=500, num_leaves=21, verbosity=-1) + X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42) + + X_trian_tfidf = vectorizer.transform(X_train) + X_valid_tfidf = vectorizer.transform(X_valid) + + model_path = "models/model.pkl" + best_params = grid_search(X_trian_tfidf, X_valid_tfidf, y_train, y_valid, out_classes, True, model_path) + + # lgbm = Booster(model_file="models/model.txt") + # with open(model_path, "rb") as f: + # lgbm = pickle.load(f) + + param = { + "learning_rate": 0.1, + "importance_type": "gain", + "objective": "multiclass", + "num_class": out_classes, + "n_estimators": 1000, + 'max_bin': 512, + "verbose": -1, + **best_params} + lgbm = LGBMClassifier(**param) lgbm.fit(X_tfidf, y) return vectorizer, lgbm diff --git a/examples/dataset_text_workflow2/example_files/example_init.py b/examples/dataset_text_workflow2/example_files/example_init.py new file mode 100644 index 0000000..eede4a0 --- /dev/null +++ b/examples/dataset_text_workflow2/example_files/example_init.py @@ -0,0 +1,30 @@ +import os +import pickle + +import numpy as np + +from learnware.model import BaseModel + + +class Model(BaseModel): + def __init__(self): + super(Model, self).__init__(input_shape=(1,), output_shape=(20,)) + dir_path = os.path.dirname(os.path.abspath(__file__)) + + modelv_path = os.path.join(dir_path, "modelv.pth") + with open(modelv_path, "rb") as f: + self.modelv = pickle.load(f) + + modell_path = os.path.join(dir_path, "modell.pth") + with open(modell_path, "rb") as f: + self.modell = pickle.load(f) + + def fit(self, X: np.ndarray, y: np.ndarray): + pass + + def predict(self, X: np.ndarray) -> np.ndarray: + # predict -> predict_proba + return self.modell.predict_proba(self.modelv.transform(X)) + + def finetune(self, X: np.ndarray, y: np.ndarray): + pass diff --git a/examples/dataset_text_workflow2/example_files/example_yaml.yaml b/examples/dataset_text_workflow2/example_files/example_yaml.yaml new file mode 100644 index 0000000..d29f7dd --- /dev/null +++ b/examples/dataset_text_workflow2/example_files/example_yaml.yaml @@ -0,0 +1,8 @@ +model: + class_name: Model + kwargs: { } +stat_specifications: + - module_path: learnware.specification + class_name: RKMETextSpecification + file_name: rkme.json + kwargs: { } \ No newline at end of file diff --git a/examples/dataset_text_workflow2/example_files/requirements.txt b/examples/dataset_text_workflow2/example_files/requirements.txt new file mode 100644 index 0000000..1a4b344 --- /dev/null +++ b/examples/dataset_text_workflow2/example_files/requirements.txt @@ -0,0 +1,3 @@ +numpy +pickle +scikit-learn \ No newline at end of file diff --git a/examples/dataset_text_workflow2/get_data.py b/examples/dataset_text_workflow2/get_data.py new file mode 100644 index 0000000..36e2161 --- /dev/null +++ b/examples/dataset_text_workflow2/get_data.py @@ -0,0 +1,12 @@ +import os +import json +import numpy as np +from sklearn.datasets import fetch_20newsgroups +import pandas as pd + + +def get_data(data_root): + X_train, y_train = fetch_20newsgroups(data_home=data_root, subset='train', return_X_y=True) + X_test, y_test = fetch_20newsgroups(data_home=data_root, subset='test', return_X_y=True) + + return X_train, y_train, X_test, y_test \ No newline at end of file diff --git a/examples/dataset_text_workflow2/main.py b/examples/dataset_text_workflow2/main.py new file mode 100644 index 0000000..75951b8 --- /dev/null +++ b/examples/dataset_text_workflow2/main.py @@ -0,0 +1,413 @@ +import os +import fire +import pickle +import time +import zipfile +from shutil import copyfile, rmtree +import random + +import numpy as np + +import learnware.specification as specification +from get_data import get_data +from learnware.logger import get_module_logger +from learnware.market import instantiate_learnware_market, BaseUserInfo +from learnware.reuse import JobSelectorReuser, AveragingReuser, EnsemblePruningReuser, FeatureAugmentReuser +from utils import generate_uploader, generate_user, TextDataLoader, train, eval_prediction +from learnware.client import LearnwareClient, SemanticSpecificationKey +import matplotlib.pyplot as plt + +# Login to Beiming system +client = LearnwareClient() + +logger = get_module_logger("text_workflow", level="INFO") +origin_data_root = "./data/origin_data" +processed_data_root = "./data/processed_data" +tmp_dir = "./data/tmp" +learnware_pool_dir = "./data/learnware_pool" +dataset = "20newsgroups" +n_uploaders = 5 +n_users = 5 +n_classes = 20 +n_labeled_list = [100, 200, 500, 1000, 2000, 4000, 6000, 8000, 10000] +repeated_list = [10, 10, 10, 3, 3, 3, 3, 3, 3] + +data_root = os.path.join(origin_data_root, dataset) +data_save_root = os.path.join(processed_data_root, dataset) +user_save_root = os.path.join(data_save_root, "user") +uploader_save_root = os.path.join(data_save_root, "uploader") +model_save_root = os.path.join(data_save_root, "uploader_model") +os.makedirs(data_root, exist_ok=True) +os.makedirs(user_save_root, exist_ok=True) +os.makedirs(uploader_save_root, exist_ok=True) +os.makedirs(model_save_root, exist_ok=True) + +output_description = { + "Dimension": 20, + "Description": {"0": "0", "1": "1", "2": "2", "3": "3", "4": "4", "5": "5", "6": "6", + "7": "7", "8": "8", "9": "9", "10": "10", "11": "11", "12": "12", "13": "13", + "14": "14", "15": "15", "16": "16", "17": "17", "18": "18", "19": "19"} +} + +semantic_spec = client.create_semantic_specification( + name="learnware_example", + description="Just a example for text learnware", + data_type="Text", + task_type="Classification", + library_type="Scikit-learn", + scenarios=["Education"], + license="MIT", + input_description=None, + output_description=output_description, +) + +user_semantic = client.create_semantic_specification( + # name="learnware_example", + description="Just a example for text learnware", + data_type="Text", + task_type="Classification", + library_type="Scikit-learn", + scenarios=["Education"], + license="MIT", + input_description=None, + output_description=output_description, +) + + +class TextDatasetWorkflow: + def _init_text_dataset(self): + self._prepare_data() + self._prepare_model() + + def _prepare_data(self): + X_train, y_train, X_test, y_test = get_data(data_root) + + generate_uploader(X_train, y_train, n_uploaders=n_uploaders, data_save_root=uploader_save_root) + generate_user(X_test, y_test, n_users=n_users, data_save_root=user_save_root) + + def _prepare_model(self): + dataloader = TextDataLoader(data_save_root, train=True) + for i in range(n_uploaders): + logger.info("Train on uploader: %d" % (i)) + X, y = dataloader.get_idx_data(i) + vectorizer, clf = train(X, y, out_classes=n_classes) + + modelv_save_path = os.path.join(model_save_root, "uploader_v_%d.pth" % (i)) + modell_save_path = os.path.join(model_save_root, "uploader_l_%d.pth" % (i)) + + with open(modelv_save_path, "wb") as f: + pickle.dump(vectorizer, f) + + with open(modell_save_path, "wb") as f: + pickle.dump(clf, f) + + logger.info("Model saved to '%s' and '%s'" % (modelv_save_path, modell_save_path)) + + def _prepare_learnware( + self, data_path, modelv_path, modell_path, init_file_path, yaml_path, env_file_path, save_root, zip_name + ): + os.makedirs(save_root, exist_ok=True) + tmp_spec_path = os.path.join(save_root, "rkme.json") + + tmp_modelv_path = os.path.join(save_root, "modelv.pth") + tmp_modell_path = os.path.join(save_root, "modell.pth") + + tmp_yaml_path = os.path.join(save_root, "learnware.yaml") + tmp_init_path = os.path.join(save_root, "__init__.py") + tmp_env_path = os.path.join(save_root, "requirements.txt") + + with open(data_path, "rb") as f: + X = pickle.load(f) + + st = time.time() + + user_spec = specification.RKMETextSpecification() + + user_spec.generate_stat_spec_from_data(X=X) + ed = time.time() + logger.info("Stat spec generated in %.3f s" % (ed - st)) + user_spec.save(tmp_spec_path) + + copyfile(modelv_path, tmp_modelv_path) + copyfile(modell_path, tmp_modell_path) + + copyfile(yaml_path, tmp_yaml_path) + copyfile(init_file_path, tmp_init_path) + copyfile(env_file_path, tmp_env_path) + zip_file_name = os.path.join(learnware_pool_dir, "%s.zip" % (zip_name)) + with zipfile.ZipFile(zip_file_name, "w", compression=zipfile.ZIP_DEFLATED) as zip_obj: + zip_obj.write(tmp_spec_path, "rkme.json") + + zip_obj.write(tmp_modelv_path, "modelv.pth") + zip_obj.write(tmp_modell_path, "modell.pth") + + zip_obj.write(tmp_yaml_path, "learnware.yaml") + zip_obj.write(tmp_init_path, "__init__.py") + zip_obj.write(tmp_env_path, "requirements.txt") + rmtree(save_root) + logger.info("New Learnware Saved to %s" % (zip_file_name)) + return zip_file_name + + def prepare_market(self, regenerate_flag=False): + if regenerate_flag: + self._init_text_dataset() + text_market = instantiate_learnware_market(market_id="ae", rebuild=True) + try: + rmtree(learnware_pool_dir) + except: + pass + os.makedirs(learnware_pool_dir, exist_ok=True) + for i in range(n_uploaders): + data_path = os.path.join(uploader_save_root, "uploader_%d_X.pkl" % (i)) + + modelv_path = os.path.join(model_save_root, "uploader_v_%d.pth" % (i)) + modell_path = os.path.join(model_save_root, "uploader_l_%d.pth" % (i)) + + init_file_path = "./example_files/example_init.py" + yaml_file_path = "./example_files/example_yaml.yaml" + env_file_path = "./example_files/requirements.txt" + new_learnware_path = self._prepare_learnware( + data_path, + modelv_path, + modell_path, + init_file_path, + yaml_file_path, + env_file_path, + tmp_dir, + "%s_%d" % (dataset, i), + ) + semantic_spec["Name"]["Values"] = "learnware_%d" % (i) + semantic_spec["Description"]["Values"] = "test_learnware_number_%d" % (i) + text_market.add_learnware(new_learnware_path, semantic_spec) + + logger.info("Total Item: %d" % (len(text_market))) + + def test_unlabeled(self, regenerate_flag=False): + self.prepare_market(regenerate_flag) + text_market = instantiate_learnware_market(market_id="ae") + print("Total Item: %d" % len(text_market)) + + select_list = [] + avg_list = [] + best_list = [] + improve_list = [] + job_selector_score_list = [] + ensemble_score_list = [] + all_learnwares = text_market.get_learnwares() + for i in range(n_users): + user_data_path = os.path.join(user_save_root, "user_%d_X.pkl" % (i)) + user_label_path = os.path.join(user_save_root, "user_%d_y.pkl" % (i)) + with open(user_data_path, "rb") as f: + user_data = pickle.load(f) + with open(user_label_path, "rb") as f: + user_label = pickle.load(f) + + user_stat_spec = specification.RKMETextSpecification() + user_stat_spec.generate_stat_spec_from_data(X=user_data) + user_info = BaseUserInfo(semantic_spec=user_semantic, stat_info={"RKMETextSpecification": user_stat_spec}) + logger.info("Searching Market for user: %d" % (i)) + + search_result = text_market.search_learnware(user_info) + single_result = search_result.get_single_results() + multiple_result = search_result.get_multiple_results() + + print(f"search result of user{i}:") + print( + f"single model num: {len(single_result)}, max_score: {single_result[0].score}, min_score: {single_result[-1].score}" + ) + + acc_list = [] + for idx in range(len(all_learnwares)): + learnware = all_learnwares[idx] + pred_y = learnware.predict(user_data) + acc = eval_prediction(pred_y, user_label) + acc_list.append(acc) + + learnware = single_result[0].learnware + pred_y = learnware.predict(user_data) + best_acc = eval_prediction(pred_y, user_label) + best_list.append(np.max(acc_list)) + select_list.append(best_acc) + avg_list.append(np.mean(acc_list)) + improve_list.append((best_acc - np.mean(acc_list)) / np.mean(acc_list)) + print(f"market mean accuracy: {np.mean(acc_list)}, market best accuracy: {np.max(acc_list)}") + print( + f"Top1-score: {single_result[0].score}, learnware_id: {single_result[0].learnware.id}, acc: {best_acc}" + ) + + if len(multiple_result) > 0: + mixture_id = " ".join([learnware.id for learnware in multiple_result[0].learnwares]) + print(f"mixture_score: {multiple_result[0].score}, mixture_learnware: {mixture_id}") + mixture_learnware_list = multiple_result[0].learnwares + else: + mixture_learnware_list = [single_result[0].learnware] + + # test reuse (job selector) + reuse_baseline = JobSelectorReuser(learnware_list=mixture_learnware_list, herding_num=100) + reuse_predict = reuse_baseline.predict(user_data=user_data) + reuse_score = eval_prediction(reuse_predict, user_label) + job_selector_score_list.append(reuse_score) + print(f"mixture reuse loss(job selector): {reuse_score}") + + # test reuse (ensemble) + reuse_ensemble = AveragingReuser(learnware_list=mixture_learnware_list, mode="vote_by_prob") + ensemble_predict_y = reuse_ensemble.predict(user_data=user_data) + ensemble_score = eval_prediction(ensemble_predict_y, user_label) + ensemble_score_list.append(ensemble_score) + print(f"mixture reuse accuracy (ensemble): {ensemble_score}") + + print("\n") + + logger.info( + "Accuracy of selected learnware: %.3f +/- %.3f, Average performance: %.3f +/- %.3f, Best performance: %.3f +/- %.3f" + % ( + 1 - np.mean(select_list), + np.std(select_list), + 1 - np.mean(avg_list), + np.std(avg_list), + 1 - np.mean(best_list), + np.std(best_list), + ) + ) + logger.info("Average performance improvement: %.3f" % (np.mean(improve_list))) + logger.info( + "Average Job Selector Reuse Performance: %.3f +/- %.3f" + % (1 - np.mean(job_selector_score_list), np.std(job_selector_score_list)) + ) + logger.info( + "Averaging Ensemble Reuse Performance: %.3f +/- %.3f" + % (1 - np.mean(ensemble_score_list), np.std(ensemble_score_list)) + ) + + def test_labeled(self, regenerate_flag=False, train_flag=True): + self.prepare_market(regenerate_flag) + text_market = instantiate_learnware_market(market_id="ae") + print("Total Item: %d" % len(text_market)) + + os.makedirs("./figs", exist_ok=True) + os.makedirs("./curves", exist_ok=True) + + for i in range(n_users): + user_model_score_mat = [] + pruning_score_mat = [] + single_score_mat = [] + if train_flag: + user_data_path = os.path.join(user_save_root, "user_%d_X.pkl" % (i)) + user_label_path = os.path.join(user_save_root, "user_%d_y.pkl" % (i)) + with open(user_data_path, "rb") as f: + test_x = pickle.load(f) + with open(user_label_path, "rb") as f: + test_y = pickle.load(f) + test_y = np.array(test_y) + + train_data_path = os.path.join(uploader_save_root, "uploader_%d_X.pkl" % (i)) + train_label_path = os.path.join(uploader_save_root, "uploader_%d_y.pkl" % (i)) + with open(train_data_path, "rb") as f: + train_x = pickle.load(f) + with open(train_label_path, "rb") as f: + train_y = pickle.load(f) + train_y = np.array(train_y) + + user_stat_spec = specification.RKMETextSpecification() + user_stat_spec.generate_stat_spec_from_data(X=test_x) + user_info = BaseUserInfo( + semantic_spec=user_semantic, stat_info={"RKMETextSpecification": user_stat_spec} + ) + logger.info(f"Searching Market for user_{i}") + + search_result = text_market.search_learnware(user_info) + single_result = search_result.get_single_results() + multiple_result = search_result.get_multiple_results() + + learnware = single_result[0].learnware + pred_y = learnware.predict(test_x) + best_acc = eval_prediction(pred_y, test_y) + + print(f"search result of user_{i}:") + print( + f"single model num: {len(single_result)}, max_score: {single_result[0].score}, min_score: {single_result[-1].score}, single model acc: {best_acc}" + ) + + if len(multiple_result) > 0: + mixture_id = " ".join([learnware.id for learnware in multiple_result[0].learnwares]) + print(f"mixture_score: {multiple_result[0].score}, mixture_learnware: {mixture_id}") + mixture_learnware_list = multiple_result[0].learnwares + else: + mixture_learnware_list = [single_result[0].learnware] + print(len(train_x)) + for n_label, repeated in zip(n_labeled_list, repeated_list): + user_model_score_list, reuse_pruning_score_list = [], [] + if n_label > len(train_x): + break + for _ in range(repeated): + # x_train, y_train = train_x[:n_label], train_y[:n_label] + x_train, y_train = zip(*random.sample(list(zip(train_x, train_y)), k=n_label)) + x_train = list(x_train) + y_train = np.array(list(y_train)) + + modelv, modell = train(x_train, y_train, out_classes=n_classes) + user_model_predict_y = modell.predict(modelv.transform(test_x)) + user_model_score = eval_prediction(user_model_predict_y, test_y) + user_model_score_list.append(user_model_score) + + reuse_pruning = EnsemblePruningReuser( + learnware_list=mixture_learnware_list, mode="classification" + ) + reuse_pruning.fit(x_train, y_train) + reuse_pruning_predict_y = reuse_pruning.predict(user_data=test_x) + reuse_pruning_score = eval_prediction(reuse_pruning_predict_y, test_y) + reuse_pruning_score_list.append(reuse_pruning_score) + + single_score_mat.append([best_acc] * repeated) + user_model_score_mat.append(user_model_score_list) + pruning_score_mat.append(reuse_pruning_score_list) + print(n_label, np.mean(user_model_score_mat[-1]), np.mean(pruning_score_mat[-1])) + + logger.info(f"Saving Curves for User_{i}") + user_curves_data = (single_score_mat, user_model_score_mat, pruning_score_mat) + # np.save("./curves/curve" + str(i), user_curves_data) + with open("./curves/curve" + str(i) + ".pkl", "wb") as f: + pickle.dump(user_curves_data, f) + + with open("./curves/curve" + str(i) + ".pkl", "rb") as f: + user_curves_data = pickle.load(f) + # user_curves_data = np.load("./curves/curve" + str(i) + ".npy") + + self._plot_labeled_peformance_curves("user_" + str(i), user_curves_data) + + def _plot_labeled_peformance_curves(self, name, user_curves_data): + plt.figure(figsize=(10, 6)) + plt.xticks(range(len(n_labeled_list)), n_labeled_list) + + styles = [ + {"color": "orange", "linestyle": "--", "marker": "s"}, + {"color": "navy", "linestyle": "-", "marker": "o"}, + {"color": "magenta", "linestyle": "-.", "marker": "d"}, + ] + + labels = ["Single Learnware Reuse", "User Model", "Multiple Learnware Reuse (EnsemblePrune)"] + + single_mat, user_mat, pruning_mat = user_curves_data + print(single_mat, user_mat, pruning_mat) + for mat, style, label in zip([single_mat, user_mat, pruning_mat], styles, labels): + mean_curve, std_curve = [np.mean(lst) for lst in mat], [np.std(lst) for lst in mat] + mean_curve, std_curve = np.array(mean_curve), np.array(std_curve) + plt.plot(mean_curve, **style, label=label) + plt.fill_between( + range(len(mean_curve)), + mean_curve - 0.5 * std_curve, + mean_curve + 0.5 * std_curve, + color=style["color"], + alpha=0.2, + ) + + plt.xlabel("Labeled Data Size") + plt.ylabel("Accuracy") + plt.title(f"{name} Text Limited Labeled Data") + plt.legend() + plt.tight_layout() + plt.savefig(os.path.join("figs", f"{name}_text_labeled_curves.png"), bbox_inches="tight", dpi=700) + + +if __name__ == "__main__": + fire.Fire(TextDatasetWorkflow) diff --git a/examples/dataset_text_workflow2/utils.py b/examples/dataset_text_workflow2/utils.py new file mode 100644 index 0000000..0e24466 --- /dev/null +++ b/examples/dataset_text_workflow2/utils.py @@ -0,0 +1,145 @@ +import os +import pickle + +import numpy as np +import pandas as pd +from lightgbm import LGBMClassifier, Booster +from sklearn.feature_extraction.text import TfidfVectorizer +from sklearn.model_selection import train_test_split, StratifiedShuffleSplit +from sklearn.naive_bayes import MultinomialNB +from sklearn.metrics import accuracy_score, f1_score + + +class TextDataLoader: + def __init__(self, data_root, train: bool = True): + self.data_root = data_root + self.train = train + + def get_idx_data(self, idx=0): + if self.train: + X_path = os.path.join(self.data_root, "uploader", "uploader_%d_X.pkl" % (idx)) + y_path = os.path.join(self.data_root, "uploader", "uploader_%d_y.pkl" % (idx)) + if not (os.path.exists(X_path) and os.path.exists(y_path)): + raise Exception("Index Error") + with open(X_path, "rb") as f: + X = pickle.load(f) + with open(y_path, "rb") as f: + y = pickle.load(f) + else: + X_path = os.path.join(self.data_root, "user", "user_%d_X.pkl" % (idx)) + y_path = os.path.join(self.data_root, "user", "user_%d_y.pkl" % (idx)) + if not (os.path.exists(X_path) and os.path.exists(y_path)): + raise Exception("Index Error") + with open(X_path, "rb") as f: + X = pickle.load(f) + with open(y_path, "rb") as f: + y = pickle.load(f) + return X, y + + +def generate_uploader(data_x, data_y, n_uploaders=50, data_save_root=None): + if data_save_root is None: + return + os.makedirs(data_save_root, exist_ok=True) + n = len(data_x) + for i in range(n_uploaders): + selected_X = data_x[i * (n // n_uploaders): (i + 1) * (n // n_uploaders)] + selected_y = data_y[i * (n // n_uploaders): (i + 1) * (n // n_uploaders)] + X_save_dir = os.path.join(data_save_root, "uploader_%d_X.pkl" % (i)) + y_save_dir = os.path.join(data_save_root, "uploader_%d_y.pkl" % (i)) + with open(X_save_dir, "wb") as f: + pickle.dump(selected_X, f) + with open(y_save_dir, "wb") as f: + pickle.dump(selected_y, f) + print("Saving to %s" % (X_save_dir)) + + +def generate_user(data_x, data_y, n_users=50, data_save_root=None): + if data_save_root is None: + return + os.makedirs(data_save_root, exist_ok=True) + n = len(data_x) + for i in range(n_users): + selected_X = data_x[i * (n // n_users): (i + 1) * (n // n_users)] + selected_y = data_y[i * (n // n_users): (i + 1) * (n // n_users)] + X_save_dir = os.path.join(data_save_root, "user_%d_X.pkl" % (i)) + y_save_dir = os.path.join(data_save_root, "user_%d_y.pkl" % (i)) + with open(X_save_dir, "wb") as f: + pickle.dump(selected_X, f) + with open(y_save_dir, "wb") as f: + pickle.dump(selected_y, f) + print("Saving to %s" % (X_save_dir)) + +# 分层抽样 +# def generate_uploader(data_x, data_y, n_uploaders=50, data_save_root=None): +# if data_save_root is None: +# return +# os.makedirs(data_save_root, exist_ok=True) +# +# sss = StratifiedShuffleSplit(n_splits=n_uploaders, test_size=1 / n_uploaders, random_state=0) +# +# # 使用 StratifiedShuffleSplit 对象来分割数据 +# i = 0 +# for train_index, test_index in sss.split(data_x, data_y): +# selected_X = [data_x[i] for i in test_index] +# selected_y = data_y[test_index] +# +# X_save_dir = os.path.join(data_save_root, "uploader_%d_X.pkl" % (i)) +# y_save_dir = os.path.join(data_save_root, "uploader_%d_y.pkl" % (i)) +# with open(X_save_dir, "wb") as f: +# pickle.dump(selected_X, f) +# with open(y_save_dir, "wb") as f: +# pickle.dump(selected_y, f) +# +# i += 1 +# print("Saving to %s" % (X_save_dir)) +# +# +# def generate_user(data_x, data_y, n_users=50, data_save_root=None): +# if data_save_root is None: +# return +# os.makedirs(data_save_root, exist_ok=True) +# +# sss = StratifiedShuffleSplit(n_splits=n_users, test_size=1 / n_users, random_state=0) +# +# # 使用 StratifiedShuffleSplit 对象来分割数据 +# i = 0 +# for train_index, test_index in sss.split(data_x, data_y): +# selected_X = [data_x[i] for i in test_index] +# selected_y = data_y[test_index] +# +# X_save_dir = os.path.join(data_save_root, "user_%d_X.pkl" % (i)) +# y_save_dir = os.path.join(data_save_root, "user_%d_y.pkl" % (i)) +# with open(X_save_dir, "wb") as f: +# pickle.dump(selected_X, f) +# with open(y_save_dir, "wb") as f: +# pickle.dump(selected_y, f) +# +# i += 1 +# print("Saving to %s" % (X_save_dir)) + + +# Train Uploaders' models +def train(X, y, out_classes): + vectorizer = TfidfVectorizer(stop_words="english") + X_tfidf = vectorizer.fit_transform(X) + + clf = MultinomialNB(alpha=0.1) + clf.fit(X_tfidf, y) + + return vectorizer, clf + + +def eval_prediction(pred_y, target_y): + if not isinstance(pred_y, np.ndarray): + pred_y = pred_y.detach().cpu().numpy() + if len(pred_y.shape) == 1: + predicted = np.array(pred_y) + else: + predicted = np.argmax(pred_y, 1) + annos = np.array(target_y) + + total = predicted.shape[0] + correct = (predicted == annos).sum().item() + + return correct / total