From 07725b4697ca1c5bcc024c8d353cbd0c8fa4c3f9 Mon Sep 17 00:00:00 2001
From: Asymptotez <201220101@smail.nju.edu.cn>
Date: Thu, 7 Dec 2023 11:06:52 +0800
Subject: [PATCH] [MNT] fix some bugs and optimize `generate_uploader` and
 `generate_user` for the 20newsgroups benchmark

---
 .../example_files/example_init.py           |  5 +-
 examples/dataset_text_workflow2/get_data.py | 12 ++-
 examples/dataset_text_workflow2/main.py     | 23 ++---
 examples/dataset_text_workflow2/utils.py    | 88 +++++++------------
 4 files changed, 58 insertions(+), 70 deletions(-)

diff --git a/examples/dataset_text_workflow2/example_files/example_init.py b/examples/dataset_text_workflow2/example_files/example_init.py
index eede4a0..1772a19 100644
--- a/examples/dataset_text_workflow2/example_files/example_init.py
+++ b/examples/dataset_text_workflow2/example_files/example_init.py
@@ -8,7 +8,7 @@ from learnware.model import BaseModel
 
 class Model(BaseModel):
     def __init__(self):
-        super(Model, self).__init__(input_shape=(1,), output_shape=(20,))
+        super(Model, self).__init__(input_shape=(1,), output_shape=(1,))
 
         dir_path = os.path.dirname(os.path.abspath(__file__))
         modelv_path = os.path.join(dir_path, "modelv.pth")
@@ -23,8 +23,7 @@ class Model(BaseModel):
         pass
 
     def predict(self, X: np.ndarray) -> np.ndarray:
-        # predict -> predict_proba
-        return self.modell.predict_proba(self.modelv.transform(X))
+        return self.modell.predict(self.modelv.transform(X))
 
     def finetune(self, X: np.ndarray, y: np.ndarray):
         pass
diff --git a/examples/dataset_text_workflow2/get_data.py b/examples/dataset_text_workflow2/get_data.py
index 36e2161..f2c3891 100644
--- a/examples/dataset_text_workflow2/get_data.py
+++ b/examples/dataset_text_workflow2/get_data.py
@@ -4,9 +4,17 @@ import numpy as np
 from sklearn.datasets import fetch_20newsgroups
 import pandas as pd
 
-
 def get_data(data_root):
-    X_train, y_train = fetch_20newsgroups(data_home=data_root, subset='train', return_X_y=True)
+    dataset_train = fetch_20newsgroups(data_home=data_root, subset='train')
+    target_names = dataset_train["target_names"]
+
+    X_train = np.array(dataset_train["data"])
+    y_train = pd.Categorical.from_codes(dataset_train["target"], categories=target_names)
+
+    # y_train = [target_names[label] for label in dataset_train["target"]]
+
     X_test, y_test = fetch_20newsgroups(data_home=data_root, subset='test', return_X_y=True)
+    X_test = np.array(X_test)
+    y_test = pd.Categorical.from_codes(y_test, categories=target_names)
 
     return X_train, y_train, X_test, y_test
\ No newline at end of file
diff --git a/examples/dataset_text_workflow2/main.py b/examples/dataset_text_workflow2/main.py
index 75951b8..561212a 100644
--- a/examples/dataset_text_workflow2/main.py
+++ b/examples/dataset_text_workflow2/main.py
@@ -26,8 +26,8 @@ processed_data_root = "./data/processed_data"
 tmp_dir = "./data/tmp"
 learnware_pool_dir = "./data/learnware_pool"
 dataset = "20newsgroups"
-n_uploaders = 5
-n_users = 5
+n_uploaders = 10  # max num = 10
+n_users = 5  # max num = 10
 n_classes = 20
 n_labeled_list = [100, 200, 500, 1000, 2000, 4000, 6000, 8000, 10000]
 repeated_list = [10, 10, 10, 3, 3, 3, 3, 3, 3]
@@ -151,7 +151,7 @@ class TextDatasetWorkflow:
     def prepare_market(self, regenerate_flag=False):
         if regenerate_flag:
             self._init_text_dataset()
-        text_market = instantiate_learnware_market(market_id="ae", rebuild=True)
+        text_market = instantiate_learnware_market(market_id=dataset, rebuild=True)
         try:
             rmtree(learnware_pool_dir)
         except:
@@ -184,7 +184,7 @@ class TextDatasetWorkflow:
 
     def test_unlabeled(self, regenerate_flag=False):
         self.prepare_market(regenerate_flag)
-        text_market = instantiate_learnware_market(market_id="ae")
+        text_market = instantiate_learnware_market(market_id=dataset)
         print("Total Item: %d" % len(text_market))
 
         select_list = []
@@ -250,7 +250,8 @@ class TextDatasetWorkflow:
             print(f"mixture reuse loss(job selector): {reuse_score}")
 
             # test reuse (ensemble)
-            reuse_ensemble = AveragingReuser(learnware_list=mixture_learnware_list, mode="vote_by_prob")
+            # be careful with the ensemble mode
+            reuse_ensemble = AveragingReuser(learnware_list=mixture_learnware_list, mode="vote_by_label")
             ensemble_predict_y = reuse_ensemble.predict(user_data=user_data)
             ensemble_score = eval_prediction(ensemble_predict_y, user_label)
             ensemble_score_list.append(ensemble_score)
@@ -261,27 +262,27 @@ class TextDatasetWorkflow:
         logger.info(
             "Accuracy of selected learnware: %.3f +/- %.3f, Average performance: %.3f +/- %.3f, Best performance: %.3f +/- %.3f"
             % (
-                1 - np.mean(select_list),
+                np.mean(select_list),
                 np.std(select_list),
-                1 - np.mean(avg_list),
+                np.mean(avg_list),
                 np.std(avg_list),
-                1 - np.mean(best_list),
+                np.mean(best_list),
                 np.std(best_list),
             )
         )
         logger.info("Average performance improvement: %.3f" % (np.mean(improve_list)))
         logger.info(
             "Average Job Selector Reuse Performance: %.3f +/- %.3f"
-            % (1 - np.mean(job_selector_score_list), np.std(job_selector_score_list))
+            % (np.mean(job_selector_score_list), np.std(job_selector_score_list))
         )
         logger.info(
             "Averaging Ensemble Reuse Performance: %.3f +/- %.3f"
-            % (1 - np.mean(ensemble_score_list), np.std(ensemble_score_list))
+            % (np.mean(ensemble_score_list), np.std(ensemble_score_list))
         )
 
     def test_labeled(self, regenerate_flag=False, train_flag=True):
         self.prepare_market(regenerate_flag)
-        text_market = instantiate_learnware_market(market_id="ae")
+        text_market = instantiate_learnware_market(market_id=dataset)
         print("Total Item: %d" % len(text_market))
 
         os.makedirs("./figs", exist_ok=True)
diff --git a/examples/dataset_text_workflow2/utils.py b/examples/dataset_text_workflow2/utils.py
index 0e24466..5dc95eb 100644
--- a/examples/dataset_text_workflow2/utils.py
+++ b/examples/dataset_text_workflow2/utils.py
@@ -1,5 +1,7 @@
 import os
 import pickle
+import random
+from itertools import combinations
 
 import numpy as np
 import pandas as pd
@@ -9,6 +11,10 @@ from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
 from sklearn.naive_bayes import MultinomialNB
 from sklearn.metrics import accuracy_score, f1_score
 
+super_classes = ["comp", "rec", "sci", "talk", "misc"]
+super_classes_select2 = list(combinations(super_classes, 2))
+super_classes_select3 = list(combinations(super_classes, 3))
+
 
 class TextDataLoader:
     def __init__(self, data_root, train: bool = True):
@@ -42,26 +48,48 @@ def generate_uploader(data_x, data_y, n_uploaders=50, data_save_root=None):
         return
     os.makedirs(data_save_root, exist_ok=True)
     n = len(data_x)
-    for i in range(n_uploaders):
-        selected_X = data_x[i * (n // n_uploaders): (i + 1) * (n // n_uploaders)]
-        selected_y = data_y[i * (n // n_uploaders): (i + 1) * (n // n_uploaders)]
+
+    for i, labels in enumerate(super_classes_select3[:n_uploaders]):
+        indices = [idx for idx, label in enumerate(data_y) if label.split('.')[0] in labels]
+        selected_X = data_x[indices]
+        selected_y = data_y[indices].codes
+
         X_save_dir = os.path.join(data_save_root, "uploader_%d_X.pkl" % (i))
         y_save_dir = os.path.join(data_save_root, "uploader_%d_y.pkl" % (i))
+
         with open(X_save_dir, "wb") as f:
             pickle.dump(selected_X, f)
         with open(y_save_dir, "wb") as f:
             pickle.dump(selected_y, f)
         print("Saving to %s" % (X_save_dir))
 
+# Random selection
+# def generate_user(data_x, data_y, n_users=50, data_save_root=None):
+#     if data_save_root is None:
+#         return
+#     os.makedirs(data_save_root, exist_ok=True)
+#     n = len(data_x)
+#     for i in range(n_users):
+#         selected_X = data_x[i * (n // n_users): (i + 1) * (n // n_users)]
+#         selected_y = data_y[i * (n // n_users): (i + 1) * (n // n_users)].codes
+#         X_save_dir = os.path.join(data_save_root, "user_%d_X.pkl" % (i))
+#         y_save_dir = os.path.join(data_save_root, "user_%d_y.pkl" % (i))
+#         with open(X_save_dir, "wb") as f:
+#             pickle.dump(selected_X, f)
+#         with open(y_save_dir, "wb") as f:
+#             pickle.dump(selected_y, f)
+#         print("Saving to %s" % (X_save_dir))
 
 def generate_user(data_x, data_y, n_users=50, data_save_root=None):
     if data_save_root is None:
         return
     os.makedirs(data_save_root, exist_ok=True)
     n = len(data_x)
-    for i in range(n_users):
-        selected_X = data_x[i * (n // n_users): (i + 1) * (n // n_users)]
-        selected_y = data_y[i * (n // n_users): (i + 1) * (n // n_users)]
+    for i, labels in enumerate(super_classes_select3[:n_users]):
+        indices = [idx for idx, label in enumerate(data_y) if label.split('.')[0] in labels]
+        selected_X = data_x[indices]
+        selected_y = data_y[indices].codes
+
         X_save_dir = os.path.join(data_save_root, "user_%d_X.pkl" % (i))
         y_save_dir = os.path.join(data_save_root, "user_%d_y.pkl" % (i))
         with open(X_save_dir, "wb") as f:
@@ -70,54 +98,6 @@ def generate_user(data_x, data_y, n_users=50, data_save_root=None):
             pickle.dump(selected_y, f)
         print("Saving to %s" % (X_save_dir))
 
-# Stratified sampling
-# def generate_uploader(data_x, data_y, n_uploaders=50, data_save_root=None):
-#     if data_save_root is None:
-#         return
-#     os.makedirs(data_save_root, exist_ok=True)
-#
-#     sss = StratifiedShuffleSplit(n_splits=n_uploaders, test_size=1 / n_uploaders, random_state=0)
-#
-#     # Use the StratifiedShuffleSplit object to split the data
-#     i = 0
-#     for train_index, test_index in sss.split(data_x, data_y):
-#         selected_X = [data_x[i] for i in test_index]
-#         selected_y = data_y[test_index]
-#
-#         X_save_dir = os.path.join(data_save_root, "uploader_%d_X.pkl" % (i))
-#         y_save_dir = os.path.join(data_save_root, "uploader_%d_y.pkl" % (i))
-#         with open(X_save_dir, "wb") as f:
-#             pickle.dump(selected_X, f)
-#         with open(y_save_dir, "wb") as f:
-#             pickle.dump(selected_y, f)
-#
-#         i += 1
-#         print("Saving to %s" % (X_save_dir))
-#
-#
-# def generate_user(data_x, data_y, n_users=50, data_save_root=None):
-#     if data_save_root is None:
-#         return
-#     os.makedirs(data_save_root, exist_ok=True)
-#
-#     sss = StratifiedShuffleSplit(n_splits=n_users, test_size=1 / n_users, random_state=0)
-#
-#     # Use the StratifiedShuffleSplit object to split the data
-#     i = 0
-#     for train_index, test_index in sss.split(data_x, data_y):
-#         selected_X = [data_x[i] for i in test_index]
-#         selected_y = data_y[test_index]
-#
-#         X_save_dir = os.path.join(data_save_root, "user_%d_X.pkl" % (i))
-#         y_save_dir = os.path.join(data_save_root, "user_%d_y.pkl" % (i))
-#         with open(X_save_dir, "wb") as f:
-#             pickle.dump(selected_X, f)
-#         with open(y_save_dir, "wb") as f:
-#             pickle.dump(selected_y, f)
-#
-#         i += 1
-#         print("Saving to %s" % (X_save_dir))
-
 
 # Train Uploaders' models
 def train(X, y, out_classes):
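
Note on the partitioning scheme: `super_classes_select3` enumerates all
C(5, 3) = 10 triples of the five superclasses, which is why the
"max num = 10" comments cap `n_uploaders` and `n_users` at 10. Below is a
minimal standalone sketch of the scheme, not part of the patch itself; the
`labels` values are illustrative 20newsgroups category names, and everything
else mirrors the patched utils.py:

    from itertools import combinations

    super_classes = ["comp", "rec", "sci", "talk", "misc"]
    # C(5, 3) = 10 triples -> at most 10 uploaders/users can be generated
    super_classes_select3 = list(combinations(super_classes, 3))
    print(len(super_classes_select3))  # 10

    # Each uploader/user i keeps only the samples whose superclass
    # (the prefix before the first '.') belongs to its assigned triple.
    labels = ["comp.graphics", "rec.autos", "sci.med", "talk.politics.guns"]
    for i, triple in enumerate(super_classes_select3[:2]):
        kept = [label for label in labels if label.split('.')[0] in triple]
        print(i, triple, kept)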