
[MNT] fix some bugs and optimize `generate_uploader` and `generate_user` for the 20newsgroups benchmark

tags/v0.3.2
Asymptotez 2 years ago
parent
commit
07725b4697
4 changed files with 58 additions and 70 deletions
  1. examples/dataset_text_workflow2/example_files/example_init.py (+2 / -3)
  2. examples/dataset_text_workflow2/get_data.py (+10 / -2)
  3. examples/dataset_text_workflow2/main.py (+12 / -11)
  4. examples/dataset_text_workflow2/utils.py (+34 / -54)

examples/dataset_text_workflow2/example_files/example_init.py (+2 / -3)

@@ -8,7 +8,7 @@ from learnware.model import BaseModel

class Model(BaseModel):
    def __init__(self):
-       super(Model, self).__init__(input_shape=(1,), output_shape=(20,))
+       super(Model, self).__init__(input_shape=(1,), output_shape=(1,))
        dir_path = os.path.dirname(os.path.abspath(__file__))

        modelv_path = os.path.join(dir_path, "modelv.pth")
@@ -23,8 +23,7 @@ class Model(BaseModel):
        pass

    def predict(self, X: np.ndarray) -> np.ndarray:
-       # predict -> predict_proba
-       return self.modell.predict_proba(self.modelv.transform(X))
+       return self.modell.predict(self.modelv.transform(X))

    def finetune(self, X: np.ndarray, y: np.ndarray):
        pass
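Side note (not part of the diff): for a scikit-learn classifier such as the MultinomialNB trained in utils.py, predict returns one label per sample while predict_proba returns a full probability vector, which is what the output_shape change from (20,) to (1,) above reflects. A minimal standalone sketch with made-up toy data:

import numpy as np
from sklearn.naive_bayes import MultinomialNB

# Toy stand-in: 6 samples, 4 non-negative features, 3 classes instead of 20.
X = np.random.rand(6, 4)
y = np.array([0, 1, 2, 0, 1, 2])
clf = MultinomialNB().fit(X, y)

print(clf.predict(X).shape)        # (6,)   -> one label per sample, i.e. output_shape=(1,)
print(clf.predict_proba(X).shape)  # (6, 3) -> one probability per class, i.e. output_shape=(n_classes,)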

examples/dataset_text_workflow2/get_data.py (+10 / -2)

@@ -4,9 +4,17 @@ import numpy as np
from sklearn.datasets import fetch_20newsgroups
import pandas as pd


def get_data(data_root):
-   X_train, y_train = fetch_20newsgroups(data_home=data_root, subset='train', return_X_y=True)
+   dataset_train = fetch_20newsgroups(data_home=data_root, subset='train')
+   target_names = dataset_train["target_names"]
+
+   X_train = np.array(dataset_train["data"])
+   y_train = pd.Categorical.from_codes(dataset_train["target"], categories=target_names)
+
+   # y_train = [target_names[label] for label in dataset_train["target"]]
+
    X_test, y_test = fetch_20newsgroups(data_home=data_root, subset='test', return_X_y=True)
+   X_test = np.array(X_test)
+   y_test = pd.Categorical.from_codes(y_test, categories=target_names)

    return X_train, y_train, X_test, y_test
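Side note (not part of the diff): pd.Categorical keeps the readable 20newsgroups category names on each label while still exposing the integer codes that utils.py later recovers via .codes. A tiny standalone example with invented names:

import pandas as pd

target_names = ["comp.graphics", "rec.autos", "sci.space"]   # illustrative subset
y = pd.Categorical.from_codes([0, 2, 1, 0], categories=target_names)
print(list(y))        # ['comp.graphics', 'sci.space', 'rec.autos', 'comp.graphics']
print(list(y.codes))  # [0, 2, 1, 0]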

examples/dataset_text_workflow2/main.py (+12 / -11)

@@ -26,8 +26,8 @@ processed_data_root = "./data/processed_data"
tmp_dir = "./data/tmp"
learnware_pool_dir = "./data/learnware_pool"
dataset = "20newsgroups"
-n_uploaders = 5
-n_users = 5
+n_uploaders = 10  # max num = 10
+n_users = 5  # max num = 10
n_classes = 20
n_labeled_list = [100, 200, 500, 1000, 2000, 4000, 6000, 8000, 10000]
repeated_list = [10, 10, 10, 3, 3, 3, 3, 3, 3]
@@ -151,7 +151,7 @@ class TextDatasetWorkflow:
    def prepare_market(self, regenerate_flag=False):
        if regenerate_flag:
            self._init_text_dataset()
-       text_market = instantiate_learnware_market(market_id="ae", rebuild=True)
+       text_market = instantiate_learnware_market(market_id=dataset, rebuild=True)
        try:
            rmtree(learnware_pool_dir)
        except:
@@ -184,7 +184,7 @@ class TextDatasetWorkflow:

    def test_unlabeled(self, regenerate_flag=False):
        self.prepare_market(regenerate_flag)
-       text_market = instantiate_learnware_market(market_id="ae")
+       text_market = instantiate_learnware_market(market_id=dataset)
        print("Total Item: %d" % len(text_market))

        select_list = []
@@ -250,7 +250,8 @@ class TextDatasetWorkflow:
print(f"mixture reuse loss(job selector): {reuse_score}")

# test reuse (ensemble)
reuse_ensemble = AveragingReuser(learnware_list=mixture_learnware_list, mode="vote_by_prob")
# be careful with the ensemble mode
reuse_ensemble = AveragingReuser(learnware_list=mixture_learnware_list, mode="vote_by_label")
ensemble_predict_y = reuse_ensemble.predict(user_data=user_data)
ensemble_score = eval_prediction(ensemble_predict_y, user_label)
ensemble_score_list.append(ensemble_score)
@@ -261,27 +262,27 @@ class TextDatasetWorkflow:
        logger.info(
            "Accuracy of selected learnware: %.3f +/- %.3f, Average performance: %.3f +/- %.3f, Best performance: %.3f +/- %.3f"
            % (
-               1 - np.mean(select_list),
+               np.mean(select_list),
                np.std(select_list),
-               1 - np.mean(avg_list),
+               np.mean(avg_list),
                np.std(avg_list),
-               1 - np.mean(best_list),
+               np.mean(best_list),
                np.std(best_list),
            )
        )
        logger.info("Average performance improvement: %.3f" % (np.mean(improve_list)))
        logger.info(
            "Average Job Selector Reuse Performance: %.3f +/- %.3f"
-           % (1 - np.mean(job_selector_score_list), np.std(job_selector_score_list))
+           % (np.mean(job_selector_score_list), np.std(job_selector_score_list))
        )
        logger.info(
            "Averaging Ensemble Reuse Performance: %.3f +/- %.3f"
-           % (1 - np.mean(ensemble_score_list), np.std(ensemble_score_list))
+           % (np.mean(ensemble_score_list), np.std(ensemble_score_list))
        )

    def test_labeled(self, regenerate_flag=False, train_flag=True):
        self.prepare_market(regenerate_flag)
-       text_market = instantiate_learnware_market(market_id="ae")
+       text_market = instantiate_learnware_market(market_id=dataset)
        print("Total Item: %d" % len(text_market))

        os.makedirs("./figs", exist_ok=True)
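Side note (not part of the diff): the `# max num = 10` comments follow from utils.py, which builds one uploader/user task per 3-element combination of the five 20newsgroups super-categories, giving C(5, 3) = 10 possible tasks; the switch to vote_by_label likewise matches example_init.py, where predict now returns class labels rather than probability vectors, so label voting is the consistent AveragingReuser mode. A quick check of the combination count:

from itertools import combinations

super_classes = ["comp", "rec", "sci", "talk", "misc"]
print(len(list(combinations(super_classes, 3))))  # 10 -> upper bound for n_uploaders / n_users
print(len(list(combinations(super_classes, 2))))  # 10 -> same bound for super_classes_select2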


examples/dataset_text_workflow2/utils.py (+34 / -54)

@@ -1,5 +1,7 @@
import os
import pickle
+import random
+from itertools import combinations

import numpy as np
import pandas as pd
@@ -9,6 +11,10 @@ from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, f1_score

+super_classes = ["comp", "rec", "sci", "talk", "misc"]
+super_classes_select2 = list(combinations(super_classes, 2))
+super_classes_select3 = list(combinations(super_classes, 3))
+

class TextDataLoader:
    def __init__(self, data_root, train: bool = True):
@@ -42,26 +48,48 @@ def generate_uploader(data_x, data_y, n_uploaders=50, data_save_root=None):
        return
    os.makedirs(data_save_root, exist_ok=True)
    n = len(data_x)
-   for i in range(n_uploaders):
-       selected_X = data_x[i * (n // n_uploaders): (i + 1) * (n // n_uploaders)]
-       selected_y = data_y[i * (n // n_uploaders): (i + 1) * (n // n_uploaders)]

+   for i, labels in enumerate(super_classes_select3[:n_uploaders]):
+       indices = [idx for idx, label in enumerate(data_y) if label.split('.')[0] in labels]
+       selected_X = data_x[indices]
+       selected_y = data_y[indices].codes

        X_save_dir = os.path.join(data_save_root, "uploader_%d_X.pkl" % (i))
        y_save_dir = os.path.join(data_save_root, "uploader_%d_y.pkl" % (i))

        with open(X_save_dir, "wb") as f:
            pickle.dump(selected_X, f)
        with open(y_save_dir, "wb") as f:
            pickle.dump(selected_y, f)
        print("Saving to %s" % (X_save_dir))
+# Random selection
+# def generate_user(data_x, data_y, n_users=50, data_save_root=None):
+#     if data_save_root is None:
+#         return
+#     os.makedirs(data_save_root, exist_ok=True)
+#     n = len(data_x)
+#     for i in range(n_users):
+#         selected_X = data_x[i * (n // n_users): (i + 1) * (n // n_users)]
+#         selected_y = data_y[i * (n // n_users): (i + 1) * (n // n_users)].codes
+#         X_save_dir = os.path.join(data_save_root, "user_%d_X.pkl" % (i))
+#         y_save_dir = os.path.join(data_save_root, "user_%d_y.pkl" % (i))
+#         with open(X_save_dir, "wb") as f:
+#             pickle.dump(selected_X, f)
+#         with open(y_save_dir, "wb") as f:
+#             pickle.dump(selected_y, f)
+#         print("Saving to %s" % (X_save_dir))

def generate_user(data_x, data_y, n_users=50, data_save_root=None):
    if data_save_root is None:
        return
    os.makedirs(data_save_root, exist_ok=True)
    n = len(data_x)
-   for i in range(n_users):
-       selected_X = data_x[i * (n // n_users): (i + 1) * (n // n_users)]
-       selected_y = data_y[i * (n // n_users): (i + 1) * (n // n_users)]
+   for i, labels in enumerate(super_classes_select3[:n_users]):
+       indices = [idx for idx, label in enumerate(data_y) if label.split('.')[0] in labels]
+       selected_X = data_x[indices]
+       selected_y = data_y[indices].codes

        X_save_dir = os.path.join(data_save_root, "user_%d_X.pkl" % (i))
        y_save_dir = os.path.join(data_save_root, "user_%d_y.pkl" % (i))
        with open(X_save_dir, "wb") as f:
@@ -70,54 +98,6 @@ def generate_user(data_x, data_y, n_users=50, data_save_root=None):
            pickle.dump(selected_y, f)
        print("Saving to %s" % (X_save_dir))

-# Stratified sampling
-# def generate_uploader(data_x, data_y, n_uploaders=50, data_save_root=None):
-#     if data_save_root is None:
-#         return
-#     os.makedirs(data_save_root, exist_ok=True)
-#
-#     sss = StratifiedShuffleSplit(n_splits=n_uploaders, test_size=1 / n_uploaders, random_state=0)
-#
-#     # Use the StratifiedShuffleSplit object to split the data
-#     i = 0
-#     for train_index, test_index in sss.split(data_x, data_y):
-#         selected_X = [data_x[i] for i in test_index]
-#         selected_y = data_y[test_index]
-#
-#         X_save_dir = os.path.join(data_save_root, "uploader_%d_X.pkl" % (i))
-#         y_save_dir = os.path.join(data_save_root, "uploader_%d_y.pkl" % (i))
-#         with open(X_save_dir, "wb") as f:
-#             pickle.dump(selected_X, f)
-#         with open(y_save_dir, "wb") as f:
-#             pickle.dump(selected_y, f)
-#
-#         i += 1
-#         print("Saving to %s" % (X_save_dir))
-#
-#
-# def generate_user(data_x, data_y, n_users=50, data_save_root=None):
-#     if data_save_root is None:
-#         return
-#     os.makedirs(data_save_root, exist_ok=True)
-#
-#     sss = StratifiedShuffleSplit(n_splits=n_users, test_size=1 / n_users, random_state=0)
-#
-#     # Use the StratifiedShuffleSplit object to split the data
-#     i = 0
-#     for train_index, test_index in sss.split(data_x, data_y):
-#         selected_X = [data_x[i] for i in test_index]
-#         selected_y = data_y[test_index]
-#
-#         X_save_dir = os.path.join(data_save_root, "user_%d_X.pkl" % (i))
-#         y_save_dir = os.path.join(data_save_root, "user_%d_y.pkl" % (i))
-#         with open(X_save_dir, "wb") as f:
-#             pickle.dump(selected_X, f)
-#         with open(y_save_dir, "wb") as f:
-#             pickle.dump(selected_y, f)
-#
-#         i += 1
-#         print("Saving to %s" % (X_save_dir))


# Train Uploaders' models
def train(X, y, out_classes):
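Rough usage sketch of the reworked helpers (paths and wiring are assumed here, mirroring get_data.py/main.py rather than quoted from them): each uploader or user now receives every sample whose category prefix, e.g. "rec.autos".split('.')[0] == "rec", falls in one 3-element super-category combination.

from get_data import get_data
from utils import generate_uploader, generate_user

# Assumed data root and save directories, for illustration only.
X_train, y_train, X_test, y_test = get_data("./data")  # y_* are pd.Categorical labels like "rec.autos"
generate_uploader(X_train, y_train, n_uploaders=10, data_save_root="./data/uploader")
generate_user(X_test, y_test, n_users=5, data_save_root="./data/user")
# uploader_i_X.pkl / user_i_X.pkl then hold the samples for the i-th super-category combination.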

