Browse Source

[MNT] add param search for original text benchmark and add a new text benchmark: 20newsgroups

tags/v0.3.2
Asymptotez 2 years ago
parent
commit
f281e3efed
9 changed files with 757 additions and 6 deletions
  1. +2
    -2
      examples/dataset_text_workflow/main.py
  2. +115
    -0
      examples/dataset_text_workflow/param_search.py
  3. +29
    -4
      examples/dataset_text_workflow/utils.py
  4. +30
    -0
      examples/dataset_text_workflow2/example_files/example_init.py
  5. +8
    -0
      examples/dataset_text_workflow2/example_files/example_yaml.yaml
  6. +3
    -0
      examples/dataset_text_workflow2/example_files/requirements.txt
  7. +12
    -0
      examples/dataset_text_workflow2/get_data.py
  8. +413
    -0
      examples/dataset_text_workflow2/main.py
  9. +145
    -0
      examples/dataset_text_workflow2/utils.py

+ 2
- 2
examples/dataset_text_workflow/main.py View File

@@ -402,10 +402,10 @@ class TextDatasetWorkflow:

plt.xlabel("Labeled Data Size")
plt.ylabel("Accuracy")
plt.title(f"User{name} Text Limited Labeled Data")
plt.title(f"{name} Text Limited Labeled Data")
plt.legend()
plt.tight_layout()
plt.savefig(os.path.join("figs", f"User{name}_text_labeled_curves.png"), bbox_inches="tight", dpi=700)
plt.savefig(os.path.join("figs", f"{name}_text_labeled_curves.png"), bbox_inches="tight", dpi=700)


if __name__ == "__main__":


+ 115
- 0
examples/dataset_text_workflow/param_search.py View File

@@ -0,0 +1,115 @@
import json
import pickle

import joblib
import numpy as np
import os, warnings
import lightgbm as lgb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.model_selection import train_test_split
from hyperopt import fmin, tpe, hp, Trials, space_eval, STATUS_OK

warnings.filterwarnings("ignore")


def train_lgb(params, train_x, val_x, train_y, val_y, num_class, classification, model_path, save=False):
train_data = lgb.Dataset(train_x, label=train_y)
val_data = lgb.Dataset(val_x, label=val_y)

if classification:
lgb_params = {
**params,
# "boosting": "dart",
"learning_rate": 0.1,
"importance_type": "gain",
# "class_weight": 'balanced',
"objective": "multiclass",
"num_class": num_class,
"n_estimators": 1000,
"early_stopping_round": 30,
'max_bin': 512,
"verbose": -1,
}
else:
lgb_params = {
**params,
# "boosting": "dart",
"learning_rate": 0.1,
"importance_type": "gain",
"objective": "regression",
"n_estimators": 1000,
"early_stopping_round": 30,
'max_bin': 512,
"verbose": -1,
}
gbm = lgb.train(lgb_params, train_data, valid_sets=[val_data])
pred_val_y = gbm.predict(val_x, num_iteration=gbm.best_iteration)
if classification:
pred_val_y = np.argmax(pred_val_y, 1)
res = 1 - accuracy_score(val_y, pred_val_y)
else:
res = mean_squared_error(val_y, pred_val_y, squared=False)

pred_train_y = gbm.predict(train_x, num_iteration=gbm.best_iteration)
if classification:
pred_train_y = np.argmax(pred_train_y, 1)
res_train = 1 - accuracy_score(train_y, pred_train_y)
else:
res_train = mean_squared_error(train_y, pred_train_y, squared=False)

# print(params, res)
if save:
# gbm.save_model(model_path)
# with open(model_path, 'wb') as f:
# pickle.dump(gbm, f)
print(params, res_train, res)
return {'loss': res, 'status': STATUS_OK, 'param': params}


def grid_search(train_x, val_x, train_y, val_y, num_class, classification, model_path):
def objective(params):
return train_lgb(params, train_x, val_x, train_y, val_y, num_class, classification, model_path, save=False)

if classification:
space = {
'max_depth': hp.choice('max_depth', [3, 4, 5]),
'num_leaves': hp.choice('num_leaves', [5, 6, 7, 12, 13, 14, 15, 28, 29, 30, 31]),
'subsample': hp.choice('subsample', [0.6, 0.8, 0.9, 1.0]),
'colsample_bytree': hp.choice('colsample_bytree', [0.6, 0.8, 0.9, 1.0]),
'reg_alpha': hp.loguniform('reg_alpha', np.log(0.01), np.log(1000)),
'reg_lambda': hp.loguniform('reg_lambda', np.log(0.01), np.log(1000)),
'min_data_in_leaf': hp.choice('min_data_in_leaf', [5, 20, 50]),
}
else:
space = {
'max_depth': hp.choice('max_depth', [3, 4, 5]),
'num_leaves': hp.choice('num_leaves', [5, 6, 7, 12, 13, 14, 15, 28, 29, 30, 31]),
'subsample': hp.choice('subsample', [0.6, 0.8, 0.9, 1.0]),
'colsample_bytree': hp.choice('colsample_bytree', [0.6, 0.8, 0.9, 1.0]),
'reg_alpha': hp.loguniform('reg_alpha', np.log(0.01), np.log(1000)),
'reg_lambda': hp.loguniform('reg_lambda', np.log(0.01), np.log(1000))
}
trials = Trials()
best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=30, trials=trials)
best_params = trials.best_trial['result']['param']
res = train_lgb(best_params, train_x, val_x, train_y, val_y, num_class, classification, model_path, save=True)[
'param']
return res


if __name__ == "__main__":
X_path = os.path.join("data/processed_data/ae/uploader/uploader_0_X.pkl")
y_path = os.path.join("data/processed_data/ae/uploader/uploader_0_y.pkl")
with open(X_path, "rb") as f:
X = pickle.load(f)
with open(y_path, "rb") as f:
y = pickle.load(f)

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

vectorizer = TfidfVectorizer(stop_words="english")
X_trian_tfidf = vectorizer.fit_transform(X_train)
X_valid_tfidf = vectorizer.transform(X_valid)

res = grid_search(X_trian_tfidf, X_valid_tfidf, y_train, y_valid, 3, True, "models/model.txt")

+ 29
- 4
examples/dataset_text_workflow/utils.py View File

@@ -3,8 +3,9 @@ import pickle

import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from lightgbm import LGBMClassifier, Booster
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split


class TextDataLoader:
@@ -43,7 +44,7 @@ def generate_uploader(data_x: pd.Series, data_y: pd.Series, n_uploaders=50, data

for i in range(n_uploaders):
indices = data_x["discourse_type"] == types[i]
selected_X = data_x[indices]["discourse_text"].to_list()
selected_X = (types[i] + ' ' + data_x[indices]["discourse_text"]).to_list()
selected_y = data_y[indices].to_list()

X_save_dir = os.path.join(data_save_root, "uploader_%d_X.pkl" % (i))
@@ -65,7 +66,7 @@ def generate_user(data_x, data_y, n_users=50, data_save_root=None):

for i in range(n_users):
indices = data_x["discourse_type"] == types[i]
selected_X = data_x[indices]["discourse_text"].to_list()
selected_X = (types[i] + ' ' + data_x[indices]["discourse_text"]).to_list()
selected_y = data_y[indices].to_list()

X_save_dir = os.path.join(data_save_root, "user_%d_X.pkl" % (i))
@@ -78,12 +79,36 @@ def generate_user(data_x, data_y, n_users=50, data_save_root=None):
print("Saving to %s" % (X_save_dir))


from param_search import grid_search


# Train Uploaders' models
def train(X, y, out_classes):
vectorizer = TfidfVectorizer(stop_words="english")
X_tfidf = vectorizer.fit_transform(X)

lgbm = LGBMClassifier(boosting_type="dart", n_estimators=500, num_leaves=21, verbosity=-1)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

X_trian_tfidf = vectorizer.transform(X_train)
X_valid_tfidf = vectorizer.transform(X_valid)

model_path = "models/model.pkl"
best_params = grid_search(X_trian_tfidf, X_valid_tfidf, y_train, y_valid, out_classes, True, model_path)

# lgbm = Booster(model_file="models/model.txt")
# with open(model_path, "rb") as f:
# lgbm = pickle.load(f)

param = {
"learning_rate": 0.1,
"importance_type": "gain",
"objective": "multiclass",
"num_class": out_classes,
"n_estimators": 1000,
'max_bin': 512,
"verbose": -1,
**best_params}
lgbm = LGBMClassifier(**param)
lgbm.fit(X_tfidf, y)

return vectorizer, lgbm


+ 30
- 0
examples/dataset_text_workflow2/example_files/example_init.py View File

@@ -0,0 +1,30 @@
import os
import pickle

import numpy as np

from learnware.model import BaseModel


class Model(BaseModel):
def __init__(self):
super(Model, self).__init__(input_shape=(1,), output_shape=(20,))
dir_path = os.path.dirname(os.path.abspath(__file__))

modelv_path = os.path.join(dir_path, "modelv.pth")
with open(modelv_path, "rb") as f:
self.modelv = pickle.load(f)

modell_path = os.path.join(dir_path, "modell.pth")
with open(modell_path, "rb") as f:
self.modell = pickle.load(f)

def fit(self, X: np.ndarray, y: np.ndarray):
pass

def predict(self, X: np.ndarray) -> np.ndarray:
# predict -> predict_proba
return self.modell.predict_proba(self.modelv.transform(X))

def finetune(self, X: np.ndarray, y: np.ndarray):
pass

+ 8
- 0
examples/dataset_text_workflow2/example_files/example_yaml.yaml View File

@@ -0,0 +1,8 @@
model:
class_name: Model
kwargs: { }
stat_specifications:
- module_path: learnware.specification
class_name: RKMETextSpecification
file_name: rkme.json
kwargs: { }

+ 3
- 0
examples/dataset_text_workflow2/example_files/requirements.txt View File

@@ -0,0 +1,3 @@
numpy
pickle
scikit-learn

+ 12
- 0
examples/dataset_text_workflow2/get_data.py View File

@@ -0,0 +1,12 @@
import os
import json
import numpy as np
from sklearn.datasets import fetch_20newsgroups
import pandas as pd


def get_data(data_root):
X_train, y_train = fetch_20newsgroups(data_home=data_root, subset='train', return_X_y=True)
X_test, y_test = fetch_20newsgroups(data_home=data_root, subset='test', return_X_y=True)

return X_train, y_train, X_test, y_test

+ 413
- 0
examples/dataset_text_workflow2/main.py View File

@@ -0,0 +1,413 @@
import os
import fire
import pickle
import time
import zipfile
from shutil import copyfile, rmtree
import random

import numpy as np

import learnware.specification as specification
from get_data import get_data
from learnware.logger import get_module_logger
from learnware.market import instantiate_learnware_market, BaseUserInfo
from learnware.reuse import JobSelectorReuser, AveragingReuser, EnsemblePruningReuser, FeatureAugmentReuser
from utils import generate_uploader, generate_user, TextDataLoader, train, eval_prediction
from learnware.client import LearnwareClient, SemanticSpecificationKey
import matplotlib.pyplot as plt

# Login to Beiming system
client = LearnwareClient()

logger = get_module_logger("text_workflow", level="INFO")
origin_data_root = "./data/origin_data"
processed_data_root = "./data/processed_data"
tmp_dir = "./data/tmp"
learnware_pool_dir = "./data/learnware_pool"
dataset = "20newsgroups"
n_uploaders = 5
n_users = 5
n_classes = 20
n_labeled_list = [100, 200, 500, 1000, 2000, 4000, 6000, 8000, 10000]
repeated_list = [10, 10, 10, 3, 3, 3, 3, 3, 3]

data_root = os.path.join(origin_data_root, dataset)
data_save_root = os.path.join(processed_data_root, dataset)
user_save_root = os.path.join(data_save_root, "user")
uploader_save_root = os.path.join(data_save_root, "uploader")
model_save_root = os.path.join(data_save_root, "uploader_model")
os.makedirs(data_root, exist_ok=True)
os.makedirs(user_save_root, exist_ok=True)
os.makedirs(uploader_save_root, exist_ok=True)
os.makedirs(model_save_root, exist_ok=True)

output_description = {
"Dimension": 20,
"Description": {"0": "0", "1": "1", "2": "2", "3": "3", "4": "4", "5": "5", "6": "6",
"7": "7", "8": "8", "9": "9", "10": "10", "11": "11", "12": "12", "13": "13",
"14": "14", "15": "15", "16": "16", "17": "17", "18": "18", "19": "19"}
}

semantic_spec = client.create_semantic_specification(
name="learnware_example",
description="Just a example for text learnware",
data_type="Text",
task_type="Classification",
library_type="Scikit-learn",
scenarios=["Education"],
license="MIT",
input_description=None,
output_description=output_description,
)

user_semantic = client.create_semantic_specification(
# name="learnware_example",
description="Just a example for text learnware",
data_type="Text",
task_type="Classification",
library_type="Scikit-learn",
scenarios=["Education"],
license="MIT",
input_description=None,
output_description=output_description,
)


class TextDatasetWorkflow:
def _init_text_dataset(self):
self._prepare_data()
self._prepare_model()

def _prepare_data(self):
X_train, y_train, X_test, y_test = get_data(data_root)

generate_uploader(X_train, y_train, n_uploaders=n_uploaders, data_save_root=uploader_save_root)
generate_user(X_test, y_test, n_users=n_users, data_save_root=user_save_root)

def _prepare_model(self):
dataloader = TextDataLoader(data_save_root, train=True)
for i in range(n_uploaders):
logger.info("Train on uploader: %d" % (i))
X, y = dataloader.get_idx_data(i)
vectorizer, clf = train(X, y, out_classes=n_classes)

modelv_save_path = os.path.join(model_save_root, "uploader_v_%d.pth" % (i))
modell_save_path = os.path.join(model_save_root, "uploader_l_%d.pth" % (i))

with open(modelv_save_path, "wb") as f:
pickle.dump(vectorizer, f)

with open(modell_save_path, "wb") as f:
pickle.dump(clf, f)

logger.info("Model saved to '%s' and '%s'" % (modelv_save_path, modell_save_path))

def _prepare_learnware(
self, data_path, modelv_path, modell_path, init_file_path, yaml_path, env_file_path, save_root, zip_name
):
os.makedirs(save_root, exist_ok=True)
tmp_spec_path = os.path.join(save_root, "rkme.json")

tmp_modelv_path = os.path.join(save_root, "modelv.pth")
tmp_modell_path = os.path.join(save_root, "modell.pth")

tmp_yaml_path = os.path.join(save_root, "learnware.yaml")
tmp_init_path = os.path.join(save_root, "__init__.py")
tmp_env_path = os.path.join(save_root, "requirements.txt")

with open(data_path, "rb") as f:
X = pickle.load(f)

st = time.time()

user_spec = specification.RKMETextSpecification()

user_spec.generate_stat_spec_from_data(X=X)
ed = time.time()
logger.info("Stat spec generated in %.3f s" % (ed - st))
user_spec.save(tmp_spec_path)

copyfile(modelv_path, tmp_modelv_path)
copyfile(modell_path, tmp_modell_path)

copyfile(yaml_path, tmp_yaml_path)
copyfile(init_file_path, tmp_init_path)
copyfile(env_file_path, tmp_env_path)
zip_file_name = os.path.join(learnware_pool_dir, "%s.zip" % (zip_name))
with zipfile.ZipFile(zip_file_name, "w", compression=zipfile.ZIP_DEFLATED) as zip_obj:
zip_obj.write(tmp_spec_path, "rkme.json")

zip_obj.write(tmp_modelv_path, "modelv.pth")
zip_obj.write(tmp_modell_path, "modell.pth")

zip_obj.write(tmp_yaml_path, "learnware.yaml")
zip_obj.write(tmp_init_path, "__init__.py")
zip_obj.write(tmp_env_path, "requirements.txt")
rmtree(save_root)
logger.info("New Learnware Saved to %s" % (zip_file_name))
return zip_file_name

def prepare_market(self, regenerate_flag=False):
if regenerate_flag:
self._init_text_dataset()
text_market = instantiate_learnware_market(market_id="ae", rebuild=True)
try:
rmtree(learnware_pool_dir)
except:
pass
os.makedirs(learnware_pool_dir, exist_ok=True)
for i in range(n_uploaders):
data_path = os.path.join(uploader_save_root, "uploader_%d_X.pkl" % (i))

modelv_path = os.path.join(model_save_root, "uploader_v_%d.pth" % (i))
modell_path = os.path.join(model_save_root, "uploader_l_%d.pth" % (i))

init_file_path = "./example_files/example_init.py"
yaml_file_path = "./example_files/example_yaml.yaml"
env_file_path = "./example_files/requirements.txt"
new_learnware_path = self._prepare_learnware(
data_path,
modelv_path,
modell_path,
init_file_path,
yaml_file_path,
env_file_path,
tmp_dir,
"%s_%d" % (dataset, i),
)
semantic_spec["Name"]["Values"] = "learnware_%d" % (i)
semantic_spec["Description"]["Values"] = "test_learnware_number_%d" % (i)
text_market.add_learnware(new_learnware_path, semantic_spec)

logger.info("Total Item: %d" % (len(text_market)))

def test_unlabeled(self, regenerate_flag=False):
self.prepare_market(regenerate_flag)
text_market = instantiate_learnware_market(market_id="ae")
print("Total Item: %d" % len(text_market))

select_list = []
avg_list = []
best_list = []
improve_list = []
job_selector_score_list = []
ensemble_score_list = []
all_learnwares = text_market.get_learnwares()
for i in range(n_users):
user_data_path = os.path.join(user_save_root, "user_%d_X.pkl" % (i))
user_label_path = os.path.join(user_save_root, "user_%d_y.pkl" % (i))
with open(user_data_path, "rb") as f:
user_data = pickle.load(f)
with open(user_label_path, "rb") as f:
user_label = pickle.load(f)

user_stat_spec = specification.RKMETextSpecification()
user_stat_spec.generate_stat_spec_from_data(X=user_data)
user_info = BaseUserInfo(semantic_spec=user_semantic, stat_info={"RKMETextSpecification": user_stat_spec})
logger.info("Searching Market for user: %d" % (i))

search_result = text_market.search_learnware(user_info)
single_result = search_result.get_single_results()
multiple_result = search_result.get_multiple_results()

print(f"search result of user{i}:")
print(
f"single model num: {len(single_result)}, max_score: {single_result[0].score}, min_score: {single_result[-1].score}"
)

acc_list = []
for idx in range(len(all_learnwares)):
learnware = all_learnwares[idx]
pred_y = learnware.predict(user_data)
acc = eval_prediction(pred_y, user_label)
acc_list.append(acc)

learnware = single_result[0].learnware
pred_y = learnware.predict(user_data)
best_acc = eval_prediction(pred_y, user_label)
best_list.append(np.max(acc_list))
select_list.append(best_acc)
avg_list.append(np.mean(acc_list))
improve_list.append((best_acc - np.mean(acc_list)) / np.mean(acc_list))
print(f"market mean accuracy: {np.mean(acc_list)}, market best accuracy: {np.max(acc_list)}")
print(
f"Top1-score: {single_result[0].score}, learnware_id: {single_result[0].learnware.id}, acc: {best_acc}"
)

if len(multiple_result) > 0:
mixture_id = " ".join([learnware.id for learnware in multiple_result[0].learnwares])
print(f"mixture_score: {multiple_result[0].score}, mixture_learnware: {mixture_id}")
mixture_learnware_list = multiple_result[0].learnwares
else:
mixture_learnware_list = [single_result[0].learnware]

# test reuse (job selector)
reuse_baseline = JobSelectorReuser(learnware_list=mixture_learnware_list, herding_num=100)
reuse_predict = reuse_baseline.predict(user_data=user_data)
reuse_score = eval_prediction(reuse_predict, user_label)
job_selector_score_list.append(reuse_score)
print(f"mixture reuse loss(job selector): {reuse_score}")

# test reuse (ensemble)
reuse_ensemble = AveragingReuser(learnware_list=mixture_learnware_list, mode="vote_by_prob")
ensemble_predict_y = reuse_ensemble.predict(user_data=user_data)
ensemble_score = eval_prediction(ensemble_predict_y, user_label)
ensemble_score_list.append(ensemble_score)
print(f"mixture reuse accuracy (ensemble): {ensemble_score}")

print("\n")

logger.info(
"Accuracy of selected learnware: %.3f +/- %.3f, Average performance: %.3f +/- %.3f, Best performance: %.3f +/- %.3f"
% (
1 - np.mean(select_list),
np.std(select_list),
1 - np.mean(avg_list),
np.std(avg_list),
1 - np.mean(best_list),
np.std(best_list),
)
)
logger.info("Average performance improvement: %.3f" % (np.mean(improve_list)))
logger.info(
"Average Job Selector Reuse Performance: %.3f +/- %.3f"
% (1 - np.mean(job_selector_score_list), np.std(job_selector_score_list))
)
logger.info(
"Averaging Ensemble Reuse Performance: %.3f +/- %.3f"
% (1 - np.mean(ensemble_score_list), np.std(ensemble_score_list))
)

def test_labeled(self, regenerate_flag=False, train_flag=True):
self.prepare_market(regenerate_flag)
text_market = instantiate_learnware_market(market_id="ae")
print("Total Item: %d" % len(text_market))

os.makedirs("./figs", exist_ok=True)
os.makedirs("./curves", exist_ok=True)

for i in range(n_users):
user_model_score_mat = []
pruning_score_mat = []
single_score_mat = []
if train_flag:
user_data_path = os.path.join(user_save_root, "user_%d_X.pkl" % (i))
user_label_path = os.path.join(user_save_root, "user_%d_y.pkl" % (i))
with open(user_data_path, "rb") as f:
test_x = pickle.load(f)
with open(user_label_path, "rb") as f:
test_y = pickle.load(f)
test_y = np.array(test_y)

train_data_path = os.path.join(uploader_save_root, "uploader_%d_X.pkl" % (i))
train_label_path = os.path.join(uploader_save_root, "uploader_%d_y.pkl" % (i))
with open(train_data_path, "rb") as f:
train_x = pickle.load(f)
with open(train_label_path, "rb") as f:
train_y = pickle.load(f)
train_y = np.array(train_y)

user_stat_spec = specification.RKMETextSpecification()
user_stat_spec.generate_stat_spec_from_data(X=test_x)
user_info = BaseUserInfo(
semantic_spec=user_semantic, stat_info={"RKMETextSpecification": user_stat_spec}
)
logger.info(f"Searching Market for user_{i}")

search_result = text_market.search_learnware(user_info)
single_result = search_result.get_single_results()
multiple_result = search_result.get_multiple_results()

learnware = single_result[0].learnware
pred_y = learnware.predict(test_x)
best_acc = eval_prediction(pred_y, test_y)

print(f"search result of user_{i}:")
print(
f"single model num: {len(single_result)}, max_score: {single_result[0].score}, min_score: {single_result[-1].score}, single model acc: {best_acc}"
)

if len(multiple_result) > 0:
mixture_id = " ".join([learnware.id for learnware in multiple_result[0].learnwares])
print(f"mixture_score: {multiple_result[0].score}, mixture_learnware: {mixture_id}")
mixture_learnware_list = multiple_result[0].learnwares
else:
mixture_learnware_list = [single_result[0].learnware]
print(len(train_x))
for n_label, repeated in zip(n_labeled_list, repeated_list):
user_model_score_list, reuse_pruning_score_list = [], []
if n_label > len(train_x):
break
for _ in range(repeated):
# x_train, y_train = train_x[:n_label], train_y[:n_label]
x_train, y_train = zip(*random.sample(list(zip(train_x, train_y)), k=n_label))
x_train = list(x_train)
y_train = np.array(list(y_train))

modelv, modell = train(x_train, y_train, out_classes=n_classes)
user_model_predict_y = modell.predict(modelv.transform(test_x))
user_model_score = eval_prediction(user_model_predict_y, test_y)
user_model_score_list.append(user_model_score)

reuse_pruning = EnsemblePruningReuser(
learnware_list=mixture_learnware_list, mode="classification"
)
reuse_pruning.fit(x_train, y_train)
reuse_pruning_predict_y = reuse_pruning.predict(user_data=test_x)
reuse_pruning_score = eval_prediction(reuse_pruning_predict_y, test_y)
reuse_pruning_score_list.append(reuse_pruning_score)

single_score_mat.append([best_acc] * repeated)
user_model_score_mat.append(user_model_score_list)
pruning_score_mat.append(reuse_pruning_score_list)
print(n_label, np.mean(user_model_score_mat[-1]), np.mean(pruning_score_mat[-1]))

logger.info(f"Saving Curves for User_{i}")
user_curves_data = (single_score_mat, user_model_score_mat, pruning_score_mat)
# np.save("./curves/curve" + str(i), user_curves_data)
with open("./curves/curve" + str(i) + ".pkl", "wb") as f:
pickle.dump(user_curves_data, f)

with open("./curves/curve" + str(i) + ".pkl", "rb") as f:
user_curves_data = pickle.load(f)
# user_curves_data = np.load("./curves/curve" + str(i) + ".npy")

self._plot_labeled_peformance_curves("user_" + str(i), user_curves_data)

def _plot_labeled_peformance_curves(self, name, user_curves_data):
plt.figure(figsize=(10, 6))
plt.xticks(range(len(n_labeled_list)), n_labeled_list)

styles = [
{"color": "orange", "linestyle": "--", "marker": "s"},
{"color": "navy", "linestyle": "-", "marker": "o"},
{"color": "magenta", "linestyle": "-.", "marker": "d"},
]

labels = ["Single Learnware Reuse", "User Model", "Multiple Learnware Reuse (EnsemblePrune)"]

single_mat, user_mat, pruning_mat = user_curves_data
print(single_mat, user_mat, pruning_mat)
for mat, style, label in zip([single_mat, user_mat, pruning_mat], styles, labels):
mean_curve, std_curve = [np.mean(lst) for lst in mat], [np.std(lst) for lst in mat]
mean_curve, std_curve = np.array(mean_curve), np.array(std_curve)
plt.plot(mean_curve, **style, label=label)
plt.fill_between(
range(len(mean_curve)),
mean_curve - 0.5 * std_curve,
mean_curve + 0.5 * std_curve,
color=style["color"],
alpha=0.2,
)

plt.xlabel("Labeled Data Size")
plt.ylabel("Accuracy")
plt.title(f"{name} Text Limited Labeled Data")
plt.legend()
plt.tight_layout()
plt.savefig(os.path.join("figs", f"{name}_text_labeled_curves.png"), bbox_inches="tight", dpi=700)


if __name__ == "__main__":
fire.Fire(TextDatasetWorkflow)

+ 145
- 0
examples/dataset_text_workflow2/utils.py View File

@@ -0,0 +1,145 @@
import os
import pickle

import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier, Booster
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, f1_score


class TextDataLoader:
def __init__(self, data_root, train: bool = True):
self.data_root = data_root
self.train = train

def get_idx_data(self, idx=0):
if self.train:
X_path = os.path.join(self.data_root, "uploader", "uploader_%d_X.pkl" % (idx))
y_path = os.path.join(self.data_root, "uploader", "uploader_%d_y.pkl" % (idx))
if not (os.path.exists(X_path) and os.path.exists(y_path)):
raise Exception("Index Error")
with open(X_path, "rb") as f:
X = pickle.load(f)
with open(y_path, "rb") as f:
y = pickle.load(f)
else:
X_path = os.path.join(self.data_root, "user", "user_%d_X.pkl" % (idx))
y_path = os.path.join(self.data_root, "user", "user_%d_y.pkl" % (idx))
if not (os.path.exists(X_path) and os.path.exists(y_path)):
raise Exception("Index Error")
with open(X_path, "rb") as f:
X = pickle.load(f)
with open(y_path, "rb") as f:
y = pickle.load(f)
return X, y


def generate_uploader(data_x, data_y, n_uploaders=50, data_save_root=None):
if data_save_root is None:
return
os.makedirs(data_save_root, exist_ok=True)
n = len(data_x)
for i in range(n_uploaders):
selected_X = data_x[i * (n // n_uploaders): (i + 1) * (n // n_uploaders)]
selected_y = data_y[i * (n // n_uploaders): (i + 1) * (n // n_uploaders)]
X_save_dir = os.path.join(data_save_root, "uploader_%d_X.pkl" % (i))
y_save_dir = os.path.join(data_save_root, "uploader_%d_y.pkl" % (i))
with open(X_save_dir, "wb") as f:
pickle.dump(selected_X, f)
with open(y_save_dir, "wb") as f:
pickle.dump(selected_y, f)
print("Saving to %s" % (X_save_dir))


def generate_user(data_x, data_y, n_users=50, data_save_root=None):
if data_save_root is None:
return
os.makedirs(data_save_root, exist_ok=True)
n = len(data_x)
for i in range(n_users):
selected_X = data_x[i * (n // n_users): (i + 1) * (n // n_users)]
selected_y = data_y[i * (n // n_users): (i + 1) * (n // n_users)]
X_save_dir = os.path.join(data_save_root, "user_%d_X.pkl" % (i))
y_save_dir = os.path.join(data_save_root, "user_%d_y.pkl" % (i))
with open(X_save_dir, "wb") as f:
pickle.dump(selected_X, f)
with open(y_save_dir, "wb") as f:
pickle.dump(selected_y, f)
print("Saving to %s" % (X_save_dir))

# 分层抽样
# def generate_uploader(data_x, data_y, n_uploaders=50, data_save_root=None):
# if data_save_root is None:
# return
# os.makedirs(data_save_root, exist_ok=True)
#
# sss = StratifiedShuffleSplit(n_splits=n_uploaders, test_size=1 / n_uploaders, random_state=0)
#
# # 使用 StratifiedShuffleSplit 对象来分割数据
# i = 0
# for train_index, test_index in sss.split(data_x, data_y):
# selected_X = [data_x[i] for i in test_index]
# selected_y = data_y[test_index]
#
# X_save_dir = os.path.join(data_save_root, "uploader_%d_X.pkl" % (i))
# y_save_dir = os.path.join(data_save_root, "uploader_%d_y.pkl" % (i))
# with open(X_save_dir, "wb") as f:
# pickle.dump(selected_X, f)
# with open(y_save_dir, "wb") as f:
# pickle.dump(selected_y, f)
#
# i += 1
# print("Saving to %s" % (X_save_dir))
#
#
# def generate_user(data_x, data_y, n_users=50, data_save_root=None):
# if data_save_root is None:
# return
# os.makedirs(data_save_root, exist_ok=True)
#
# sss = StratifiedShuffleSplit(n_splits=n_users, test_size=1 / n_users, random_state=0)
#
# # 使用 StratifiedShuffleSplit 对象来分割数据
# i = 0
# for train_index, test_index in sss.split(data_x, data_y):
# selected_X = [data_x[i] for i in test_index]
# selected_y = data_y[test_index]
#
# X_save_dir = os.path.join(data_save_root, "user_%d_X.pkl" % (i))
# y_save_dir = os.path.join(data_save_root, "user_%d_y.pkl" % (i))
# with open(X_save_dir, "wb") as f:
# pickle.dump(selected_X, f)
# with open(y_save_dir, "wb") as f:
# pickle.dump(selected_y, f)
#
# i += 1
# print("Saving to %s" % (X_save_dir))


# Train Uploaders' models
def train(X, y, out_classes):
vectorizer = TfidfVectorizer(stop_words="english")
X_tfidf = vectorizer.fit_transform(X)

clf = MultinomialNB(alpha=0.1)
clf.fit(X_tfidf, y)

return vectorizer, clf


def eval_prediction(pred_y, target_y):
if not isinstance(pred_y, np.ndarray):
pred_y = pred_y.detach().cpu().numpy()
if len(pred_y.shape) == 1:
predicted = np.array(pred_y)
else:
predicted = np.argmax(pred_y, 1)
annos = np.array(target_y)

total = predicted.shape[0]
correct = (predicted == annos).sum().item()

return correct / total

Loading…
Cancel
Save