|
|
|
@@ -1,5 +1,7 @@ |
|
|
|
import os |
|
|
|
import pickle |
|
|
|
import random |
|
|
|
from itertools import combinations |
|
|
|
|
|
|
|
import numpy as np |
|
|
|
import pandas as pd |
|
|
|
@@ -9,6 +11,10 @@ from sklearn.model_selection import train_test_split, StratifiedShuffleSplit |
|
|
|
from sklearn.naive_bayes import MultinomialNB |
|
|
|
from sklearn.metrics import accuracy_score, f1_score |
|
|
|
|
|
|
|
# Super-class label prefixes (presumably the 20 Newsgroups top-level
# categories, e.g. "comp.*" — confirm against the data loader).
super_classes = ["comp", "rec", "sci", "talk", "misc"]
# All unordered pairs and triples of super-classes: C(5,2)=10 and C(5,3)=10.
super_classes_select2 = [pair for pair in combinations(super_classes, 2)]
super_classes_select3 = [triple for triple in combinations(super_classes, 3)]
|
|
|
|
|
|
|
|
|
|
|
class TextDataLoader: |
|
|
|
def __init__(self, data_root, train: bool = True): |
|
|
|
@@ -42,26 +48,48 @@ def generate_uploader(data_x, data_y, n_uploaders=50, data_save_root=None): |
|
|
|
def generate_uploader(data_x, data_y, n_uploaders=50, data_save_root=None):
    """Partition the dataset among uploaders by super-class triple and pickle each share.

    Uploader ``i`` receives exactly the samples whose label prefix (the part of
    the label before the first ``'.'``) belongs to ``super_classes_select3[i]``.
    Each share is written to ``data_save_root`` as ``uploader_i_X.pkl`` and
    ``uploader_i_y.pkl``.

    Args:
        data_x: sample container supporting fancy indexing with a list of
            positions (e.g. a numpy array) — TODO confirm against callers.
        data_y: label container whose elements are strings like
            ``"comp.graphics"`` and which exposes ``.codes`` after indexing
            (e.g. a pandas Categorical) — TODO confirm.
        n_uploaders: requested number of uploaders; effectively capped at
            ``len(super_classes_select3)`` (10) by the slice below.
        data_save_root: output directory; if ``None`` the function is a no-op.
    """
    if data_save_root is None:
        return
    os.makedirs(data_save_root, exist_ok=True)
    # NOTE(review): an equal-size slicing loop used to precede this one, but
    # its selections were unconditionally overwritten here — removed as dead code.
    for i, labels in enumerate(super_classes_select3[:n_uploaders]):
        # Keep samples whose super-class (label prefix before '.') is in this triple.
        indices = [idx for idx, label in enumerate(data_y) if label.split('.')[0] in labels]
        selected_X = data_x[indices]
        selected_y = data_y[indices].codes

        X_save_dir = os.path.join(data_save_root, "uploader_%d_X.pkl" % (i))
        y_save_dir = os.path.join(data_save_root, "uploader_%d_y.pkl" % (i))

        with open(X_save_dir, "wb") as f:
            pickle.dump(selected_X, f)
        with open(y_save_dir, "wb") as f:
            pickle.dump(selected_y, f)
        print("Saving to %s" % (X_save_dir))
|
|
|
|
|
|
|
# Random selection (commented-out alternative implementation)

# def generate_user(data_x, data_y, n_users=50, data_save_root=None):
|
|
|
# if data_save_root is None: |
|
|
|
# return |
|
|
|
# os.makedirs(data_save_root, exist_ok=True) |
|
|
|
# n = len(data_x) |
|
|
|
# for i in range(n_users): |
|
|
|
# selected_X = data_x[i * (n // n_users): (i + 1) * (n // n_users)] |
|
|
|
# selected_y = data_y[i * (n // n_users): (i + 1) * (n // n_users)].codes |
|
|
|
# X_save_dir = os.path.join(data_save_root, "user_%d_X.pkl" % (i)) |
|
|
|
# y_save_dir = os.path.join(data_save_root, "user_%d_y.pkl" % (i)) |
|
|
|
# with open(X_save_dir, "wb") as f: |
|
|
|
# pickle.dump(selected_X, f) |
|
|
|
# with open(y_save_dir, "wb") as f: |
|
|
|
# pickle.dump(selected_y, f) |
|
|
|
# print("Saving to %s" % (X_save_dir)) |
|
|
|
|
|
|
|
def generate_user(data_x, data_y, n_users=50, data_save_root=None):
    """Partition the dataset among users by super-class triple and pickle each share.

    Mirrors ``generate_uploader``: user ``i`` receives the samples whose label
    prefix (before the first ``'.'``) belongs to ``super_classes_select3[i]``.
    Each share is written to ``data_save_root`` as ``user_i_X.pkl`` and
    ``user_i_y.pkl``.

    Args:
        data_x: sample container supporting fancy indexing with a list of
            positions (e.g. a numpy array) — TODO confirm against callers.
        data_y: label container whose elements are strings like
            ``"comp.graphics"`` and which exposes ``.codes`` after indexing
            (e.g. a pandas Categorical) — TODO confirm.
        n_users: requested number of users; effectively capped at
            ``len(super_classes_select3)`` (10) by the slice below.
        data_save_root: output directory; if ``None`` the function is a no-op.
    """
    if data_save_root is None:
        return
    os.makedirs(data_save_root, exist_ok=True)
    # NOTE(review): an equal-size slicing loop used to precede this one, but
    # its selections were unconditionally overwritten here — removed as dead code.
    for i, labels in enumerate(super_classes_select3[:n_users]):
        # Keep samples whose super-class (label prefix before '.') is in this triple.
        indices = [idx for idx, label in enumerate(data_y) if label.split('.')[0] in labels]
        selected_X = data_x[indices]
        selected_y = data_y[indices].codes

        X_save_dir = os.path.join(data_save_root, "user_%d_X.pkl" % (i))
        y_save_dir = os.path.join(data_save_root, "user_%d_y.pkl" % (i))

        with open(X_save_dir, "wb") as f:
            pickle.dump(selected_X, f)
        with open(y_save_dir, "wb") as f:
            pickle.dump(selected_y, f)
        print("Saving to %s" % (X_save_dir))
|
|
|
|
|
|
|
# Stratified sampling (commented-out alternative implementation)
|
|
|
# def generate_uploader(data_x, data_y, n_uploaders=50, data_save_root=None): |
|
|
|
# if data_save_root is None: |
|
|
|
# return |
|
|
|
# os.makedirs(data_save_root, exist_ok=True) |
|
|
|
# |
|
|
|
# sss = StratifiedShuffleSplit(n_splits=n_uploaders, test_size=1 / n_uploaders, random_state=0) |
|
|
|
# |
|
|
|
# # Use the StratifiedShuffleSplit object to split the data
|
|
|
# i = 0 |
|
|
|
# for train_index, test_index in sss.split(data_x, data_y): |
|
|
|
# selected_X = [data_x[i] for i in test_index] |
|
|
|
# selected_y = data_y[test_index] |
|
|
|
# |
|
|
|
# X_save_dir = os.path.join(data_save_root, "uploader_%d_X.pkl" % (i)) |
|
|
|
# y_save_dir = os.path.join(data_save_root, "uploader_%d_y.pkl" % (i)) |
|
|
|
# with open(X_save_dir, "wb") as f: |
|
|
|
# pickle.dump(selected_X, f) |
|
|
|
# with open(y_save_dir, "wb") as f: |
|
|
|
# pickle.dump(selected_y, f) |
|
|
|
# |
|
|
|
# i += 1 |
|
|
|
# print("Saving to %s" % (X_save_dir)) |
|
|
|
# |
|
|
|
# |
|
|
|
# def generate_user(data_x, data_y, n_users=50, data_save_root=None): |
|
|
|
# if data_save_root is None: |
|
|
|
# return |
|
|
|
# os.makedirs(data_save_root, exist_ok=True) |
|
|
|
# |
|
|
|
# sss = StratifiedShuffleSplit(n_splits=n_users, test_size=1 / n_users, random_state=0) |
|
|
|
# |
|
|
|
# # Use the StratifiedShuffleSplit object to split the data
|
|
|
# i = 0 |
|
|
|
# for train_index, test_index in sss.split(data_x, data_y): |
|
|
|
# selected_X = [data_x[i] for i in test_index] |
|
|
|
# selected_y = data_y[test_index] |
|
|
|
# |
|
|
|
# X_save_dir = os.path.join(data_save_root, "user_%d_X.pkl" % (i)) |
|
|
|
# y_save_dir = os.path.join(data_save_root, "user_%d_y.pkl" % (i)) |
|
|
|
# with open(X_save_dir, "wb") as f: |
|
|
|
# pickle.dump(selected_X, f) |
|
|
|
# with open(y_save_dir, "wb") as f: |
|
|
|
# pickle.dump(selected_y, f) |
|
|
|
# |
|
|
|
# i += 1 |
|
|
|
# print("Saving to %s" % (X_save_dir)) |
|
|
|
|
|
|
|
|
|
|
|
# Train Uploaders' models |
|
|
|
def train(X, y, out_classes): |
|
|
|
|