import os
import pickle

import numpy as np
import torch
import torch.nn as nn
import torchtext.functional as F
import torchtext.transforms as T
from torch.hub import load_state_dict_from_url
from torch.optim import AdamW
from torch.utils.data import DataLoader
from torchtext.models import RobertaClassificationHead, XLMR_BASE_ENCODER

class TextDataLoader:
    """Load the pickled (X, y) split saved for one uploader or one user."""

    def __init__(self, data_root, train: bool = True):
        self.data_root = data_root
        self.train = train

    def get_idx_data(self, idx=0):
        # Uploader (training) splits and user (test) splits live in separate subfolders.
        prefix = "uploader" if self.train else "user"
        X_path = os.path.join(self.data_root, prefix, "%s_%d_X.pkl" % (prefix, idx))
        y_path = os.path.join(self.data_root, prefix, "%s_%d_y.pkl" % (prefix, idx))
        if not (os.path.exists(X_path) and os.path.exists(y_path)):
            raise IndexError("No %s data found for index %d" % (prefix, idx))
        with open(X_path, "rb") as f:
            X = pickle.load(f)
        with open(y_path, "rb") as f:
            y = pickle.load(f)
        return X, y

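# A minimal usage sketch (the data root below is a placeholder): fetch the first
# uploader's training split, assuming generate_uploader() has already been run.
#
#   text_loader = TextDataLoader("./text_data", train=True)
#   uploader_X, uploader_y = text_loader.get_idx_data(idx=0)
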
def generate_uploader(data_x, data_y, n_uploaders=50, data_save_root=None):
    """Split (data_x, data_y) into n_uploaders contiguous chunks and pickle each chunk."""
    if data_save_root is None:
        return
    os.makedirs(data_save_root, exist_ok=True)
    n = len(data_x)
    for i in range(n_uploaders):
        selected_X = data_x[i * (n // n_uploaders) : (i + 1) * (n // n_uploaders)]
        selected_y = data_y[i * (n // n_uploaders) : (i + 1) * (n // n_uploaders)]
        X_save_path = os.path.join(data_save_root, "uploader_%d_X.pkl" % (i))
        y_save_path = os.path.join(data_save_root, "uploader_%d_y.pkl" % (i))
        with open(X_save_path, "wb") as f:
            pickle.dump(selected_X, f)
        with open(y_save_path, "wb") as f:
            pickle.dump(selected_y, f)
        print("Saving to %s" % (X_save_path))

def generate_user(data_x, data_y, n_users=50, data_save_root=None):
    """Split (data_x, data_y) into n_users contiguous chunks and pickle each chunk."""
    if data_save_root is None:
        return
    os.makedirs(data_save_root, exist_ok=True)
    n = len(data_x)
    for i in range(n_users):
        selected_X = data_x[i * (n // n_users) : (i + 1) * (n // n_users)]
        selected_y = data_y[i * (n // n_users) : (i + 1) * (n // n_users)]
        X_save_path = os.path.join(data_save_root, "user_%d_X.pkl" % (i))
        y_save_path = os.path.join(data_save_root, "user_%d_y.pkl" % (i))
        with open(X_save_path, "wb") as f:
            pickle.dump(selected_X, f)
        with open(y_save_path, "wb") as f:
            pickle.dump(selected_y, f)
        print("Saving to %s" % (X_save_path))

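# Both generators slice the dataset into contiguous, equally sized chunks of
# n // n_parts samples; any remainder is dropped. For example, with 5,000 samples
# and 50 uploaders, each uploader receives 100 samples. A hypothetical call,
# matching the directory layout TextDataLoader expects:
#
#   generate_uploader(raw_X, raw_y, n_uploaders=50, data_save_root="./text_data/uploader")
#   generate_user(raw_X, raw_y, n_users=50, data_save_root="./text_data/user")
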
def sentence_preprocess(x_datapipe):
    # Special token ids and sequence limit used by the pretrained XLM-R encoder.
    padding_idx = 1
    bos_idx = 0
    eos_idx = 2
    max_seq_len = 256
    xlmr_vocab_path = r"https://download.pytorch.org/models/text/xlmr.vocab.pt"
    xlmr_spm_model_path = r"https://download.pytorch.org/models/text/xlmr.sentencepiece.bpe.model"

    # Tokenize with SentencePiece, map tokens to vocabulary ids, truncate, and
    # wrap each sequence in BOS/EOS tokens.
    text_transform = T.Sequential(
        T.SentencePieceTokenizer(xlmr_spm_model_path),
        T.VocabTransform(load_state_dict_from_url(xlmr_vocab_path)),
        T.Truncate(max_seq_len - 2),
        T.AddToken(token=bos_idx, begin=True),
        T.AddToken(token=eos_idx, begin=False),
    )

    x_datapipe = [text_transform(x) for x in x_datapipe]
    return x_datapipe

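# The transform turns each raw string into a list of at most 256 vocabulary ids
# of the form [0, t_1, ..., t_k, 2] (BOS first, EOS last); padding sequences to a
# rectangular batch happens later via F.to_tensor(..., padding_value=1).
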
def train_step(model, criteria, optim, input, target):
    # One optimization step on a single batch.
    output = model(input)
    loss = criteria(output, target)
    optim.zero_grad()
    loss.backward()
    optim.step()


def eval_step(model, criteria, input, target):
    # Return the batch loss and the number of correct predictions in the batch.
    output = model(input)
    loss = criteria(output, target).item()
    return float(loss), (output.argmax(1) == target).type(torch.float).sum().item()

def evaluate(model, criteria, dev_dataloader):
    # Return the mean batch loss and the overall accuracy on dev_dataloader.
    model.eval()
    total_loss = 0
    correct_predictions = 0
    total_predictions = 0
    counter = 0
    with torch.no_grad():
        for batch in dev_dataloader:
            input = F.to_tensor(batch["token_ids"], padding_value=1).to(DEVICE)
            target = torch.tensor(batch["target"]).to(DEVICE)
            loss, predictions = eval_step(model, criteria, input, target)
            total_loss += loss
            correct_predictions += predictions
            total_predictions += len(target)
            counter += 1

    return total_loss / counter, correct_predictions / total_predictions

# Use the GPU when one is available.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Train an uploader's model: an XLM-R base encoder with a RoBERTa classification head.
def train(X, y, out_classes, epochs=10, batch_size=128):
    # torchdata is only needed for batching here, so import it lazily.
    from torchdata.datapipes.iter import IterableWrapper

    # Tokenize the raw sentences, then batch (token_ids, label) pairs and regroup
    # each batch into {"token_ids": [...], "target": [...]} columns.
    X = sentence_preprocess(X)
    train_datapipe = IterableWrapper(list(zip(X, y)))
    train_datapipe = train_datapipe.batch(batch_size)
    train_datapipe = train_datapipe.rows2columnar(["token_ids", "target"])
    # The datapipe already yields batches, so the DataLoader must not re-batch them.
    train_dataloader = DataLoader(train_datapipe, batch_size=None)

    input_dim = 768  # hidden size of the XLM-R base encoder
    classifier_head = RobertaClassificationHead(num_classes=out_classes, input_dim=input_dim)
    model = XLMR_BASE_ENCODER.get_model(head=classifier_head)
    learning_rate = 1e-5
    optim = AdamW(model.parameters(), lr=learning_rate)
    criteria = nn.CrossEntropyLoss()

    model.to(DEVICE)

    for e in range(epochs):
        # evaluate() switches the model to eval mode, so switch back each epoch.
        model.train()
        for batch in train_dataloader:
            input = F.to_tensor(batch["token_ids"], padding_value=1).to(DEVICE)
            target = torch.tensor(batch["target"]).to(DEVICE)
            train_step(model, criteria, optim, input, target)

        loss, accuracy = evaluate(model, criteria, train_dataloader)
        print("Epoch = [{}], loss = [{}], accuracy = [{}]".format(e, loss, accuracy))
    return model

def eval_prediction(pred_y, target_y):
    # Accept either raw logits (2-D) or already-decoded class labels (1-D)
    # and return the accuracy against target_y.
    if not isinstance(pred_y, np.ndarray):
        pred_y = pred_y.detach().cpu().numpy()
    if len(pred_y.shape) == 1:
        predicted = np.array(pred_y)
    else:
        predicted = np.argmax(pred_y, 1)
    annos = np.array(target_y)
    total = predicted.shape[0]
    correct = (predicted == annos).sum().item()
    return correct / total
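

# A hedged end-to-end sketch tying the helpers together; the paths, class count,
# and the raw_X / raw_y corpus below are placeholders rather than the workflow's
# real settings, and the guard keeps the sketch from running on import.
if __name__ == "__main__":
    # 1. Split a raw corpus across uploaders and users (assumes raw_X, raw_y exist):
    # generate_uploader(raw_X, raw_y, n_uploaders=50, data_save_root="./text_data/uploader")
    # generate_user(raw_X, raw_y, n_users=50, data_save_root="./text_data/user")

    # 2. Train one uploader's model on its split.
    uploader_X, uploader_y = TextDataLoader("./text_data", train=True).get_idx_data(idx=0)
    model = train(uploader_X, uploader_y, out_classes=2, epochs=10, batch_size=128)

    # 3. Score the trained model on one user's split.
    user_X, user_y = TextDataLoader("./text_data", train=False).get_idx_data(idx=0)
    tokens = F.to_tensor(sentence_preprocess(user_X), padding_value=1).to(DEVICE)
    model.eval()
    with torch.no_grad():
        pred_y = model(tokens)
    print("user accuracy = %.4f" % eval_prediction(pred_y, user_y))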