import os

import numpy as np
import torch
import torchtext.functional as F
import torchtext.transforms as T
from learnware.model import BaseModel
from torch.hub import load_state_dict_from_url
from torchtext.models import RobertaClassificationHead, XLMR_BASE_ENCODER


class Model(BaseModel):
    def __init__(self):
        super().__init__(input_shape=None, output_shape=(2,))
        dir_path = os.path.dirname(os.path.abspath(__file__))
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # XLM-R base encoder topped with a binary classification head.
        num_classes = 2
        input_dim = 768
        classifier_head = RobertaClassificationHead(num_classes=num_classes, input_dim=input_dim)
        self.model = XLMR_BASE_ENCODER.get_model(head=classifier_head).to(self.device)
        # map_location lets a checkpoint saved on GPU load on a CPU-only machine as well.
        self.model.load_state_dict(
            torch.load(os.path.join(dir_path, "model.pth"), map_location=self.device)
        )
        self.model.eval()  # inference only: disable dropout in the classification head

    def fit(self, X: np.ndarray, y: np.ndarray):
        # The model ships with pre-trained weights (model.pth), so fit is a no-op.
        pass

    def predict(self, X: np.ndarray) -> np.ndarray:
        X = sentence_preprocess(X)
        X = F.to_tensor(X, padding_value=1).to(self.device)  # pad batch with XLM-R pad id 1
        with torch.no_grad():
            logits = self.model(X)
        return logits.cpu().numpy()

    def finetune(self, X: np.ndarray, y: np.ndarray):
        # Finetuning is not supported; the learnware is used as-is.
        pass


def sentence_preprocess(x_datapipe):
    padding_idx = 1  # XLM-R pad token id (used as padding_value in Model.predict)
    bos_idx = 0
    eos_idx = 2
    max_seq_len = 256
    xlmr_vocab_path = r"https://download.pytorch.org/models/text/xlmr.vocab.pt"
    xlmr_spm_model_path = r"https://download.pytorch.org/models/text/xlmr.sentencepiece.bpe.model"

    # Standard XLM-R preprocessing: SentencePiece tokenization, vocabulary lookup,
    # truncation that leaves room for the special tokens, then BOS/EOS insertion.
    text_transform = T.Sequential(
        T.SentencePieceTokenizer(xlmr_spm_model_path),
        T.VocabTransform(load_state_dict_from_url(xlmr_vocab_path)),
        T.Truncate(max_seq_len - 2),
        T.AddToken(token=bos_idx, begin=True),
        T.AddToken(token=eos_idx, begin=False),
    )

    return [text_transform(x) for x in x_datapipe]
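

# A minimal usage sketch, assuming the trained weights are available as model.pth
# next to this file; the example sentences below are illustrative placeholders.
if __name__ == "__main__":
    model = Model()
    sentences = np.array(["This movie was great!", "A complete waste of time."])
    logits = model.predict(sentences)
    print(logits.shape)           # expected: (2, 2) -- one row of class logits per sentence
    print(logits.argmax(axis=1))  # predicted class index for each sentence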