- [action] add k-means bucketing, partition sequences into buckets of nearly the same length - [trainer] print train loss every 10 steps - [loader] cws pku loader split sequence longer than max_seq_len into several shorter sequences (tag: v0.1.0)
| @@ -14,7 +14,7 @@ class Action(object): | |||||
| def k_means_1d(x, k, max_iter=100): | def k_means_1d(x, k, max_iter=100): | ||||
| """ | """ | ||||
| Perform k-means on 1-D data. | |||||
| :param x: list of int, representing points in 1-D. | :param x: list of int, representing points in 1-D. | ||||
| :param k: the number of clusters required. | :param k: the number of clusters required. | ||||
| :param max_iter: maximum iteration | :param max_iter: maximum iteration | ||||
| @@ -117,12 +117,12 @@ class BucketSampler(BaseSampler): | |||||
| def __init__(self, data_set): | def __init__(self, data_set): | ||||
| super(BucketSampler, self).__init__(data_set) | super(BucketSampler, self).__init__(data_set) | ||||
| BUCKETS = ([None] * 10) | |||||
| BUCKETS = ([None] * 20) | |||||
| self.length_freq = dict(Counter([len(example) for example in data_set])) | self.length_freq = dict(Counter([len(example) for example in data_set])) | ||||
| self.buckets = k_means_bucketing(data_set, BUCKETS) | self.buckets = k_means_bucketing(data_set, BUCKETS) | ||||
| def __iter__(self): | def __iter__(self): | ||||
| bucket_samples = self.buckets[np.random.randint(0, len(self.buckets) + 1)] | |||||
| bucket_samples = self.buckets[np.random.randint(0, len(self.buckets))] | |||||
| np.random.shuffle(bucket_samples) | np.random.shuffle(bucket_samples) | ||||
| return iter(bucket_samples) | return iter(bucket_samples) | ||||
| @@ -140,10 +140,11 @@ class Batchifier(object): | |||||
| def __iter__(self): | def __iter__(self): | ||||
| batch = [] | batch = [] | ||||
| for idx in self.sampler: | |||||
| batch.append(idx) | |||||
| if len(batch) == self.batch_size: | |||||
| while True: | |||||
| for idx in self.sampler: | |||||
| batch.append(idx) | |||||
| if len(batch) == self.batch_size: | |||||
| yield batch | |||||
| batch = [] | |||||
| if 0 < len(batch) < self.batch_size and self.drop_last is False: | |||||
| yield batch | yield batch | ||||
| batch = [] | |||||
| if 0 < len(batch) < self.batch_size and self.drop_last is False: | |||||
| yield batch | |||||
| @@ -174,7 +174,7 @@ class POSTester(BaseTester): | |||||
| truth = torch.Tensor(truth) | truth = torch.Tensor(truth) | ||||
| if torch.cuda.is_available() and self.use_cuda: | if torch.cuda.is_available() and self.use_cuda: | ||||
| truth = truth.cuda() | truth = truth.cuda() | ||||
| loss = self.model.loss(predict, truth, self.seq_len) | |||||
| loss = self.model.loss(predict, truth, self.seq_len) / self.batch_size | |||||
| prediction = self.model.prediction(predict, self.seq_len) | prediction = self.model.prediction(predict, self.seq_len) | ||||
| results = torch.Tensor(prediction).view(-1,) | results = torch.Tensor(prediction).view(-1,) | ||||
| if torch.cuda.is_available() and self.use_cuda: | if torch.cuda.is_available() and self.use_cuda: | ||||
| @@ -101,6 +101,9 @@ class BaseTrainer(Action): | |||||
| self.grad_backward(loss) | self.grad_backward(loss) | ||||
| self.update() | self.update() | ||||
| if step % 10 == 0: | |||||
| print("[epoch {} step {}] train loss={:.2f}".format(epoch, step, loss.data)) | |||||
| if self.validate: | if self.validate: | ||||
| if data_dev is None: | if data_dev is None: | ||||
| raise RuntimeError("No validation data provided.") | raise RuntimeError("No validation data provided.") | ||||
| @@ -30,6 +30,7 @@ class POSDatasetLoader(DatasetLoader): | |||||
| and "Hello world !". Each word has its own label from label1 | and "Hello world !". Each word has its own label from label1 | ||||
| to label5. | to label5. | ||||
| """ | """ | ||||
| def __init__(self, data_name, data_path): | def __init__(self, data_name, data_path): | ||||
| super(POSDatasetLoader, self).__init__(data_name, data_path) | super(POSDatasetLoader, self).__init__(data_name, data_path) | ||||
| @@ -86,7 +87,7 @@ class TokenizeDatasetLoader(DatasetLoader): | |||||
| def __init__(self, data_name, data_path): | def __init__(self, data_name, data_path): | ||||
| super(TokenizeDatasetLoader, self).__init__(data_name, data_path) | super(TokenizeDatasetLoader, self).__init__(data_name, data_path) | ||||
| def load_pku(self, max_seq_len=64): | |||||
| def load_pku(self, max_seq_len=32): | |||||
| """ | """ | ||||
| load pku dataset for Chinese word segmentation | load pku dataset for Chinese word segmentation | ||||
| CWS (Chinese Word Segmentation) pku training dataset format: | CWS (Chinese Word Segmentation) pku training dataset format: | ||||
| @@ -107,12 +108,10 @@ class TokenizeDatasetLoader(DatasetLoader): | |||||
| sentences = f.readlines() | sentences = f.readlines() | ||||
| data = [] | data = [] | ||||
| for sent in sentences: | for sent in sentences: | ||||
| tokens = sent.strip().split() | |||||
| words = [] | words = [] | ||||
| labels = [] | labels = [] | ||||
| tokens = sent.strip().split() | |||||
| for start in range(len(tokens) // max_seq_len): | |||||
| for token in token_seq: | |||||
| for token in tokens: | |||||
| if len(token) == 1: | if len(token) == 1: | ||||
| words.append(token) | words.append(token) | ||||
| labels.append("S") | labels.append("S") | ||||
| @@ -124,7 +123,15 @@ class TokenizeDatasetLoader(DatasetLoader): | |||||
| labels.append("M") | labels.append("M") | ||||
| words.append(token[-1]) | words.append(token[-1]) | ||||
| labels.append("E") | labels.append("E") | ||||
| data.append([words, labels]) | |||||
| num_samples = len(words) // max_seq_len | |||||
| if len(words) % max_seq_len != 0: | |||||
| num_samples += 1 | |||||
| for sample_idx in range(num_samples): | |||||
| start = sample_idx * max_seq_len | |||||
| end = (sample_idx + 1) * max_seq_len | |||||
| seq_words = words[start:end] | |||||
| seq_labels = labels[start:end] | |||||
| data.append([seq_words, seq_labels]) | |||||
| return data | return data | ||||
| @@ -23,6 +23,7 @@ def save_pickle(obj, pickle_path, file_name): | |||||
| def load_pickle(pickle_path, file_name): | def load_pickle(pickle_path, file_name): | ||||
| with open(os.path.join(pickle_path, file_name), "rb") as f: | with open(os.path.join(pickle_path, file_name), "rb") as f: | ||||
| obj = _pickle.load(f) | obj = _pickle.load(f) | ||||
| print("{} loaded. ".format(file_name)) | |||||
| return obj | return obj | ||||