Merge pull request !2390 from qianlong21st/wordpiece_tokenizer_1D (tags/v0.5.0-beta)
@@ -32,23 +32,6 @@ WordpieceTokenizerOp::WordpieceTokenizerOp(const std::shared_ptr<Vocab> &vocab,
       max_bytes_per_token_(max_bytes_per_token),
       unknown_token_(unknown_token) {}
 
-void WordpieceTokenizerOp::PadTokens(const std::vector<std::vector<std::string>> &tokens, const std::string &padded_str,
-                                     std::vector<std::string> *out_padded_tokens, int *out_cols) const {
-  int rows = tokens.size();
-  int max_cols = 0;
-  for (int i = 0; i < rows; i++) {
-    max_cols = std::max(max_cols, static_cast<int>(tokens[i].size()));
-  }
-  out_padded_tokens->resize(rows * max_cols, padded_str);
-  for (int i = 0; i < rows; i++) {
-    int index = i * max_cols;
-    for (int j = 0; j < tokens[i].size(); j++) {
-      (*out_padded_tokens)[index++] = tokens[i][j];
-    }
-  }
-  *out_cols = max_cols;
-}
-
 Status WordpieceTokenizerOp::LookupWord(const std::string &input_token, const RuneStrArray &runes, const int start,
                                         bool *out_found, int *out_end) const {
   CHECK_FAIL_RETURN_UNEXPECTED(start >= 0 && start < input_token.size(), "Out of range");
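For reference, the PadTokens helper removed above padded every row of subword tokens out to the longest row so the result could be laid out as a rectangular 2-D tensor. A minimal Python sketch of that padding behavior (the helper name and example tokens are purely illustrative, not part of the MindSpore API):

```python
# Sketch of the padding the removed PadTokens performed: flatten the ragged rows
# row-major, padding each row to the longest row's length with a fill string
# (assumed behavior, mirroring the C++ loops above).
def pad_tokens(tokens, padded_str="<pad>"):
    max_cols = max((len(row) for row in tokens), default=0)
    out = []
    for row in tokens:
        out.extend(row + [padded_str] * (max_cols - len(row)))
    return out, max_cols

padded, cols = pad_tokens([["mak", "##ing"], ["work"]])
# padded == ['mak', '##ing', 'work', '<pad>'], cols == 2
```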
@@ -117,20 +100,16 @@ Status WordpieceTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::
   if (input->Rank() > 1 || input->type() != DataType::DE_STRING) {
     RETURN_STATUS_UNEXPECTED("The input tensor should be scalar or 1-D string tensor");
   }
-  std::vector<std::vector<std::string>> out_tokens(input->Size());
-  int i = 0;
+  std::vector<std::string> out_tokens;
   for (auto iter = input->begin<std::string_view>(); iter != input->end<std::string_view>(); iter++) {
-    RETURN_IF_NOT_OK(GetTokens(std::string(*iter), &out_tokens[i++]));
+    std::vector<std::string> temp_tokens;
+    RETURN_IF_NOT_OK(GetTokens(std::string(*iter), &temp_tokens));
+    out_tokens.insert(out_tokens.end(), temp_tokens.begin(), temp_tokens.end());
   }
-  std::vector<std::string> padded_tokens;
-  int cols = 0;
-  PadTokens(out_tokens, "<pad>", &padded_tokens, &cols);
-  std::vector<dsize_t> shapes;
-  if (input->Rank() == 1) {
-    shapes.push_back(out_tokens.size());
+  if (out_tokens.empty()) {
+    out_tokens.emplace_back("");
   }
-  shapes.push_back(cols);
-  *output = std::make_shared<Tensor>(std::move(padded_tokens), TensorShape(shapes));
+  *output = std::make_shared<Tensor>(out_tokens, TensorShape({(dsize_t)out_tokens.size()}));
   return Status::OK();
 }
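In short, Compute now concatenates the subwords of every input token into one flat list and emits a 1-D string tensor, instead of padding per-token subword lists into a 2-D tensor. A rough Python equivalent of the new control flow (sketch only; get_tokens stands in for the C++ GetTokens):

```python
# Sketch of the new Compute logic (assumption: get_tokens returns the wordpiece
# subwords of a single input token, as GetTokens does in the C++ code).
def compute(input_tokens, get_tokens):
    out_tokens = []
    for token in input_tokens:       # scalar or 1-D string input
        out_tokens.extend(get_tokens(token))
    if not out_tokens:               # never emit an empty tensor
        out_tokens.append("")
    return out_tokens                # 1-D output, shape (len(out_tokens),)
```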
@@ -48,8 +48,6 @@ class WordpieceTokenizerOp : public TensorOp {
   Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
 
  protected:
-  void PadTokens(const std::vector<std::vector<std::string>> &tokens, const std::string &padded_str,
-                 std::vector<std::string> *out_padded_tokens, int *out_cols) const;
   Status AddSubword(const std::string &input_token, const int start, const int end,
                     std::vector<std::string> *out_token) const;
   Status FoundNoToken(const std::string &input_token, std::vector<std::string> *out_tokens) const;
@@ -188,7 +188,7 @@ class UnicodeCharTokenizer(cde.UnicodeCharTokenizerOp):
 class WordpieceTokenizer(cde.WordpieceTokenizerOp):
     """
-    Tokenize scalar token or 1-D tokens to subword tokens.
+    Tokenize scalar token or 1-D tokens to 1-D subword tokens.
 
     Args:
         vocab(Vocab): a Vocab object.
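A hedged usage sketch of the resulting 1-D output (module paths, Vocab construction, the file path, and the column name are assumptions inferred from the surrounding tests, not something this diff confirms):

```python
# Illustrative only: a whitespace tokenizer feeds 1-D tokens into
# WordpieceTokenizer, which now returns a flat 1-D list of subwords.
import mindspore.dataset as ds
import mindspore.dataset.text as text

vocab = text.Vocab.from_list(["i", "am", "mak", "##ing", "small", "mistake",
                              "##s", "during", "work", "##ing", "hour", "[UNK]"])
ops = [text.WhitespaceTokenizer(),
       text.WordpieceTokenizer(vocab=vocab, unknown_token="[UNK]")]

data = ds.TextFileDataset("/path/to/corpus.txt", shuffle=False)
data = data.map(input_columns=["text"], operations=ops)
for row in data.create_dict_iterator():
    # e.g. ['i', 'am', 'mak', '##ing', ...] -- previously a padded 2-D array.
    print(row["text"])
```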
@@ -35,38 +35,24 @@ test_paras = [
     dict(
         first=1,
         last=4,
-        expect_str=[[['床'], ['前'], ['明'], ['月'], ['光']],
-                    [['疑'], ['是'], ['地'], ['上'], ['霜']],
-                    [['举'], ['头'], ['望'], ['明'], ['月']],
-                    [['低'], ['头'], ['思'], ['故'], ['乡']]],
+        expect_str=[['床', '前', '明', '月', '光'],
+                    ['疑', '是', '地', '上', '霜'],
+                    ['举', '头', '望', '明', '月'],
+                    ['低', '头', '思', '故', '乡']],
         vocab_list=vocab_bert
     ),
     # test english text
     dict(
         first=5,
         last=5,
-        expect_str=[[['i', pad],
-                     ["am", pad],
-                     ['mak', '##ing'],
-                     ['small', pad],
-                     ['mistake', '##s'],
-                     ['during', pad],
-                     ['work', '##ing'],
-                     ['hour', '##s']]],
+        expect_str=[['i', 'am', 'mak', '##ing', 'small', 'mistake', '##s', 'during', 'work', '##ing', 'hour', '##s']],
         lower_case=True,
         vocab_list=vocab_bert
     ),
     dict(
         first=5,
         last=5,
-        expect_str=[[['I', pad],
-                     ["am", pad],
-                     ['mak', '##ing'],
-                     ['small', pad],
-                     ['mistake', '##s'],
-                     ['during', pad],
-                     ['work', '##ing'],
-                     ['hour', '##s']]],
+        expect_str=[['I', "am", 'mak', '##ing', 'small', 'mistake', '##s', 'during', 'work', '##ing', 'hour', '##s']],
         lower_case=False,
         vocab_list=vocab_bert
     ),
@@ -75,8 +61,8 @@ test_paras = [
         first=6,
         last=7,
         expect_str=[
-            [['😀'], ['嘿'], ['嘿'], ['😃'], ['哈'], ['哈'], ['😄'], ['大'], ['笑'], ['😁'], ['嘻'], ['嘻']],
-            [['繁'], ['體'], ['字']]],
+            ['😀', '嘿', '嘿', '😃', '哈', '哈', '😄', '大', '笑', '😁', '嘻', '嘻'],
+            ['繁', '體', '字']],
         normalization_form=nlp.utils.NormalizeForm.NFKC,
         vocab_list=vocab_bert
     ),
@@ -85,11 +71,11 @@ test_paras = [
         first=8,
         last=12,
         expect_str=[
-            [['[UNK]'], ['[CLS]']],
-            [['[UNK]'], ['[SEP]']],
-            [['[UNK]'], ['[UNK]']],
-            [['[UNK]'], ['[PAD]']],
-            [['[UNK]'], ['[MASK]']],
+            ['[UNK]', '[CLS]'],
+            ['[UNK]', '[SEP]'],
+            ['[UNK]', '[UNK]'],
+            ['[UNK]', '[PAD]'],
+            ['[UNK]', '[MASK]'],
         ],
         lower_case=False,
         vocab_list=vocab_bert,
@@ -99,7 +85,7 @@ test_paras = [
     dict(
         first=13,
         last=13,
-        expect_str=[[['12'], ['+'], ['/'], ['-'], ['28'], ['='], ['40'], ['/'], ['-'], ['16']]],
+        expect_str=[['12', '+', '/', '-', '28', '=', '40', '/', '-', '16']],
         preserve_unused_token=True,
         vocab_list=vocab_bert
     ),
@@ -107,9 +93,7 @@ test_paras = [
     dict(
         first=8,
         last=8,
-        expect_str=[
-            [['[UNK]'], [' '], ['[CLS]']],
-        ],
+        expect_str=[['[UNK]', ' ', '[CLS]']],
         lower_case=False,
         vocab_list=vocab_bert,
         preserve_unused_token=True,
@@ -118,9 +102,7 @@ test_paras = [
     dict(
         first=8,
         last=8,
-        expect_str=[
-            [['unused'], [' '], ['[CLS]']],
-        ],
+        expect_str=[['unused', ' ', '[CLS]']],
         lower_case=False,
         vocab_list=vocab_bert,
         preserve_unused_token=True,
@@ -130,9 +112,7 @@ test_paras = [
     dict(
         first=8,
         last=8,
-        expect_str=[
-            [['unused'], [' '], ['['], ['CLS'], [']']],
-        ],
+        expect_str=[['unused', ' ', '[', 'CLS', ']']],
         lower_case=False,
         vocab_list=vocab_bert,
         preserve_unused_token=False,
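With the nesting removed, each entry of expect_str is one flat list per input line, so a verification loop can compare row outputs directly. A hypothetical sketch of such a check (the actual test harness and column name are not shown in this diff):

```python
import numpy as np

def check_results(dataset, expect_str):
    # Each row's column is now a 1-D array of subword strings, so it can be
    # compared against one flat expected list per input line.
    for row, expected in zip(dataset.create_dict_iterator(), expect_str):
        np.testing.assert_array_equal(row["text"], np.array(expected))
```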