You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number; they can include dashes ('-') and can be up to 35 characters long.

text.cc 5.4 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133
  1. /**
  2. * Copyright 2020 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include <unistd.h>
  17. #include "minddata/dataset/include/text.h"
  18. #include "minddata/dataset/text/kernels/lookup_op.h"
  19. #include "minddata/dataset/text/kernels/sentence_piece_tokenizer_op.h"
  20. #include "minddata/dataset/util/path.h"
  21. namespace mindspore {
  22. namespace dataset {
  23. // Transform operations for text.
  24. namespace text {
  25. // FUNCTIONS TO CREATE TEXT OPERATIONS
  26. // (In alphabetical order)
  27. std::shared_ptr<LookupOperation> Lookup(const std::shared_ptr<Vocab> &vocab, const std::string &unknown_token,
  28. const DataType &data_type) {
  29. auto op = std::make_shared<LookupOperation>(vocab, unknown_token, data_type);
  30. return op->ValidateParams() ? op : nullptr;
  31. }
  32. std::shared_ptr<SentencePieceTokenizerOperation> SentencePieceTokenizer(
  33. const std::shared_ptr<SentencePieceVocab> &vocab, SPieceTokenizerOutType out_type) {
  34. auto op = std::make_shared<SentencePieceTokenizerOperation>(vocab, out_type);
  35. return op->ValidateParams() ? op : nullptr;
  36. }
  37. std::shared_ptr<SentencePieceTokenizerOperation> SentencePieceTokenizer(const std::string &vocab_path,
  38. SPieceTokenizerOutType out_type) {
  39. auto op = std::make_shared<SentencePieceTokenizerOperation>(vocab_path, out_type);
  40. return op->ValidateParams() ? op : nullptr;
  41. }
  42. /* ####################################### Validator Functions ############################################ */
  43. /* ####################################### Derived TensorOperation classes ################################# */
  44. // (In alphabetical order)
  45. // LookupOperation
  46. LookupOperation::LookupOperation(const std::shared_ptr<Vocab> &vocab, const std::string &unknown_token,
  47. const DataType &data_type)
  48. : vocab_(vocab), unknown_token_(unknown_token), default_id_(Vocab::kNoTokenExists), data_type_(data_type) {}
  49. Status LookupOperation::ValidateParams() {
  50. if (vocab_ == nullptr) {
  51. std::string err_msg = "Lookup: vocab object type is incorrect or null.";
  52. MS_LOG(ERROR) << err_msg;
  53. RETURN_STATUS_SYNTAX_ERROR(err_msg);
  54. }
  55. default_id_ = vocab_->Lookup(unknown_token_);
  56. if (default_id_ == Vocab::kNoTokenExists) {
  57. std::string err_msg = "Lookup: " + unknown_token_ + " doesn't exist in vocab.";
  58. MS_LOG(ERROR) << err_msg;
  59. RETURN_STATUS_SYNTAX_ERROR(err_msg);
  60. }
  61. return Status::OK();
  62. }
  63. std::shared_ptr<TensorOp> LookupOperation::Build() {
  64. std::shared_ptr<LookupOp> tensor_op = std::make_shared<LookupOp>(vocab_, default_id_, data_type_);
  65. return tensor_op;
  66. }
  67. // SentencePieceTokenizerOperation
  68. SentencePieceTokenizerOperation::SentencePieceTokenizerOperation(const std::shared_ptr<SentencePieceVocab> &vocab,
  69. SPieceTokenizerOutType out_type)
  70. : vocab_(vocab), vocab_path_(std::string()), load_type_(SPieceTokenizerLoadType::kModel), out_type_(out_type) {}
  71. SentencePieceTokenizerOperation::SentencePieceTokenizerOperation(const std::string &vocab_path,
  72. SPieceTokenizerOutType out_type)
  73. : vocab_(nullptr), vocab_path_(vocab_path), load_type_(SPieceTokenizerLoadType::kFile), out_type_(out_type) {}
  74. Status SentencePieceTokenizerOperation::ValidateParams() {
  75. if (load_type_ == SPieceTokenizerLoadType::kModel) {
  76. if (vocab_ == nullptr) {
  77. std::string err_msg = "SentencePieceTokenizer: vocab object type is incorrect or null.";
  78. MS_LOG(ERROR) << err_msg;
  79. RETURN_STATUS_SYNTAX_ERROR(err_msg);
  80. }
  81. } else {
  82. Path vocab_file(vocab_path_);
  83. if (!vocab_file.Exists() || vocab_file.IsDirectory()) {
  84. std::string err_msg = "SentencePieceTokenizer : vocab file: [" + vocab_path_ + "] is invalid or does not exist.";
  85. MS_LOG(ERROR) << err_msg;
  86. RETURN_STATUS_SYNTAX_ERROR(err_msg);
  87. }
  88. if (access(vocab_file.toString().c_str(), R_OK) == -1) {
  89. std::string err_msg = "SentencePieceTokenizer : no access to specified dataset file: " + vocab_path_;
  90. MS_LOG(ERROR) << err_msg;
  91. RETURN_STATUS_SYNTAX_ERROR(err_msg);
  92. }
  93. }
  94. return Status::OK();
  95. }
  96. std::shared_ptr<TensorOp> SentencePieceTokenizerOperation::Build() {
  97. std::shared_ptr<SentencePieceTokenizerOp> tensor_op;
  98. if (load_type_ == SPieceTokenizerLoadType::kModel) {
  99. tensor_op = std::make_shared<SentencePieceTokenizerOp>(vocab_, load_type_, out_type_);
  100. } else {
  101. Path vocab_file(vocab_path_);
  102. std::string model_path = vocab_file.ParentPath();
  103. std::string model_filename = vocab_file.Basename();
  104. tensor_op = std::make_shared<SentencePieceTokenizerOp>(model_path, model_filename, load_type_, out_type_);
  105. }
  106. return tensor_op;
  107. }
  108. } // namespace text
  109. } // namespace dataset
  110. } // namespace mindspore