Merge pull request !26611 from liyong126/fix_mindrecord_log_msg
@@ -44,19 +44,22 @@ Status ShardWriter::GetFullPathFromFileName(const std::vector<std::string> &path
// Get full path from file name
for (const auto &path : paths) {
CHECK_FAIL_RETURN_UNEXPECTED(CheckIsValidUtf8(path),
"Invalid data, file name: " + path + " contains invalid uft-8 character.");
"Invalid file, mindrecord file name: " + path +
" contains invalid UTF-8 characters. Please rename the mindrecord file.");
char resolved_path[PATH_MAX] = {0};
char buf[PATH_MAX] = {0};
CHECK_FAIL_RETURN_UNEXPECTED(strncpy_s(buf, PATH_MAX, common::SafeCStr(path), path.length()) == EOK,
"Failed to call securec func [strncpy_s], path: " + path);
"[Internal ERROR] Failed to call securec func [strncpy_s], path: " + path);
#if defined(_WIN32) || defined(_WIN64)
RETURN_UNEXPECTED_IF_NULL(_fullpath(resolved_path, dirname(&(buf[0])), PATH_MAX));
RETURN_UNEXPECTED_IF_NULL(_fullpath(resolved_path, common::SafeCStr(path), PATH_MAX));
#else
CHECK_FAIL_RETURN_UNEXPECTED(realpath(dirname(&(buf[0])), resolved_path) != nullptr,
"Invalid file, path: " + std::string(resolved_path));
CHECK_FAIL_RETURN_UNEXPECTED(
realpath(dirname(&(buf[0])), resolved_path) != nullptr,
"Invalid file, failed to get the realpath of mindrecord files. Please check file path: " +
std::string(resolved_path));
if (realpath(common::SafeCStr(path), resolved_path) == nullptr) {
MS_LOG(DEBUG) << "Path: " << common::SafeCStr(path) << "check success.";
MS_LOG(DEBUG) << "Succeed to check path: " << common::SafeCStr(path);
}
#endif
file_paths_.emplace_back(string(resolved_path));
@@ -75,7 +78,8 @@ Status ShardWriter::OpenDataFiles(bool append, bool overwrite) {
}
auto realpath = FileUtils::GetRealPath(dir.value().data());
CHECK_FAIL_RETURN_UNEXPECTED(realpath.has_value(), "Failed to get real path, path: " + file);
CHECK_FAIL_RETURN_UNEXPECTED(
realpath.has_value(), "Invalid file, failed to get the realpath of mindrecord files. Please check file: " + file);
std::optional<std::string> whole_path = "";
FileUtils::ConcatDirAndFileName(&realpath, &local_file_name, &whole_path);
@@ -91,19 +95,25 @@ Status ShardWriter::OpenDataFiles(bool append, bool overwrite) {
if (overwrite) {
auto res1 = std::remove(whole_path.value().c_str());
CHECK_FAIL_RETURN_UNEXPECTED(!std::ifstream(whole_path.value()) == true,
"Failed to delete file, path: " + file);
"Invalid file, failed to remove the old files when trying to overwrite "
"mindrecord files. Please check file path and permission: " +
file);
if (res1 == 0) {
MS_LOG(WARNING) << "Succeed to delete file, path: " << file;
MS_LOG(WARNING) << "Succeed to remove the old mindrecord files, path: " << file;
}
auto db_file = whole_path.value() + ".db";
auto res2 = std::remove(db_file.c_str());
CHECK_FAIL_RETURN_UNEXPECTED(!std::ifstream(whole_path.value() + ".db") == true,
"Failed to delete db file, path: " + file + ".db");
"Invalid file, failed to remove the old mindrecord meta files when trying to "
"overwrite mindrecord files. Please check file path and permission: " +
file + ".db");
if (res2 == 0) {
MS_LOG(WARNING) << "Succeed to delete metadata file, path: " << file + ".db";
MS_LOG(WARNING) << "Succeed to remove the old mindrecord metadata files, path: " << file + ".db";
}
} else {
RETURN_STATUS_UNEXPECTED("Invalid file, Mindrecord files already existed in path: " + file);
RETURN_STATUS_UNEXPECTED(
"Invalid file, mindrecord files already exist. Please check file path: " + file +
".\nIf you do not want to keep the files, set the 'overwrite' parameter to True and try again.");
}
} else {
fs->close();
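For reference, the 'overwrite' parameter named in the new message corresponds to a flag on the Python-level FileWriter; a minimal sketch, assuming a FileWriter constructor that accepts an overwrite keyword as this error string implies:

from mindspore.mindrecord import FileWriter

# With overwrite=True, OpenDataFiles removes any existing .mindrecord and .db
# files instead of raising "mindrecord files already exist".
writer = FileWriter(file_name="test.mindrecord", shard_num=1, overwrite=True)
writer.add_schema({"label": {"type": "int32"}, "data": {"type": "bytes"}}, "demo_schema")
writer.write_raw_data([{"label": 1, "data": b"image bytes abc"}])
writer.commit()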
@@ -112,17 +122,23 @@ Status ShardWriter::OpenDataFiles(bool append, bool overwrite) {
// open the mindrecord file to write
fs->open(common::SafeCStr(file), std::ios::out | std::ios::in | std::ios::binary | std::ios::trunc);
if (!fs->good()) {
RETURN_STATUS_UNEXPECTED("Failed to open file, path: " + file);
RETURN_STATUS_UNEXPECTED(
"Invalid file, failed to open files for writing mindrecord files. Please check file path, permission and "
"open file limit: " +
file);
}
} else {
// open the mindrecord file to append
fs->open(common::SafeCStr(file), std::ios::out | std::ios::in | std::ios::binary);
if (!fs->good()) {
fs->close();
RETURN_STATUS_UNEXPECTED("Failed to open file for append data, path: " + file);
RETURN_STATUS_UNEXPECTED(
"Invalid file, failed to open files for appending mindrecord files. Please check file path, permission and "
"open file limit: " +
file);
}
}
MS_LOG(INFO) << "Succeed to open shard file, path: " << file;
MS_LOG(INFO) << "Succeed to open mindrecord shard file, path: " << file;
file_streams_.push_back(fs);
}
return Status::OK();
@@ -143,7 +159,7 @@ Status ShardWriter::RemoveLockFile() {
}
Status ShardWriter::InitLockFile() {
CHECK_FAIL_RETURN_UNEXPECTED(file_paths_.size() != 0, "Invalid data, file_paths_ is not initialized.");
CHECK_FAIL_RETURN_UNEXPECTED(file_paths_.size() != 0, "[Internal ERROR] 'file_paths_' is not initialized.");
lock_file_ = file_paths_[0] + kLockFileSuffix;
pages_file_ = file_paths_[0] + kPageFileSuffix;
@@ -154,8 +170,8 @@ Status ShardWriter::InitLockFile() {
Status ShardWriter::Open(const std::vector<std::string> &paths, bool append, bool overwrite) {
shard_count_ = paths.size();
CHECK_FAIL_RETURN_UNEXPECTED(schema_count_ <= kMaxSchemaCount,
"Invalid data, schema_count_ must be less than or equal to " +
std::to_string(kMaxSchemaCount) + ", but got " + std::to_string(schema_count_));
"[Internal ERROR] 'schema_count_' must be less than or equal to " +
std::to_string(kMaxSchemaCount) + ", but got: " + std::to_string(schema_count_));
// Get full path from file name
RETURN_IF_NOT_OK(GetFullPathFromFileName(paths));
@@ -167,7 +183,8 @@ Status ShardWriter::Open(const std::vector<std::string> &paths, bool append, boo
}
Status ShardWriter::OpenForAppend(const std::string &path) {
CHECK_FAIL_RETURN_UNEXPECTED(IsLegalFile(path), "Invalid file, path: " + path);
CHECK_FAIL_RETURN_UNEXPECTED(
IsLegalFile(path), "Invalid file, failed to verify files for appending mindrecord files. Please check file: " + path);
std::shared_ptr<json> header_ptr;
RETURN_IF_NOT_OK(ShardHeader::BuildSingleHeader(path, &header_ptr));
auto ds = std::make_shared<std::vector<std::string>>();
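OpenForAppend backs the Python-side append flow; a short sketch of the call this check guards, assuming the FileWriter.open_for_append classmethod:

from mindspore.mindrecord import FileWriter

# Reopens an existing, valid mindrecord file; OpenForAppend first verifies the
# file (IsLegalFile) and rebuilds the shard header from it before appending.
writer = FileWriter.open_for_append("test.mindrecord")
writer.write_raw_data([{"label": 2, "data": b"image bytes def"}])
writer.commit()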
@@ -262,7 +279,9 @@ void ShardWriter::DeleteErrorData(std::map<uint64_t, std::vector<json>> &raw_dat
for (auto &subMg : sub_err_mg) {
int loc = subMg.first;
std::string message = subMg.second;
MS_LOG(ERROR) << "Invalid input, the " << loc + 1 << " th data is invalid, " << message;
MS_LOG(ERROR) << "Invalid input, the " << loc + 1
<< " th data provided by user is invalid while writing mindrecord files. Please fix the error: "
<< message;
(void)delete_set.insert(loc);
}
}
@@ -299,8 +318,8 @@ Status ShardWriter::CheckDataTypeAndValue(const std::string &key, const json &va
(data_type == "int64" && !data[key].is_number_integer()) ||
(data_type == "float32" && !data[key].is_number_float()) ||
(data_type == "float64" && !data[key].is_number_float()) || (data_type == "string" && !data[key].is_string())) {
std::string message =
"field: " + key + " ,type : " + data_type + " ,value: " + data[key].dump() + " is not matched.";
std::string message = "Invalid input, for field: " + key + ", type: " + data_type +
" and value: " + data[key].dump() + " do not match while writing mindrecord files.";
PopulateMutexErrorData(i, message, err_raw_data);
RETURN_STATUS_UNEXPECTED(message);
}
@@ -309,8 +328,8 @@ Status ShardWriter::CheckDataTypeAndValue(const std::string &key, const json &va
int64_t temp_value = data[key];
if (static_cast<int64_t>(temp_value) < static_cast<int64_t>(std::numeric_limits<int32_t>::min()) ||
static_cast<int64_t>(temp_value) > static_cast<int64_t>(std::numeric_limits<int32_t>::max())) {
std::string message =
"field: " + key + " ,type : " + data_type + " ,value: " + data[key].dump() + " is out of range.";
std::string message = "Invalid input, for field: " + key + " and its type: " + data_type +
", value: " + data[key].dump() + " is out of range while writing mindrecord files.";
PopulateMutexErrorData(i, message, err_raw_data);
RETURN_STATUS_UNEXPECTED(message);
}
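CheckDataTypeAndValue verifies each raw-data value against its schema type and rejects int32 values outside the int32 range. A sketch of a record that passes both checks, reusing the schema shape from the tests further below (field names are illustrative):

import numpy as np

schema = {"file_name": {"type": "string"},
          "label": {"type": "int32"},  # must stay within [-2**31, 2**31 - 1]
          "score": {"type": "float64"},
          "mask": {"type": "int64", "shape": [-1]},
          "data": {"type": "bytes"}}
row = {"file_name": "001.jpg", "label": 43, "score": 0.8,
       "mask": np.array([3, 6, 9], dtype=np.int64),
       "data": bytes("image bytes abc", encoding='UTF-8')}
# A type mismatch (e.g. label=0.5) triggers the "do not match" message above;
# an int32 overflow (e.g. label=2**40) triggers the "out of range" message.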
@@ -366,7 +385,7 @@ Status ShardWriter::CheckData(const std::map<uint64_t, std::vector<json>> &raw_d
// calculate start position and end position for each thread
int batch_size = rawdata_iter->second.size() / shard_count_;
int thread_num = shard_count_;
CHECK_FAIL_RETURN_UNEXPECTED(thread_num > 0, "Invalid data, thread_num should be positive.");
CHECK_FAIL_RETURN_UNEXPECTED(thread_num > 0, "[Internal ERROR] 'thread_num' should be positive.");
if (thread_num > kMaxThreadCount) {
thread_num = kMaxThreadCount;
}
@@ -387,7 +406,7 @@ Status ShardWriter::CheckData(const std::map<uint64_t, std::vector<json>> &raw_d
}
CHECK_FAIL_RETURN_UNEXPECTED(
thread_num <= kMaxThreadCount,
"Invalid data, thread_num should be less than or equal to " + std::to_string(kMaxThreadCount));
"[Internal ERROR] 'thread_num' should be less than or equal to " + std::to_string(kMaxThreadCount));
// Wait for threads done
for (int x = 0; x < thread_num; ++x) {
thread_set[x].join();
@@ -404,7 +423,8 @@ Status ShardWriter::ValidateRawData(std::map<uint64_t, std::vector<json>> &raw_d
RETURN_UNEXPECTED_IF_NULL(count_ptr);
auto rawdata_iter = raw_data.begin();
schema_count_ = raw_data.size();
CHECK_FAIL_RETURN_UNEXPECTED(schema_count_ > 0, "Invalid data, schema count should be positive.");
CHECK_FAIL_RETURN_UNEXPECTED(schema_count_ > 0, "Invalid data, the number of schemas should be positive but got: " +
std::to_string(schema_count_) + ". Please check the input schema.");
// keep schema_id
std::set<int64_t> schema_ids;
@@ -412,17 +432,18 @@ Status ShardWriter::ValidateRawData(std::map<uint64_t, std::vector<json>> &raw_d
// Determine if the number of schemas is the same
CHECK_FAIL_RETURN_UNEXPECTED(shard_header_->GetSchemas().size() == schema_count_,
"Invalid data, schema count: " + std::to_string(schema_count_) + " is not matched.");
"[Internal ERROR] 'schema_count_' and the schema count in the header: " +
std::to_string(schema_count_) + " do not match.");
// Determine raw_data size == blob_data size
CHECK_FAIL_RETURN_UNEXPECTED(raw_data[0].size() == blob_data.size(),
"Invalid data, raw data size: " + std::to_string(raw_data[0].size()) +
"[Internal ERROR] raw data size: " + std::to_string(raw_data[0].size()) +
" is not equal to blob data size: " + std::to_string(blob_data.size()) + ".");
// Determine whether the number of samples corresponding to each schema is the same
for (rawdata_iter = raw_data.begin(); rawdata_iter != raw_data.end(); ++rawdata_iter) {
CHECK_FAIL_RETURN_UNEXPECTED(row_count_ == rawdata_iter->second.size(),
"Invalid data, number of samples: " + std::to_string(rawdata_iter->second.size()) +
" for schema is not matched.");
"[Internal ERROR] 'row_count_': " + std::to_string(rawdata_iter->second.size()) +
" for each schema is not the same.");
(void)schema_ids.insert(rawdata_iter->first);
}
const std::vector<std::shared_ptr<Schema>> &schemas = shard_header_->GetSchemas();
@@ -431,7 +452,7 @@ Status ShardWriter::ValidateRawData(std::map<uint64_t, std::vector<json>> &raw_d
[schema_ids](const std::shared_ptr<Schema> &schema) {
return schema_ids.find(schema->GetSchemaID()) == schema_ids.end();
}),
"Invalid data, schema id of data is not matched.");
"[Internal ERROR] schema id in 'schemas' can not be found in 'schema_ids'.");
if (!sign) {
*count_ptr = std::make_shared<std::pair<int, int>>(schema_count_, row_count_);
return Status::OK();
@@ -487,7 +508,7 @@ Status ShardWriter::LockWriter(bool parallel_writer, std::unique_ptr<int> *fd_pt
flock(fd, LOCK_EX);
} else {
close(fd);
RETURN_STATUS_UNEXPECTED("Failed to lock file, path: " + lock_file_);
RETURN_STATUS_UNEXPECTED("[Internal ERROR] Failed to lock file, path: " + lock_file_);
}
#endif
@@ -497,20 +518,20 @@ Status ShardWriter::LockWriter(bool parallel_writer, std::unique_ptr<int> *fd_pt
auto realpath = FileUtils::GetRealPath(file.data());
if (!realpath.has_value()) {
close(fd);
RETURN_STATUS_UNEXPECTED("Failed to get real path, path: " + file);
RETURN_STATUS_UNEXPECTED("[Internal ERROR] Failed to get real path, path: " + file);
}
std::shared_ptr<std::fstream> fs = std::make_shared<std::fstream>();
fs->open(realpath.value(), std::ios::in | std::ios::out | std::ios::binary);
if (fs->fail()) {
close(fd);
RETURN_STATUS_UNEXPECTED("Failed to open file, path: " + file);
RETURN_STATUS_UNEXPECTED("[Internal ERROR] Failed to open file, path: " + file);
}
file_streams_.push_back(fs);
}
auto status = shard_header_->FileToPages(pages_file_);
if (status.IsError()) {
close(fd);
RETURN_STATUS_UNEXPECTED("Error raised in FileToPages function.");
RETURN_STATUS_UNEXPECTED("[Internal ERROR] Error raised in FileToPages function.");
}
*fd_ptr = std::make_unique<int>(fd);
return Status::OK();
@@ -540,8 +561,9 @@ Status ShardWriter::WriteRawDataPreCheck(std::map<uint64_t, std::vector<json>> &
// check the free disk size
std::shared_ptr<uint64_t> size_ptr;
RETURN_IF_NOT_OK(GetDiskSize(file_paths_[0], kFreeSize, &size_ptr));
CHECK_FAIL_RETURN_UNEXPECTED(*size_ptr >= kMinFreeDiskSize,
"No free disk to be used, free disk size: " + std::to_string(*size_ptr));
CHECK_FAIL_RETURN_UNEXPECTED(
*size_ptr >= kMinFreeDiskSize,
"No free disk to be used while writing mindrecord files, available free disk size: " + std::to_string(*size_ptr));
// compress blob
if (shard_column_->CheckCompressBlob()) {
for (auto &blob : blob_data) {
@@ -615,7 +637,7 @@ Status ShardWriter::WriteRawData(std::map<uint64_t, std::vector<json>> &raw_data
// Serialize raw data
RETURN_IF_NOT_OK(WriteRawDataPreCheck(raw_data, blob_data, sign, &schema_count, &row_count));
CHECK_FAIL_RETURN_UNEXPECTED(row_count >= kInt0, "Invalid data, raw data size should be positive.");
CHECK_FAIL_RETURN_UNEXPECTED(row_count >= kInt0, "[Internal ERROR] the size of raw data should be positive.");
if (row_count == kInt0) {
return Status::OK();
}
@@ -676,7 +698,7 @@ Status ShardWriter::ParallelWriteData(const std::vector<std::vector<uint8_t>> &b
auto shards = BreakIntoShards();
// define the number of thread
int thread_num = static_cast<int>(shard_count_);
CHECK_FAIL_RETURN_UNEXPECTED(thread_num > 0, "Invalid data, thread_num should be positive.");
CHECK_FAIL_RETURN_UNEXPECTED(thread_num > 0, "[Internal ERROR] 'thread_num' should be positive.");
if (thread_num > kMaxThreadCount) {
thread_num = kMaxThreadCount;
}
@@ -741,13 +763,13 @@ Status ShardWriter::CutRowGroup(int start_row, int end_row, const std::vector<st
int page_start_row = start_row;
CHECK_FAIL_RETURN_UNEXPECTED(start_row <= end_row,
"Invalid data, start row: " + std::to_string(start_row) +
" should be less than or equal to end row: " + std::to_string(end_row));
"[Internal ERROR] 'start_row': " + std::to_string(start_row) +
" should be less than or equal to 'end_row': " + std::to_string(end_row));
CHECK_FAIL_RETURN_UNEXPECTED(
end_row <= static_cast<int>(blob_data_size_.size()) && end_row <= static_cast<int>(raw_data_size_.size()),
"Invalid data, end row: " + std::to_string(end_row) + " should be less than blob data size: " +
std::to_string(blob_data_size_.size()) + " and raw data size: " + std::to_string(raw_data_size_.size()) + ".");
"[Internal ERROR] 'end_row': " + std::to_string(end_row) + " should be less than 'blob_data_size': " +
std::to_string(blob_data_size_.size()) + " and 'raw_data_size': " + std::to_string(raw_data_size_.size()) + ".");
for (int i = start_row; i < end_row; ++i) {
// n_byte_blob(0) indicate appendBlobPage
if (n_byte_blob == 0 || n_byte_blob + blob_data_size_[i] > page_size_ ||
@@ -780,7 +802,7 @@ Status ShardWriter::AppendBlobPage(const int &shard_id, const std::vector<std::v
auto &io_seekp = file_streams_[shard_id]->seekp(page_size_ * page_id + header_size_ + bytes_page, std::ios::beg);
if (!io_seekp.good() || io_seekp.fail() || io_seekp.bad()) {
file_streams_[shard_id]->close();
RETURN_STATUS_UNEXPECTED("Failed to seekg file.");
RETURN_STATUS_UNEXPECTED("[Internal ERROR] Failed to seekg file.");
}
(void)FlushBlobChunk(file_streams_[shard_id], blob_data, blob_row);
@@ -808,7 +830,7 @@ Status ShardWriter::NewBlobPage(const int &shard_id, const std::vector<std::vect
auto &io_seekp = file_streams_[shard_id]->seekp(page_size_ * (page_id + 1) + header_size_, std::ios::beg);
if (!io_seekp.good() || io_seekp.fail() || io_seekp.bad()) {
file_streams_[shard_id]->close();
RETURN_STATUS_UNEXPECTED("Failed to seekg file.");
RETURN_STATUS_UNEXPECTED("[Internal ERROR] Failed to seekg file.");
}
(void)FlushBlobChunk(file_streams_[shard_id], blob_data, blob_row);
@@ -847,32 +869,32 @@ Status ShardWriter::ShiftRawPage(const int &shard_id, const std::vector<std::pai
// Read last row group from previous raw data page
CHECK_FAIL_RETURN_UNEXPECTED(
shard_id >= 0 && shard_id < file_streams_.size(),
"Invalid data, shard_id should be in range [0, " + std::to_string(file_streams_.size()) + ").");
"[Internal ERROR] 'shard_id' should be in range [0, " + std::to_string(file_streams_.size()) + ").");
auto &io_seekg = file_streams_[shard_id]->seekg(
page_size_ * last_raw_page_id + header_size_ + last_row_group_id_offset, std::ios::beg);
if (!io_seekg.good() || io_seekg.fail() || io_seekg.bad()) {
file_streams_[shard_id]->close();
RETURN_STATUS_UNEXPECTED("Failed to seekg file.");
RETURN_STATUS_UNEXPECTED("[Internal ERROR] Failed to seekg file.");
}
auto &io_read = file_streams_[shard_id]->read(reinterpret_cast<char *>(&buf[0]), buf.size());
if (!io_read.good() || io_read.fail() || io_read.bad()) {
file_streams_[shard_id]->close();
RETURN_STATUS_UNEXPECTED("Failed to read file.");
RETURN_STATUS_UNEXPECTED("[Internal ERROR] Failed to read file.");
}
// Merge into new row group at new raw data page
auto &io_seekp = file_streams_[shard_id]->seekp(page_size_ * (page_id + 1) + header_size_, std::ios::beg);
if (!io_seekp.good() || io_seekp.fail() || io_seekp.bad()) {
file_streams_[shard_id]->close();
RETURN_STATUS_UNEXPECTED("Failed to seekg file.");
RETURN_STATUS_UNEXPECTED("[Internal ERROR] Failed to seekg file.");
}
auto &io_handle = file_streams_[shard_id]->write(reinterpret_cast<char *>(&buf[0]), buf.size());
if (!io_handle.good() || io_handle.fail() || io_handle.bad()) {
file_streams_[shard_id]->close();
RETURN_STATUS_UNEXPECTED("Failed to write file.");
RETURN_STATUS_UNEXPECTED("[Internal ERROR] Failed to write file.");
}
last_raw_page->DeleteLastGroupId();
(void)shard_header_->SetPage(last_raw_page);
@@ -935,7 +957,7 @@ Status ShardWriter::AppendRawPage(const int &shard_id, const std::vector<std::pa
file_streams_[shard_id]->seekp(page_size_ * last_raw_page_id + header_size_ + n_bytes, std::ios::beg);
if (!io_seekp.good() || io_seekp.fail() || io_seekp.bad()) {
file_streams_[shard_id]->close();
RETURN_STATUS_UNEXPECTED("Failed to seekg file.");
RETURN_STATUS_UNEXPECTED("[Internal ERROR] Failed to seekg file.");
}
if (chunk_id > 0) {
@@ -958,7 +980,7 @@ Status ShardWriter::FlushBlobChunk(const std::shared_ptr<std::fstream> &out,
const std::pair<int, int> &blob_row) {
CHECK_FAIL_RETURN_UNEXPECTED(
blob_row.first <= blob_row.second && blob_row.second <= static_cast<int>(blob_data.size()) && blob_row.first >= 0,
"Invalid data, blob_row: " + std::to_string(blob_row.first) + ", " + std::to_string(blob_row.second) +
"[Internal ERROR] 'blob_row': " + std::to_string(blob_row.first) + ", " + std::to_string(blob_row.second) +
" is invalid.");
for (int j = blob_row.first; j < blob_row.second; ++j) {
// Write the size of blob
@@ -966,7 +988,7 @@ Status ShardWriter::FlushBlobChunk(const std::shared_ptr<std::fstream> &out,
auto &io_handle = out->write(reinterpret_cast<char *>(&line_len), kInt64Len);
if (!io_handle.good() || io_handle.fail() || io_handle.bad()) {
out->close();
RETURN_STATUS_UNEXPECTED("Failed to write file.");
RETURN_STATUS_UNEXPECTED("[Internal ERROR] Failed to write file.");
}
// Write the data of blob
@@ -974,7 +996,7 @@ Status ShardWriter::FlushBlobChunk(const std::shared_ptr<std::fstream> &out,
auto &io_handle_data = out->write(reinterpret_cast<char *>(&line[0]), line_len);
if (!io_handle_data.good() || io_handle_data.fail() || io_handle_data.bad()) {
out->close();
RETURN_STATUS_UNEXPECTED("Failed to write file.");
RETURN_STATUS_UNEXPECTED("[Internal ERROR] Failed to write file.");
}
}
return Status::OK();
@@ -990,7 +1012,7 @@ Status ShardWriter::FlushRawChunk(const std::shared_ptr<std::fstream> &out,
auto &io_handle = out->write(reinterpret_cast<char *>(&line_len), kInt64Len);
if (!io_handle.good() || io_handle.fail() || io_handle.bad()) {
out->close();
RETURN_STATUS_UNEXPECTED("Failed to write file.");
RETURN_STATUS_UNEXPECTED("[Internal ERROR] Failed to write file.");
}
}
// Write the data of multi schemas
@@ -999,7 +1021,7 @@ Status ShardWriter::FlushRawChunk(const std::shared_ptr<std::fstream> &out,
auto &io_handle = out->write(reinterpret_cast<char *>(&line[0]), line.size());
if (!io_handle.good() || io_handle.fail() || io_handle.bad()) {
out->close();
RETURN_STATUS_UNEXPECTED("Failed to write file.");
RETURN_STATUS_UNEXPECTED("[Internal ERROR] Failed to write file.");
}
}
}
@@ -1041,32 +1063,32 @@ Status ShardWriter::WriteShardHeader() {
// Write header data to multi files
CHECK_FAIL_RETURN_UNEXPECTED(
shard_count_ <= static_cast<int>(file_streams_.size()) && shard_count_ <= static_cast<int>(shard_header.size()),
"Invalid data, shard count should be less than or equal to file size: " + std::to_string(file_streams_.size()) +
", and header size: " + std::to_string(shard_header.size()) + ".");
"[Internal ERROR] 'shard_count_' should be less than or equal to 'file_streams_' size: " +
std::to_string(file_streams_.size()) + ", and 'shard_header' size: " + std::to_string(shard_header.size()) + ".");
if (shard_count_ <= kMaxShardCount) {
for (int shard_id = 0; shard_id < shard_count_; ++shard_id) {
auto &io_seekp = file_streams_[shard_id]->seekp(0, std::ios::beg);
if (!io_seekp.good() || io_seekp.fail() || io_seekp.bad()) {
file_streams_[shard_id]->close();
RETURN_STATUS_UNEXPECTED("Failed to seekp file.");
RETURN_STATUS_UNEXPECTED("[Internal ERROR] Failed to seekp file.");
}
std::vector<uint8_t> bin_header(shard_header[shard_id].begin(), shard_header[shard_id].end());
uint64_t line_len = bin_header.size();
if (line_len + kInt64Len > header_size_) {
file_streams_[shard_id]->close();
RETURN_STATUS_UNEXPECTED("shard header is too big.");
RETURN_STATUS_UNEXPECTED("[Internal ERROR] shard header is too big.");
}
auto &io_handle = file_streams_[shard_id]->write(reinterpret_cast<char *>(&line_len), kInt64Len);
if (!io_handle.good() || io_handle.fail() || io_handle.bad()) {
file_streams_[shard_id]->close();
RETURN_STATUS_UNEXPECTED("Failed to write file.");
RETURN_STATUS_UNEXPECTED("[Internal ERROR] Failed to write file.");
}
auto &io_handle_header = file_streams_[shard_id]->write(reinterpret_cast<char *>(&bin_header[0]), line_len);
if (!io_handle_header.good() || io_handle_header.fail() || io_handle_header.bad()) {
file_streams_[shard_id]->close();
RETURN_STATUS_UNEXPECTED("Failed to write file.");
RETURN_STATUS_UNEXPECTED("[Internal ERROR] Failed to write file.");
}
file_streams_[shard_id]->close();
}
@@ -1100,7 +1122,7 @@ Status ShardWriter::SerializeRawData(std::map<uint64_t, std::vector<json>> &raw_
// Set obstacles to prevent the main thread from running
thread_set[x].join();
}
CHECK_FAIL_RETURN_SYNTAX_ERROR(flag_ != true, "Error raised in FillArray function.");
CHECK_FAIL_RETURN_SYNTAX_ERROR(flag_ != true, "[Internal ERROR] Error raised in FillArray function.");
return Status::OK();
}
@@ -1111,9 +1133,10 @@ Status ShardWriter::SetRawDataSize(const std::vector<std::vector<uint8_t>> &bin_
bin_raw_data.begin() + (i * schema_count_), bin_raw_data.begin() + (i * schema_count_) + schema_count_, 0,
[](uint64_t accumulator, const std::vector<uint8_t> &row) { return accumulator + kInt64Len + row.size(); });
}
CHECK_FAIL_RETURN_SYNTAX_ERROR(
*std::max_element(raw_data_size_.begin(), raw_data_size_.end()) <= page_size_,
"Invalid data, Page size: " + std::to_string(page_size_) + " is too small to save a raw row!");
CHECK_FAIL_RETURN_SYNTAX_ERROR(*std::max_element(raw_data_size_.begin(), raw_data_size_.end()) <= page_size_,
"Invalid data, Page size: " + std::to_string(page_size_) +
" is too small to save a raw row. Please try to use the mindrecord api "
"'set_page_size(1<<25)' to enable 32MB page size.");
return Status::OK();
}
@@ -1121,16 +1144,17 @@ Status ShardWriter::SetBlobDataSize(const std::vector<std::vector<uint8_t>> &blo
blob_data_size_ = std::vector<uint64_t>(row_count_);
(void)std::transform(blob_data.begin(), blob_data.end(), blob_data_size_.begin(),
[](const std::vector<uint8_t> &row) { return kInt64Len + row.size(); });
CHECK_FAIL_RETURN_SYNTAX_ERROR(
*std::max_element(blob_data_size_.begin(), blob_data_size_.end()) <= page_size_,
"Invalid data, Page size: " + std::to_string(page_size_) + " is too small to save a blob row!");
CHECK_FAIL_RETURN_SYNTAX_ERROR(*std::max_element(blob_data_size_.begin(), blob_data_size_.end()) <= page_size_,
"Invalid data, Page size: " + std::to_string(page_size_) +
" is too small to save a blob row. Please try to use the mindrecord api "
"'set_page_size(1<<25)' to enable 32MB page size.");
return Status::OK();
}
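Both checks point at the same remedy: enlarging the page when a single raw or blob row cannot fit. A sketch, assuming the FileWriter.set_page_size API that the message names:

from mindspore.mindrecord import FileWriter

writer = FileWriter("big_rows.mindrecord", shard_num=1)
# 1 << 25 bytes = 32MB per page, so any single row up to that size fits.
writer.set_page_size(1 << 25)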
Status ShardWriter::SetLastRawPage(const int &shard_id, std::shared_ptr<Page> &last_raw_page) {
// Get last raw page
auto last_raw_page_id = shard_header_->GetLastPageIdByType(shard_id, kPageTypeRaw);
CHECK_FAIL_RETURN_SYNTAX_ERROR(last_raw_page_id >= 0, "Invalid data, last_raw_page_id: " +
CHECK_FAIL_RETURN_SYNTAX_ERROR(last_raw_page_id >= 0, "[Internal ERROR] 'last_raw_page_id': " +
std::to_string(last_raw_page_id) + " should be non-negative.");
RETURN_IF_NOT_OK(shard_header_->GetPage(shard_id, last_raw_page_id, &last_raw_page));
return Status::OK();
@@ -1139,7 +1163,7 @@ Status ShardWriter::SetLastRawPage(const int &shard_id, std::shared_ptr<Page> &l
Status ShardWriter::SetLastBlobPage(const int &shard_id, std::shared_ptr<Page> &last_blob_page) {
// Get last blob page
auto last_blob_page_id = shard_header_->GetLastPageIdByType(shard_id, kPageTypeBlob);
CHECK_FAIL_RETURN_SYNTAX_ERROR(last_blob_page_id >= 0, "Invalid data, last_blob_page_id: " +
CHECK_FAIL_RETURN_SYNTAX_ERROR(last_blob_page_id >= 0, "[Internal ERROR] 'last_blob_page_id': " +
std::to_string(last_blob_page_id) + " should be non-negative.");
RETURN_IF_NOT_OK(shard_header_->GetPage(shard_id, last_blob_page_id, &last_blob_page));
return Status::OK();
@@ -61,20 +61,27 @@ Status ShardHeader::InitializeHeader(const std::vector<json> &headers, bool load
Status ShardHeader::CheckFileStatus(const std::string &path) {
auto realpath = FileUtils::GetRealPath(path.data());
CHECK_FAIL_RETURN_UNEXPECTED(realpath.has_value(), "Failed to get real path, path: " + path);
CHECK_FAIL_RETURN_UNEXPECTED(
realpath.has_value(),
"Invalid file, failed to get the realpath of mindrecord files. Please check file path: " + path);
std::ifstream fin(realpath.value(), std::ios::in | std::ios::binary);
CHECK_FAIL_RETURN_UNEXPECTED(fin, "Failed to open file, file path: " + path);
CHECK_FAIL_RETURN_UNEXPECTED(fin,
"Invalid file, failed to open files for loading mindrecord files. Please check file "
"path, permission and open file limit: " +
path);
// fetch file size
auto &io_seekg = fin.seekg(0, std::ios::end);
if (!io_seekg.good() || io_seekg.fail() || io_seekg.bad()) {
fin.close();
RETURN_STATUS_UNEXPECTED("Failed to seekg file, file path: " + path);
RETURN_STATUS_UNEXPECTED("[Internal ERROR] Failed to seekg file, file path: " + path);
}
size_t file_size = fin.tellg();
if (file_size < kMinFileSize) {
fin.close();
RETURN_STATUS_UNEXPECTED("Invalid file content, file " + path + " size is smaller than the lower limit.");
RETURN_STATUS_UNEXPECTED("Invalid file, the size of mindrecord file: " + std::to_string(file_size) +
" is smaller than the lower limit: " + std::to_string(kMinFileSize) +
".\nPlease use 'FileWriter' to generate valid mindrecord files.");
}
fin.close();
return Status::OK();
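CheckFileStatus rejects anything below kMinFileSize, which is why the message steers users to FileWriter: a hand-written stub (the tests below write 'just for test' into the file) can never carry a valid header. A sketch of the failing pattern:

from mindspore.mindrecord import FileReader

# A few arbitrary bytes are smaller than the header's lower size limit, so
# opening the file raises "Invalid file, the size of mindrecord file: ...".
with open("fake.mindrecord", "w") as f:
    f.write("just for test")
FileReader("fake.mindrecord")  # raises RuntimeError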
@@ -86,18 +93,23 @@ Status ShardHeader::ValidateHeader(const std::string &path, std::shared_ptr<json
// read header size
json json_header;
std::ifstream fin(common::SafeCStr(path), std::ios::in | std::ios::binary);
CHECK_FAIL_RETURN_UNEXPECTED(fin.is_open(), "Failed to open file, file path: " + path);
CHECK_FAIL_RETURN_UNEXPECTED(fin.is_open(),
"Invalid file, failed to open files for loading mindrecord files. Please check file "
"path, permission and open file limit: " +
path);
uint64_t header_size = 0;
auto &io_read = fin.read(reinterpret_cast<char *>(&header_size), kInt64Len);
if (!io_read.good() || io_read.fail() || io_read.bad()) {
fin.close();
RETURN_STATUS_UNEXPECTED("Failed to read file, file path: " + path);
RETURN_STATUS_UNEXPECTED("[Internal ERROR] Failed to read file, file path: " + path);
}
if (header_size > kMaxHeaderSize) {
fin.close();
RETURN_STATUS_UNEXPECTED("Invalid file content, incorrect file or file header is exceeds the upper limit.");
RETURN_STATUS_UNEXPECTED(
"Invalid file, the size of mindrecord file header is larger than the upper limit.\nPlease use 'FileWriter' to "
"generate valid mindrecord files.");
}
// read header content
@@ -105,7 +117,7 @@ Status ShardHeader::ValidateHeader(const std::string &path, std::shared_ptr<json
auto &io_read_content = fin.read(reinterpret_cast<char *>(&header_content[0]), header_size);
if (!io_read_content.good() || io_read_content.fail() || io_read_content.bad()) {
fin.close();
RETURN_STATUS_UNEXPECTED("Failed to read file, file path: " + path);
RETURN_STATUS_UNEXPECTED("[Internal ERROR] Failed to read file, file path: " + path);
}
fin.close();
@@ -114,7 +126,8 @@ Status ShardHeader::ValidateHeader(const std::string &path, std::shared_ptr<json
try {
json_header = json::parse(raw_header_content);
} catch (json::parse_error &e) {
RETURN_STATUS_UNEXPECTED("Json parse failed: " + std::string(e.what()));
RETURN_STATUS_UNEXPECTED("[Internal ERROR] Failed to parse the metadata in JSON format in the mindrecord files: " +
std::string(e.what()));
}
*header_ptr = std::make_shared<json>(json_header);
return Status::OK();
@@ -165,7 +178,7 @@ Status ShardHeader::BuildDataset(const std::vector<std::string> &file_paths, boo
}
if (thread_status) {
thread_status = false;
RETURN_STATUS_UNEXPECTED("Error occurred in GetHeadersOneTask thread.");
RETURN_STATUS_UNEXPECTED("[Internal ERROR] Error raised in GetHeadersOneTask function.");
}
RETURN_IF_NOT_OK(InitializeHeader(headers, load_dataset));
return Status::OK();
@@ -186,8 +199,8 @@ void ShardHeader::GetHeadersOneTask(int start, int end, std::vector<json> &heade
(*header)["shard_addresses"] = realAddresses;
if (std::find(kSupportedVersion.begin(), kSupportedVersion.end(), (*header)["version"]) ==
kSupportedVersion.end()) {
MS_LOG(ERROR) << "Invalid version, file version " << (*header)["version"].dump() << " can not match lib version "
<< kVersion << ".";
MS_LOG(ERROR) << "Invalid file, the version of mindrecord files: " << (*header)["version"].dump()
<< " is not supported.\nPlease use 'FileWriter' to generate valid mindrecord files.";
thread_status = true;
return;
}
@@ -205,8 +218,8 @@ Status ShardHeader::InitByFiles(const std::vector<std::string> &file_paths) {
shard_addresses_ = std::move(file_names);
shard_count_ = file_paths.size();
CHECK_FAIL_RETURN_UNEXPECTED(shard_count_ != 0 && (shard_count_ <= kMaxShardCount),
"Invalid input, The number of MindRecord files " + std::to_string(shard_count_) +
"is not int range (0, " + std::to_string(kMaxShardCount) + "].");
"[Internal ERROR] 'shard_count_': " + std::to_string(shard_count_) +
" is not in range (0, " + std::to_string(kMaxShardCount) + "].");
pages_.resize(shard_count_);
return Status::OK();
}
@@ -225,9 +238,10 @@ Status ShardHeader::ParseIndexFields(const json &index_fields) {
Status ShardHeader::ParsePage(const json &pages, int shard_index, bool load_dataset) {
// set shard_index when load_dataset is false
CHECK_FAIL_RETURN_UNEXPECTED(shard_count_ <= kMaxFileCount, "Invalid input, The number of MindRecord files " +
std::to_string(shard_count_) + "is not int range (0, " +
std::to_string(kMaxFileCount) + "].");
CHECK_FAIL_RETURN_UNEXPECTED(shard_count_ <= kMaxFileCount,
"Invalid file, the number of mindrecord files: " + std::to_string(shard_count_) +
" is not in range (0, " + std::to_string(kMaxFileCount) +
"].\nPlease use 'FileWriter' to generate fewer mindrecord files.");
if (pages_.empty()) {
pages_.resize(shard_count_);
}
@@ -261,7 +275,7 @@ Status ShardHeader::ParseStatistics(const json &statistics) {
for (auto &statistic : statistics) {
CHECK_FAIL_RETURN_UNEXPECTED(
statistic.find("desc") != statistic.end() && statistic.find("statistics") != statistic.end(),
"Failed to deserialize statistics, statistic info: " + statistics.dump());
"[Internal ERROR] Failed to deserialize statistics: " + statistics.dump());
std::string statistic_description = statistic["desc"].get<std::string>();
json statistic_body = statistic["statistics"];
std::shared_ptr<Statistics> parsed_statistic = Statistics::Build(statistic_description, statistic_body);
@@ -276,7 +290,7 @@ Status ShardHeader::ParseSchema(const json &schemas) {
// change how we get schemaBody once design is finalized
CHECK_FAIL_RETURN_UNEXPECTED(schema.find("desc") != schema.end() && schema.find("blob_fields") != schema.end() &&
schema.find("schema") != schema.end(),
"Failed to deserialize schema, schema info: " + schema.dump());
"[Internal ERROR] Failed to deserialize schema: " + schema.dump());
std::string schema_description = schema["desc"].get<std::string>();
std::vector<std::string> blob_fields = schema["blob_fields"].get<std::vector<std::string>>();
json schema_body = schema["schema"];
@@ -373,7 +387,7 @@ Status ShardHeader::GetPage(const int &shard_id, const int &page_id, std::shared
return Status::OK();
}
page_ptr = nullptr;
RETURN_STATUS_UNEXPECTED("Failed to get Page, 'page_id': " + std::to_string(page_id));
RETURN_STATUS_UNEXPECTED("[Internal ERROR] Failed to get Page, 'page_id': " + std::to_string(page_id));
}
Status ShardHeader::SetPage(const std::shared_ptr<Page> &new_page) {
@@ -383,7 +397,7 @@ Status ShardHeader::SetPage(const std::shared_ptr<Page> &new_page) {
pages_[shard_id][page_id] = new_page;
return Status::OK();
}
RETURN_STATUS_UNEXPECTED("Failed to set Page, 'page_id': " + std::to_string(page_id));
RETURN_STATUS_UNEXPECTED("[Internal ERROR] Failed to set Page, 'page_id': " + std::to_string(page_id));
}
Status ShardHeader::AddPage(const std::shared_ptr<Page> &new_page) {
@@ -393,7 +407,7 @@ Status ShardHeader::AddPage(const std::shared_ptr<Page> &new_page) {
pages_[shard_id].push_back(new_page);
return Status::OK();
}
RETURN_STATUS_UNEXPECTED("Failed to add Page, 'page_id': " + std::to_string(page_id));
RETURN_STATUS_UNEXPECTED("[Internal ERROR] Failed to add Page, 'page_id': " + std::to_string(page_id));
}
int64_t ShardHeader::GetLastPageId(const int &shard_id) {
@@ -419,7 +433,10 @@ int ShardHeader::GetLastPageIdByType(const int &shard_id, const std::string &pag
Status ShardHeader::GetPageByGroupId(const int &group_id, const int &shard_id, std::shared_ptr<Page> *page_ptr) {
RETURN_UNEXPECTED_IF_NULL(page_ptr);
CHECK_FAIL_RETURN_UNEXPECTED(shard_id < static_cast<int>(pages_.size()), "Shard id is more than sum of shards.");
CHECK_FAIL_RETURN_UNEXPECTED(shard_id < static_cast<int>(pages_.size()),
"[Internal ERROR] 'shard_id': " + std::to_string(shard_id) +
" should be smaller than the size of 'pages_': " + std::to_string(pages_.size()) +
".");
for (uint64_t i = pages_[shard_id].size(); i >= 1; i--) {
auto page = pages_[shard_id][i - 1];
if (page->GetPageType() == kPageTypeBlob && page->GetPageTypeID() == group_id) {
@@ -428,17 +445,17 @@ Status ShardHeader::GetPageByGroupId(const int &group_id, const int &shard_id, s
}
}
page_ptr = nullptr;
RETURN_STATUS_UNEXPECTED("Failed to get Page, 'group_id': " + std::to_string(group_id));
RETURN_STATUS_UNEXPECTED("[Internal ERROR] Failed to get Page, 'group_id': " + std::to_string(group_id));
}
int ShardHeader::AddSchema(std::shared_ptr<Schema> schema) {
if (schema == nullptr) {
MS_LOG(ERROR) << "The pointer of schema is null.";
MS_LOG(ERROR) << "[Internal ERROR] The pointer of schema is NULL.";
return -1;
}
if (!schema_.empty()) {
MS_LOG(ERROR) << "The schema can not be added twice.";
MS_LOG(ERROR) << "The schema is added repeatedly. Please remove the redundant 'add_schema' function.";
return -1;
}
@@ -474,11 +491,16 @@ std::shared_ptr<Index> ShardHeader::InitIndexPtr() {
Status ShardHeader::CheckIndexField(const std::string &field, const json &schema) {
// check field name is or is not valid
CHECK_FAIL_RETURN_UNEXPECTED(schema.find(field) != schema.end(),
"Invalid input, field [" + field + "] can not found in schema.");
"Invalid input, 'index_fields': " + field + " can not be found in schema: " +
schema.dump() + ".\nPlease use 'add_index' function to add proper 'index_fields'.");
CHECK_FAIL_RETURN_UNEXPECTED(schema[field]["type"] != "Bytes",
"Invalid input, byte type field [" + field + "] can not set as an index field.");
"Invalid input, type of 'index_fields': " + field +
" is bytes and can not be set as an 'index_fields'.\nPlease use 'add_index' function to "
"add the other 'index_fields'.");
CHECK_FAIL_RETURN_UNEXPECTED(schema.find(field) == schema.end() || schema[field].find("shape") == schema[field].end(),
"Invalid input, array type field [" + field + "] can not set as an index field.");
"Invalid input, type of 'index_fields': " + field +
" is array and can not be set as an 'index_fields'.\nPlease use 'add_index' function to "
"add the other 'index_fields'.");
return Status::OK();
}
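The three checks mirror the constraints on add_index at the Python layer: an index field must exist in the schema, and bytes or array (shape) fields cannot be indexed. A sketch under those assumptions:

from mindspore.mindrecord import FileWriter

writer = FileWriter("indexed.mindrecord", shard_num=1)
writer.add_schema({"file_name": {"type": "string"},
                   "label": {"type": "int32"},
                   "mask": {"type": "int64", "shape": [-1]},
                   "data": {"type": "bytes"}}, "demo_schema")
writer.add_index(["file_name", "label"])  # OK: scalar string/int fields
# writer.add_index(["data"])  -> rejected: bytes field can not be an index field
# writer.add_index(["mask"])  -> rejected: array (shape) field can not be indexed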
@@ -486,7 +508,8 @@ Status ShardHeader::AddIndexFields(const std::vector<std::string> &fields) {
if (fields.empty()) {
return Status::OK();
}
CHECK_FAIL_RETURN_UNEXPECTED(!GetSchemas().empty(), "Invalid data, schema is empty.");
CHECK_FAIL_RETURN_UNEXPECTED(!GetSchemas().empty(),
"Invalid data, schema is empty. Please use 'add_schema' function to add schema first.");
// create index Object
std::shared_ptr<Index> index = InitIndexPtr();
for (const auto &schemaPtr : schema_) {
@@ -499,8 +522,9 @@ Status ShardHeader::AddIndexFields(const std::vector<std::string> &fields) {
field_set.insert(item.second);
}
for (const auto &field : fields) {
CHECK_FAIL_RETURN_UNEXPECTED(field_set.find(field) == field_set.end(),
"Invalid data, the same index field [" + field + "] can not added twice.");
CHECK_FAIL_RETURN_UNEXPECTED(
field_set.find(field) == field_set.end(),
"The 'index_fields': " + field + " is added repeatedly. Please remove the redundant 'add_index' function.");
// check field name is or is not valid
RETURN_IF_NOT_OK(CheckIndexField(field, schema));
field_set.insert(field);
@@ -517,7 +541,7 @@ Status ShardHeader::GetAllSchemaID(std::set<uint64_t> &bucket_count) {
for (const auto &schema : schema_) {
auto schema_id = schema->GetSchemaID();
CHECK_FAIL_RETURN_UNEXPECTED(bucket_count.find(schema_id) == bucket_count.end(),
"Invalid data, duplicate schema exist, schema id: " + std::to_string(schema_id));
"[Internal ERROR] duplicate schema exist, schema id: " + std::to_string(schema_id));
bucket_count.insert(schema_id);
}
return Status::OK();
@@ -539,19 +563,21 @@ Status ShardHeader::AddIndexFields(std::vector<std::pair<uint64_t, std::string>>
}
for (const auto &field : fields) {
CHECK_FAIL_RETURN_UNEXPECTED(field_set.find(field) == field_set.end(),
"Invalid data, the same index field [" + field.second + "] can not added twice.");
"The 'index_fields': " + field.second +
" is added repeatedly. Please remove the redundant 'add_index' function.");
uint64_t schema_id = field.first;
std::string field_name = field.second;
// check schemaId is or is not valid
CHECK_FAIL_RETURN_UNEXPECTED(bucket_count.find(schema_id) != bucket_count.end(),
"Invalid data, schema id [" + std::to_string(schema_id) + "] is invalid.");
"[Internal ERROR] 'schema_id': " + std::to_string(schema_id) + " can not be found.");
// check field name is or is not valid
std::shared_ptr<Schema> schema_ptr;
RETURN_IF_NOT_OK(GetSchemaByID(schema_id, &schema_ptr));
json schema = schema_ptr->GetSchema().at("schema");
CHECK_FAIL_RETURN_UNEXPECTED(schema.find(field_name) != schema.end(),
"Invalid data, field [" + field_name + "] is not found in schema.");
"Invalid input, 'index_fields': " + field_name + " can not be found in schema: " +
schema.dump() + ".\nPlease use 'add_index' function to add proper 'index_fields'.");
RETURN_IF_NOT_OK(CheckIndexField(field_name, schema));
field_set.insert(field);
// add field into index
@@ -580,7 +606,7 @@ Status ShardHeader::GetSchemaByID(int64_t schema_id, std::shared_ptr<Schema> *sc
RETURN_UNEXPECTED_IF_NULL(schema_ptr);
int64_t schema_size = schema_.size();
CHECK_FAIL_RETURN_UNEXPECTED(schema_id >= 0 && schema_id < schema_size,
"Invalid data, schema id [" + std::to_string(schema_id) + "] is not in range [0, " +
"[Internal ERROR] 'schema_id': " + std::to_string(schema_id) + " is not in range [0, " +
std::to_string(schema_size) + ").");
*schema_ptr = schema_.at(schema_id);
return Status::OK();
@@ -590,18 +616,20 @@ Status ShardHeader::GetStatisticByID(int64_t statistic_id, std::shared_ptr<Stati
RETURN_UNEXPECTED_IF_NULL(statistics_ptr);
int64_t statistics_size = statistics_.size();
CHECK_FAIL_RETURN_UNEXPECTED(statistic_id >= 0 && statistic_id < statistics_size,
"Invalid data, statistic id [" + std::to_string(statistic_id) +
"] is not in range [0, " + std::to_string(statistics_size) + ").");
"[Internal ERROR] 'statistic_id': " + std::to_string(statistic_id) +
" is not in range [0, " + std::to_string(statistics_size) + ").");
*statistics_ptr = statistics_.at(statistic_id);
return Status::OK();
}
Status ShardHeader::PagesToFile(const std::string dump_file_name) {
auto realpath = FileUtils::GetRealPath(dump_file_name.data());
CHECK_FAIL_RETURN_UNEXPECTED(realpath.has_value(), "Failed to get real path, path: " + dump_file_name);
CHECK_FAIL_RETURN_UNEXPECTED(realpath.has_value(),
"[Internal ERROR] Failed to get the realpath of Pages file, path: " + dump_file_name);
// write header content to file, dump whatever is in the file before
std::ofstream page_out_handle(realpath.value(), std::ios_base::trunc | std::ios_base::out);
CHECK_FAIL_RETURN_UNEXPECTED(page_out_handle.good(), "Failed to open page file, path: " + dump_file_name);
CHECK_FAIL_RETURN_UNEXPECTED(page_out_handle.good(),
"[Internal ERROR] Failed to open Pages file, path: " + dump_file_name);
auto pages = SerializePage();
for (const auto &shard_pages : pages) {
page_out_handle << shard_pages << "\n";
@@ -615,11 +643,12 @@ Status ShardHeader::FileToPages(const std::string dump_file_name) {
v.clear();
}
auto realpath = FileUtils::GetRealPath(dump_file_name.data());
CHECK_FAIL_RETURN_UNEXPECTED(realpath.has_value(), "Failed to get real path, path: " + dump_file_name);
CHECK_FAIL_RETURN_UNEXPECTED(realpath.has_value(),
"[Internal ERROR] Failed to get the realpath of Pages file, path: " + dump_file_name);
// attempt to open the file contains the page in json
std::ifstream page_in_handle(realpath.value());
CHECK_FAIL_RETURN_UNEXPECTED(page_in_handle.good(),
"Invalid file, page file does not exist, path: " + dump_file_name);
"[Internal ERROR] Pages file does not exist, path: " + dump_file_name);
std::string line;
while (std::getline(page_in_handle, line)) {
RETURN_IF_NOT_OK(ParsePage(json::parse(line), -1, true));
@@ -633,7 +662,8 @@ Status ShardHeader::Initialize(const std::shared_ptr<ShardHeader> *header_ptr, c
uint64_t &schema_id) {
RETURN_UNEXPECTED_IF_NULL(header_ptr);
auto schema_ptr = Schema::Build("mindrecord", schema);
CHECK_FAIL_RETURN_UNEXPECTED(schema_ptr != nullptr, "Failed to build schema: " + schema.dump() + ".");
CHECK_FAIL_RETURN_UNEXPECTED(schema_ptr != nullptr,
"[Internal ERROR] Failed to build schema: " + schema.dump() + ".");
schema_id = (*header_ptr)->AddSchema(schema_ptr);
// create index
std::vector<std::pair<uint64_t, std::string>> id_index_fields;
@@ -100,8 +100,8 @@ def test_invalid_mindrecord():
f.write('just for test')
columns_list = ["data", "file_name", "label"]
num_readers = 4
with pytest.raises(RuntimeError, match="Unexpected error. Invalid file "
"content, incorrect file or file header is exceeds the upper limit."):
with pytest.raises(RuntimeError, match="Unexpected error. Invalid file, the size of mindrecord file header "
"is larger than the upper limit."):
data_set = ds.MindDataset(file_name, columns_list, num_readers)
for _ in data_set.create_dict_iterator(num_epochs=1, output_numpy=True):
pass
@@ -123,7 +123,7 @@ def test_cifar100_to_mindrecord_directory(fixture_file):
when destination path is directory.
"""
with pytest.raises(RuntimeError,
match="Invalid file, Mindrecord files already existed in path:"):
match="Invalid file, mindrecord files already exist. Please check file path:"):
cifar100_transformer = Cifar100ToMR(CIFAR100_DIR,
CIFAR100_DIR)
cifar100_transformer.transform()
@@ -134,7 +134,7 @@ def test_cifar100_to_mindrecord_filename_equals_cifar100(fixture_file):
when destination path equals source path.
"""
with pytest.raises(RuntimeError,
match="Invalid file, Mindrecord files already existed in path:"):
match="Invalid file, mindrecord files already exist. Please check file path:"):
cifar100_transformer = Cifar100ToMR(CIFAR100_DIR,
CIFAR100_DIR + "/train")
cifar100_transformer.transform()
@@ -135,7 +135,7 @@ def test_cifar10_to_mindrecord_directory(fixture_file):
when destination path is directory.
"""
with pytest.raises(RuntimeError,
match="Unexpected error. Invalid file, Mindrecord files already existed in path:"):
match="Unexpected error. Invalid file, mindrecord files already exist. Please check file path:"):
cifar10_transformer = Cifar10ToMR(CIFAR10_DIR, CIFAR10_DIR)
cifar10_transformer.transform()
@@ -146,7 +146,7 @@ def test_cifar10_to_mindrecord_filename_equals_cifar10():
when destination path equals source path.
"""
with pytest.raises(RuntimeError,
match="Unexpected error. Invalid file, Mindrecord files already existed in path:"):
match="Unexpected error. Invalid file, mindrecord files already exist. Please check file path:"):
cifar10_transformer = Cifar10ToMR(CIFAR10_DIR,
CIFAR10_DIR + "/data_batch_0")
cifar10_transformer.transform()
@@ -1224,7 +1224,7 @@ def test_cv_file_overwrite_exception_01():
"label": {"type": "int64"}, "data": {"type": "bytes"}}
writer.add_schema(cv_schema_json, "img_schema")
writer.write_raw_data(data)
assert 'Unexpected error. Invalid file, Mindrecord files already existed in path:' in str(err.value)
assert 'Unexpected error. Invalid file, mindrecord files already exist. Please check file path:' in str(err.value)
remove_multi_files(mindrecord_file_name, FILES_NUM)
def test_cv_file_overwrite_exception_02():
@@ -1243,5 +1243,5 @@ def test_cv_file_overwrite_exception_02():
"label": {"type": "int64"}, "data": {"type": "bytes"}}
writer.add_schema(cv_schema_json, "img_schema")
writer.write_raw_data(data)
assert 'Unexpected error. Invalid file, Mindrecord files already existed in path:' in str(err.value)
assert 'Unexpected error. Invalid file, mindrecord files already exist. Please check file path:' in str(err.value)
remove_multi_files(mindrecord_file_name, FILES_NUM)
@@ -234,7 +234,7 @@ def test_invalid_mindrecord():
f.write(dummy)
with pytest.raises(RuntimeError) as err:
FileReader(file_name)
assert "Unexpected error. Invalid file content, incorrect file or file header" in str(err.value)
assert "Invalid file, the size of mindrecord file header is larger than the upper limit." in str(err.value)
remove_file(file_name)
def test_invalid_db():
@@ -264,7 +264,7 @@ def test_overwrite_invalid_mindrecord():
f.write('just for test')
with pytest.raises(RuntimeError) as err:
create_cv_mindrecord(1, file_name)
assert 'Unexpected error. Invalid file, Mindrecord files already existed in path:' in str(err.value)
assert 'Unexpected error. Invalid file, mindrecord files already exist. Please check file path:' in str(err.value)
remove_file(file_name)
def test_overwrite_invalid_db():
@@ -278,7 +278,7 @@ def test_overwrite_invalid_db():
f.write('just for test')
with pytest.raises(RuntimeError) as err:
create_cv_mindrecord(1, file_name)
assert 'Unexpected error. Invalid file, Mindrecord files already existed in path:' in str(err.value)
assert 'Unexpected error. Invalid file, mindrecord files already exist. Please check file path:' in str(err.value)
remove_file(file_name)
def test_read_after_close():
@@ -560,7 +560,8 @@ def test_write_with_invalid_data():
mindrecord_file_name = os.environ.get('PYTEST_CURRENT_TEST').split(':')[-1].split(' ')[0]
# field: file_name => filename
| with pytest.raises(RuntimeError, match="Unexpected error. Invalid data, schema count should be positive."): | |||
| with pytest.raises(RuntimeError, match="Unexpected error. Invalid data, " \ | |||
| "the number of schema should be positive but got:"): | |||
| remove_one_file(mindrecord_file_name) | |||
| remove_one_file(mindrecord_file_name + ".db") | |||
@@ -594,43 +595,9 @@ def test_write_with_invalid_data():
writer.write_raw_data(data)
writer.commit()
# field: mask => masks
with pytest.raises(RuntimeError, match="Unexpected error. Invalid data, schema count should be positive."):
remove_one_file(mindrecord_file_name)
remove_one_file(mindrecord_file_name + ".db")
data = [{"file_name": "001.jpg", "label": 43, "score": 0.8, "masks": np.array([3, 6, 9], dtype=np.int64),
"segments": np.array([[5.0, 1.6], [65.2, 8.3]], dtype=np.float32),
"data": bytes("image bytes abc", encoding='UTF-8')},
{"file_name": "002.jpg", "label": 91, "score": 5.4, "masks": np.array([1, 4, 7], dtype=np.int64),
"segments": np.array([[5.1, 9.1], [2.0, 65.4]], dtype=np.float32),
"data": bytes("image bytes def", encoding='UTF-8')},
{"file_name": "003.jpg", "label": 61, "score": 6.4, "masks": np.array([7, 6, 3], dtype=np.int64),
"segments": np.array([[0.0, 5.6], [3.0, 16.3]], dtype=np.float32),
"data": bytes("image bytes ghi", encoding='UTF-8')},
{"file_name": "004.jpg", "label": 29, "score": 8.1, "masks": np.array([2, 8, 0], dtype=np.int64),
"segments": np.array([[5.9, 7.2], [4.0, 89.0]], dtype=np.float32),
"data": bytes("image bytes jkl", encoding='UTF-8')},
{"file_name": "005.jpg", "label": 78, "score": 7.7, "masks": np.array([3, 1, 2], dtype=np.int64),
"segments": np.array([[0.6, 8.1], [5.3, 49.3]], dtype=np.float32),
"data": bytes("image bytes mno", encoding='UTF-8')},
{"file_name": "006.jpg", "label": 37, "score": 9.4, "masks": np.array([7, 6, 7], dtype=np.int64),
"segments": np.array([[4.2, 6.3], [8.9, 81.8]], dtype=np.float32),
"data": bytes("image bytes pqr", encoding='UTF-8')}
]
writer = FileWriter(mindrecord_file_name)
schema = {"file_name": {"type": "string"},
"label": {"type": "int32"},
"score": {"type": "float64"},
"mask": {"type": "int64", "shape": [-1]},
"segments": {"type": "float32", "shape": [2, 2]},
"data": {"type": "bytes"}}
writer.add_schema(schema, "data is so cool")
writer.write_raw_data(data)
writer.commit()
# field: data => image
with pytest.raises(RuntimeError, match="Unexpected error. Invalid data, schema count should be positive."):
| with pytest.raises(RuntimeError, match="Unexpected error. Invalid data, " \ | |||
| "the number of schema should be positive but got:"): | |||
| remove_one_file(mindrecord_file_name) | |||
| remove_one_file(mindrecord_file_name + ".db") | |||
| @@ -664,78 +631,9 @@ def test_write_with_invalid_data(): | |||
| writer.write_raw_data(data) | |||
| writer.commit() | |||
| # field: label => labels | |||
| with pytest.raises(RuntimeError, match="Unexpected error. Invalid data, schema count should be positive."): | |||
| remove_one_file(mindrecord_file_name) | |||
| remove_one_file(mindrecord_file_name + ".db") | |||
| data = [{"file_name": "001.jpg", "labels": 43, "score": 0.8, "mask": np.array([3, 6, 9], dtype=np.int64), | |||
| "segments": np.array([[5.0, 1.6], [65.2, 8.3]], dtype=np.float32), | |||
| "data": bytes("image bytes abc", encoding='UTF-8')}, | |||
| {"file_name": "002.jpg", "labels": 91, "score": 5.4, "mask": np.array([1, 4, 7], dtype=np.int64), | |||
| "segments": np.array([[5.1, 9.1], [2.0, 65.4]], dtype=np.float32), | |||
| "data": bytes("image bytes def", encoding='UTF-8')}, | |||
| {"file_name": "003.jpg", "labels": 61, "score": 6.4, "mask": np.array([7, 6, 3], dtype=np.int64), | |||
| "segments": np.array([[0.0, 5.6], [3.0, 16.3]], dtype=np.float32), | |||
| "data": bytes("image bytes ghi", encoding='UTF-8')}, | |||
| {"file_name": "004.jpg", "labels": 29, "score": 8.1, "mask": np.array([2, 8, 0], dtype=np.int64), | |||
| "segments": np.array([[5.9, 7.2], [4.0, 89.0]], dtype=np.float32), | |||
| "data": bytes("image bytes jkl", encoding='UTF-8')}, | |||
| {"file_name": "005.jpg", "labels": 78, "score": 7.7, "mask": np.array([3, 1, 2], dtype=np.int64), | |||
| "segments": np.array([[0.6, 8.1], [5.3, 49.3]], dtype=np.float32), | |||
| "data": bytes("image bytes mno", encoding='UTF-8')}, | |||
| {"file_name": "006.jpg", "labels": 37, "score": 9.4, "mask": np.array([7, 6, 7], dtype=np.int64), | |||
| "segments": np.array([[4.2, 6.3], [8.9, 81.8]], dtype=np.float32), | |||
| "data": bytes("image bytes pqr", encoding='UTF-8')} | |||
| ] | |||
| writer = FileWriter(mindrecord_file_name) | |||
| schema = {"file_name": {"type": "string"}, | |||
| "label": {"type": "int32"}, | |||
| "score": {"type": "float64"}, | |||
| "mask": {"type": "int64", "shape": [-1]}, | |||
| "segments": {"type": "float32", "shape": [2, 2]}, | |||
| "data": {"type": "bytes"}} | |||
| writer.add_schema(schema, "data is so cool") | |||
| writer.write_raw_data(data) | |||
| writer.commit() | |||
| # field: score => scores | |||
| with pytest.raises(RuntimeError, match="Unexpected error. Invalid data, schema count should be positive."): | |||
| remove_one_file(mindrecord_file_name) | |||
| remove_one_file(mindrecord_file_name + ".db") | |||
| data = [{"file_name": "001.jpg", "label": 43, "scores": 0.8, "mask": np.array([3, 6, 9], dtype=np.int64), | |||
| "segments": np.array([[5.0, 1.6], [65.2, 8.3]], dtype=np.float32), | |||
| "data": bytes("image bytes abc", encoding='UTF-8')}, | |||
| {"file_name": "002.jpg", "label": 91, "scores": 5.4, "mask": np.array([1, 4, 7], dtype=np.int64), | |||
| "segments": np.array([[5.1, 9.1], [2.0, 65.4]], dtype=np.float32), | |||
| "data": bytes("image bytes def", encoding='UTF-8')}, | |||
| {"file_name": "003.jpg", "label": 61, "scores": 6.4, "mask": np.array([7, 6, 3], dtype=np.int64), | |||
| "segments": np.array([[0.0, 5.6], [3.0, 16.3]], dtype=np.float32), | |||
| "data": bytes("image bytes ghi", encoding='UTF-8')}, | |||
| {"file_name": "004.jpg", "label": 29, "scores": 8.1, "mask": np.array([2, 8, 0], dtype=np.int64), | |||
| "segments": np.array([[5.9, 7.2], [4.0, 89.0]], dtype=np.float32), | |||
| "data": bytes("image bytes jkl", encoding='UTF-8')}, | |||
| {"file_name": "005.jpg", "label": 78, "scores": 7.7, "mask": np.array([3, 1, 2], dtype=np.int64), | |||
| "segments": np.array([[0.6, 8.1], [5.3, 49.3]], dtype=np.float32), | |||
| "data": bytes("image bytes mno", encoding='UTF-8')}, | |||
| {"file_name": "006.jpg", "label": 37, "scores": 9.4, "mask": np.array([7, 6, 7], dtype=np.int64), | |||
| "segments": np.array([[4.2, 6.3], [8.9, 81.8]], dtype=np.float32), | |||
| "data": bytes("image bytes pqr", encoding='UTF-8')} | |||
| ] | |||
| writer = FileWriter(mindrecord_file_name) | |||
| schema = {"file_name": {"type": "string"}, | |||
| "label": {"type": "int32"}, | |||
| "score": {"type": "float64"}, | |||
| "mask": {"type": "int64", "shape": [-1]}, | |||
| "segments": {"type": "float32", "shape": [2, 2]}, | |||
| "data": {"type": "bytes"}} | |||
| writer.add_schema(schema, "data is so cool") | |||
| writer.write_raw_data(data) | |||
| writer.commit() | |||
| # string type with int value | |||
| with pytest.raises(RuntimeError, match="Unexpected error. Invalid data, schema count should be positive."): | |||
| with pytest.raises(RuntimeError, match="Unexpected error. Invalid data, " \ | |||
| "the number of schema should be positive but got:"): | |||
| remove_one_file(mindrecord_file_name) | |||
| remove_one_file(mindrecord_file_name + ".db") | |||
| @@ -770,7 +668,8 @@ def test_write_with_invalid_data(): | |||
| writer.commit() | |||
| # field with int64 type, but the real data is string | |||
| with pytest.raises(RuntimeError, match="Unexpected error. Invalid data, schema count should be positive."): | |||
| with pytest.raises(RuntimeError, match="Unexpected error. Invalid data, " \ | |||
| "the number of schema should be positive but got:"): | |||
| remove_one_file(mindrecord_file_name) | |||
| remove_one_file(mindrecord_file_name + ".db") | |||
| @@ -805,7 +704,8 @@ def test_write_with_invalid_data(): | |||
| writer.commit() | |||
| # bytes field is string | |||
| with pytest.raises(RuntimeError, match="Unexpected error. Invalid data, schema count should be positive."): | |||
| with pytest.raises(RuntimeError, match="Unexpected error. Invalid data, " \ | |||
| "the number of schema should be positive but got:"): | |||
| remove_one_file(mindrecord_file_name) | |||
| remove_one_file(mindrecord_file_name + ".db") | |||
| @@ -840,7 +740,8 @@ def test_write_with_invalid_data(): | |||
| writer.commit() | |||
| # field is not numpy type | |||
| with pytest.raises(RuntimeError, match="Unexpected error. Invalid data, schema count should be positive."): | |||
| with pytest.raises(RuntimeError, match="Unexpected error. Invalid data, " \ | |||
| "the number of schema should be positive but got:"): | |||
| remove_one_file(mindrecord_file_name) | |||
| remove_one_file(mindrecord_file_name + ".db") | |||
| @@ -875,7 +776,8 @@ def test_write_with_invalid_data(): | |||
| writer.commit() | |||
| # not enough field | |||
| with pytest.raises(RuntimeError, match="Unexpected error. Invalid data, schema count should be positive."): | |||
| with pytest.raises(RuntimeError, match="Unexpected error. Invalid data, " \ | |||
| "the number of schema should be positive but got:"): | |||
| remove_one_file(mindrecord_file_name) | |||
| remove_one_file(mindrecord_file_name + ".db") | |||