Merge pull request !26611 from liyong126/fix_mindrecord_log_msg
@@ -44,19 +44,22 @@ Status ShardWriter::GetFullPathFromFileName(const std::vector<std::string> &path
// Get full path from file name
for (const auto &path : paths) {
CHECK_FAIL_RETURN_UNEXPECTED(CheckIsValidUtf8(path),
"Invalid data, file name: " + path + " contains invalid uft-8 character.");
"Invalid file, mindrecord file name: " + path +
" contains invalid UTF-8 characters. Please rename the mindrecord file.");
char resolved_path[PATH_MAX] = {0};
char buf[PATH_MAX] = {0};
CHECK_FAIL_RETURN_UNEXPECTED(strncpy_s(buf, PATH_MAX, common::SafeCStr(path), path.length()) == EOK,
"Failed to call securec func [strncpy_s], path: " + path);
"[Internal ERROR] Failed to call securec func [strncpy_s], path: " + path);
#if defined(_WIN32) || defined(_WIN64)
RETURN_UNEXPECTED_IF_NULL(_fullpath(resolved_path, dirname(&(buf[0])), PATH_MAX));
RETURN_UNEXPECTED_IF_NULL(_fullpath(resolved_path, common::SafeCStr(path), PATH_MAX));
#else
CHECK_FAIL_RETURN_UNEXPECTED(realpath(dirname(&(buf[0])), resolved_path) != nullptr,
"Invalid file, path: " + std::string(resolved_path));
CHECK_FAIL_RETURN_UNEXPECTED(
realpath(dirname(&(buf[0])), resolved_path) != nullptr,
"Invalid file, failed to get the realpath of mindrecord files. Please check file path: " +
std::string(resolved_path));
if (realpath(common::SafeCStr(path), resolved_path) == nullptr) {
MS_LOG(DEBUG) << "Path: " << common::SafeCStr(path) << "check success.";
MS_LOG(DEBUG) << "Succeed to check path: " << common::SafeCStr(path);
}
#endif
file_paths_.emplace_back(string(resolved_path));
@@ -75,7 +78,8 @@ Status ShardWriter::OpenDataFiles(bool append, bool overwrite) {
}
auto realpath = FileUtils::GetRealPath(dir.value().data());
CHECK_FAIL_RETURN_UNEXPECTED(realpath.has_value(), "Failed to get real path, path: " + file);
CHECK_FAIL_RETURN_UNEXPECTED(
realpath.has_value(), "Invalid file, failed to get the realpath of mindrecord files. Please check file: " + file);
std::optional<std::string> whole_path = "";
FileUtils::ConcatDirAndFileName(&realpath, &local_file_name, &whole_path);
@@ -91,19 +95,25 @@ Status ShardWriter::OpenDataFiles(bool append, bool overwrite) {
if (overwrite) {
auto res1 = std::remove(whole_path.value().c_str());
CHECK_FAIL_RETURN_UNEXPECTED(!std::ifstream(whole_path.value()) == true,
"Failed to delete file, path: " + file);
"Invalid file, failed to remove the old files when trying to overwrite "
"mindrecord files. Please check file path and permission: " +
file);
if (res1 == 0) {
MS_LOG(WARNING) << "Succeed to delete file, path: " << file;
MS_LOG(WARNING) << "Succeed to remove the old mindrecord files, path: " << file;
}
auto db_file = whole_path.value() + ".db";
auto res2 = std::remove(db_file.c_str());
CHECK_FAIL_RETURN_UNEXPECTED(!std::ifstream(whole_path.value() + ".db") == true,
"Failed to delete db file, path: " + file + ".db");
"Invalid file, failed to remove the old mindrecord meta files when trying to "
"overwrite mindrecord files. Please check file path and permission: " +
file + ".db");
if (res2 == 0) {
MS_LOG(WARNING) << "Succeed to delete metadata file, path: " << file + ".db";
MS_LOG(WARNING) << "Succeed to remove the old mindrecord metadata files, path: " << file + ".db";
}
} else {
RETURN_STATUS_UNEXPECTED("Invalid file, Mindrecord files already existed in path: " + file);
RETURN_STATUS_UNEXPECTED(
"Invalid file, mindrecord files already exist. Please check file path: " + file +
".\nIf you do not want to keep the files, set the 'overwrite' parameter to True and try again.");
}
} else {
fs->close();
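For reference, the 'overwrite' parameter named in the new message corresponds to a flag on the Python-level FileWriter; a minimal sketch, assuming a FileWriter constructor that accepts an overwrite keyword as this error string implies:

from mindspore.mindrecord import FileWriter

# With overwrite=True, OpenDataFiles removes any existing .mindrecord and .db
# files instead of raising "mindrecord files already exist".
writer = FileWriter(file_name="test.mindrecord", shard_num=1, overwrite=True)
writer.add_schema({"label": {"type": "int32"}, "data": {"type": "bytes"}}, "demo_schema")
writer.write_raw_data([{"label": 1, "data": b"image bytes abc"}])
writer.commit()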
@@ -112,17 +122,23 @@ Status ShardWriter::OpenDataFiles(bool append, bool overwrite) {
// open the mindrecord file to write
fs->open(common::SafeCStr(file), std::ios::out | std::ios::in | std::ios::binary | std::ios::trunc);
if (!fs->good()) {
RETURN_STATUS_UNEXPECTED("Failed to open file, path: " + file);
RETURN_STATUS_UNEXPECTED(
"Invalid file, failed to open files for writing mindrecord files. Please check file path, permission and "
"open file limit: " +
file);
}
} else {
// open the mindrecord file to append
fs->open(common::SafeCStr(file), std::ios::out | std::ios::in | std::ios::binary);
if (!fs->good()) {
fs->close();
RETURN_STATUS_UNEXPECTED("Failed to open file for append data, path: " + file);
RETURN_STATUS_UNEXPECTED(
"Invalid file, failed to open files for appending mindrecord files. Please check file path, permission and "
"open file limit: " +
file);
}
}
MS_LOG(INFO) << "Succeed to open shard file, path: " << file;
MS_LOG(INFO) << "Succeed to open mindrecord shard file, path: " << file;
file_streams_.push_back(fs);
}
return Status::OK();
@@ -143,7 +159,7 @@ Status ShardWriter::RemoveLockFile() {
}
Status ShardWriter::InitLockFile() {
CHECK_FAIL_RETURN_UNEXPECTED(file_paths_.size() != 0, "Invalid data, file_paths_ is not initialized.");
CHECK_FAIL_RETURN_UNEXPECTED(file_paths_.size() != 0, "[Internal ERROR] 'file_paths_' is not initialized.");
lock_file_ = file_paths_[0] + kLockFileSuffix;
pages_file_ = file_paths_[0] + kPageFileSuffix;
@@ -154,8 +170,8 @@ Status ShardWriter::InitLockFile() {
Status ShardWriter::Open(const std::vector<std::string> &paths, bool append, bool overwrite) {
shard_count_ = paths.size();
CHECK_FAIL_RETURN_UNEXPECTED(schema_count_ <= kMaxSchemaCount,
"Invalid data, schema_count_ must be less than or equal to " +
std::to_string(kMaxSchemaCount) + ", but got " + std::to_string(schema_count_));
"[Internal ERROR] 'schema_count_' must be less than or equal to " +
std::to_string(kMaxSchemaCount) + ", but got: " + std::to_string(schema_count_));
// Get full path from file name
RETURN_IF_NOT_OK(GetFullPathFromFileName(paths));
@@ -167,7 +183,8 @@ Status ShardWriter::Open(const std::vector<std::string> &paths, bool append, boo
}
Status ShardWriter::OpenForAppend(const std::string &path) {
CHECK_FAIL_RETURN_UNEXPECTED(IsLegalFile(path), "Invalid file, path: " + path);
CHECK_FAIL_RETURN_UNEXPECTED(
IsLegalFile(path), "Invalid file, failed to verify files for appending mindrecord files. Please check file: " + path);
std::shared_ptr<json> header_ptr;
RETURN_IF_NOT_OK(ShardHeader::BuildSingleHeader(path, &header_ptr));
auto ds = std::make_shared<std::vector<std::string>>();
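OpenForAppend backs the Python-side append flow; a short sketch of the call this check guards, assuming the FileWriter.open_for_append classmethod:

from mindspore.mindrecord import FileWriter

# Reopens an existing, valid mindrecord file; OpenForAppend first verifies the
# file (IsLegalFile) and rebuilds the shard header from it before appending.
writer = FileWriter.open_for_append("test.mindrecord")
writer.write_raw_data([{"label": 2, "data": b"image bytes def"}])
writer.commit()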
@@ -262,7 +279,9 @@ void ShardWriter::DeleteErrorData(std::map<uint64_t, std::vector<json>> &raw_dat
for (auto &subMg : sub_err_mg) {
int loc = subMg.first;
std::string message = subMg.second;
MS_LOG(ERROR) << "Invalid input, the " << loc + 1 << " th data is invalid, " << message;
MS_LOG(ERROR) << "Invalid input, the " << loc + 1
<< " th data provided by user is invalid while writing mindrecord files. Please fix the error: "
<< message;
(void)delete_set.insert(loc);
}
}
@@ -299,8 +318,8 @@ Status ShardWriter::CheckDataTypeAndValue(const std::string &key, const json &va
(data_type == "int64" && !data[key].is_number_integer()) ||
(data_type == "float32" && !data[key].is_number_float()) ||
(data_type == "float64" && !data[key].is_number_float()) || (data_type == "string" && !data[key].is_string())) {
std::string message =
"field: " + key + " ,type : " + data_type + " ,value: " + data[key].dump() + " is not matched.";
std::string message = "Invalid input, for field: " + key + ", type: " + data_type +
" and value: " + data[key].dump() + " do not match while writing mindrecord files.";
PopulateMutexErrorData(i, message, err_raw_data);
RETURN_STATUS_UNEXPECTED(message);
}
@@ -309,8 +328,8 @@ Status ShardWriter::CheckDataTypeAndValue(const std::string &key, const json &va
int64_t temp_value = data[key];
if (static_cast<int64_t>(temp_value) < static_cast<int64_t>(std::numeric_limits<int32_t>::min()) ||
static_cast<int64_t>(temp_value) > static_cast<int64_t>(std::numeric_limits<int32_t>::max())) {
std::string message =
"field: " + key + " ,type : " + data_type + " ,value: " + data[key].dump() + " is out of range.";
std::string message = "Invalid input, for field: " + key + " and its type: " + data_type +
", value: " + data[key].dump() + " is out of range while writing mindrecord files.";
PopulateMutexErrorData(i, message, err_raw_data);
RETURN_STATUS_UNEXPECTED(message);
}
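CheckDataTypeAndValue verifies each raw-data value against its schema type and rejects int32 values outside the int32 range. A sketch of a record that passes both checks, reusing the schema shape from the tests further below (field names are illustrative):

import numpy as np

schema = {"file_name": {"type": "string"},
          "label": {"type": "int32"},  # must stay within [-2**31, 2**31 - 1]
          "score": {"type": "float64"},
          "mask": {"type": "int64", "shape": [-1]},
          "data": {"type": "bytes"}}
row = {"file_name": "001.jpg", "label": 43, "score": 0.8,
       "mask": np.array([3, 6, 9], dtype=np.int64),
       "data": bytes("image bytes abc", encoding='UTF-8')}
# A type mismatch (e.g. label=0.5) triggers the "do not match" message above;
# an int32 overflow (e.g. label=2**40) triggers the "out of range" message.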
@@ -366,7 +385,7 @@ Status ShardWriter::CheckData(const std::map<uint64_t, std::vector<json>> &raw_d
// calculate start position and end position for each thread
int batch_size = rawdata_iter->second.size() / shard_count_;
int thread_num = shard_count_;
CHECK_FAIL_RETURN_UNEXPECTED(thread_num > 0, "Invalid data, thread_num should be positive.");
CHECK_FAIL_RETURN_UNEXPECTED(thread_num > 0, "[Internal ERROR] 'thread_num' should be positive.");
if (thread_num > kMaxThreadCount) {
thread_num = kMaxThreadCount;
}
@@ -387,7 +406,7 @@ Status ShardWriter::CheckData(const std::map<uint64_t, std::vector<json>> &raw_d
}
CHECK_FAIL_RETURN_UNEXPECTED(
thread_num <= kMaxThreadCount,
"Invalid data, thread_num should be less than or equal to " + std::to_string(kMaxThreadCount));
"[Internal ERROR] 'thread_num' should be less than or equal to " + std::to_string(kMaxThreadCount));
// Wait for threads done
for (int x = 0; x < thread_num; ++x) {
thread_set[x].join();
@@ -404,7 +423,8 @@ Status ShardWriter::ValidateRawData(std::map<uint64_t, std::vector<json>> &raw_d
RETURN_UNEXPECTED_IF_NULL(count_ptr);
auto rawdata_iter = raw_data.begin();
schema_count_ = raw_data.size();
CHECK_FAIL_RETURN_UNEXPECTED(schema_count_ > 0, "Invalid data, schema count should be positive.");
CHECK_FAIL_RETURN_UNEXPECTED(schema_count_ > 0, "Invalid data, the number of schemas should be positive but got: " +
std::to_string(schema_count_) + ". Please check the input schema.");
// keep schema_id
std::set<int64_t> schema_ids;
@@ -412,17 +432,18 @@ Status ShardWriter::ValidateRawData(std::map<uint64_t, std::vector<json>> &raw_d
// Determine if the number of schemas is the same
CHECK_FAIL_RETURN_UNEXPECTED(shard_header_->GetSchemas().size() == schema_count_,
"Invalid data, schema count: " + std::to_string(schema_count_) + " is not matched.");
"[Internal ERROR] 'schema_count_' and the schema count in the header: " +
std::to_string(schema_count_) + " do not match.");
// Determine raw_data size == blob_data size
CHECK_FAIL_RETURN_UNEXPECTED(raw_data[0].size() == blob_data.size(),
"Invalid data, raw data size: " + std::to_string(raw_data[0].size()) +
"[Internal ERROR] raw data size: " + std::to_string(raw_data[0].size()) +
" is not equal to blob data size: " + std::to_string(blob_data.size()) + ".");
// Determine whether the number of samples corresponding to each schema is the same
for (rawdata_iter = raw_data.begin(); rawdata_iter != raw_data.end(); ++rawdata_iter) {
CHECK_FAIL_RETURN_UNEXPECTED(row_count_ == rawdata_iter->second.size(),
"Invalid data, number of samples: " + std::to_string(rawdata_iter->second.size()) +
" for schema is not matched.");
"[Internal ERROR] 'row_count_': " + std::to_string(rawdata_iter->second.size()) +
" for each schema is not the same.");
(void)schema_ids.insert(rawdata_iter->first);
}
const std::vector<std::shared_ptr<Schema>> &schemas = shard_header_->GetSchemas();
@@ -431,7 +452,7 @@ Status ShardWriter::ValidateRawData(std::map<uint64_t, std::vector<json>> &raw_d
[schema_ids](const std::shared_ptr<Schema> &schema) {
return schema_ids.find(schema->GetSchemaID()) == schema_ids.end();
}),
"Invalid data, schema id of data is not matched.");
"[Internal ERROR] schema id in 'schemas' can not be found in 'schema_ids'.");
if (!sign) {
*count_ptr = std::make_shared<std::pair<int, int>>(schema_count_, row_count_);
return Status::OK();
@@ -487,7 +508,7 @@ Status ShardWriter::LockWriter(bool parallel_writer, std::unique_ptr<int> *fd_pt
flock(fd, LOCK_EX);
} else {
close(fd);
RETURN_STATUS_UNEXPECTED("Failed to lock file, path: " + lock_file_);
RETURN_STATUS_UNEXPECTED("[Internal ERROR] Failed to lock file, path: " + lock_file_);
}
#endif
@@ -497,20 +518,20 @@ Status ShardWriter::LockWriter(bool parallel_writer, std::unique_ptr<int> *fd_pt
auto realpath = FileUtils::GetRealPath(file.data());
if (!realpath.has_value()) {
close(fd);
RETURN_STATUS_UNEXPECTED("Failed to get real path, path: " + file);
RETURN_STATUS_UNEXPECTED("[Internal ERROR] Failed to get real path, path: " + file);
}
std::shared_ptr<std::fstream> fs = std::make_shared<std::fstream>();
fs->open(realpath.value(), std::ios::in | std::ios::out | std::ios::binary);
if (fs->fail()) {
close(fd);
RETURN_STATUS_UNEXPECTED("Failed to open file, path: " + file);
RETURN_STATUS_UNEXPECTED("[Internal ERROR] Failed to open file, path: " + file);
}
file_streams_.push_back(fs);
}
auto status = shard_header_->FileToPages(pages_file_);
if (status.IsError()) {
close(fd);
RETURN_STATUS_UNEXPECTED("Error raised in FileToPages function.");
RETURN_STATUS_UNEXPECTED("[Internal ERROR] Error raised in FileToPages function.");
}
*fd_ptr = std::make_unique<int>(fd);
return Status::OK();
@@ -540,8 +561,9 @@ Status ShardWriter::WriteRawDataPreCheck(std::map<uint64_t, std::vector<json>> &
// check the free disk size
std::shared_ptr<uint64_t> size_ptr;
RETURN_IF_NOT_OK(GetDiskSize(file_paths_[0], kFreeSize, &size_ptr));
CHECK_FAIL_RETURN_UNEXPECTED(*size_ptr >= kMinFreeDiskSize,
"No free disk to be used, free disk size: " + std::to_string(*size_ptr));
CHECK_FAIL_RETURN_UNEXPECTED(
*size_ptr >= kMinFreeDiskSize,
"No free disk to be used while writing mindrecord files, available free disk size: " + std::to_string(*size_ptr));
// compress blob
if (shard_column_->CheckCompressBlob()) {
for (auto &blob : blob_data) {
@@ -615,7 +637,7 @@ Status ShardWriter::WriteRawData(std::map<uint64_t, std::vector<json>> &raw_data
// Serialize raw data
RETURN_IF_NOT_OK(WriteRawDataPreCheck(raw_data, blob_data, sign, &schema_count, &row_count));
CHECK_FAIL_RETURN_UNEXPECTED(row_count >= kInt0, "Invalid data, raw data size should be positive.");
CHECK_FAIL_RETURN_UNEXPECTED(row_count >= kInt0, "[Internal ERROR] the size of raw data should be positive.");
if (row_count == kInt0) {
return Status::OK();
}
@@ -676,7 +698,7 @@ Status ShardWriter::ParallelWriteData(const std::vector<std::vector<uint8_t>> &b
auto shards = BreakIntoShards();
// define the number of thread
int thread_num = static_cast<int>(shard_count_);
CHECK_FAIL_RETURN_UNEXPECTED(thread_num > 0, "Invalid data, thread_num should be positive.");
CHECK_FAIL_RETURN_UNEXPECTED(thread_num > 0, "[Internal ERROR] 'thread_num' should be positive.");
if (thread_num > kMaxThreadCount) {
thread_num = kMaxThreadCount;
}
@@ -741,13 +763,13 @@ Status ShardWriter::CutRowGroup(int start_row, int end_row, const std::vector<st
int page_start_row = start_row;
CHECK_FAIL_RETURN_UNEXPECTED(start_row <= end_row,
"Invalid data, start row: " + std::to_string(start_row) +
" should be less than or equal to end row: " + std::to_string(end_row));
"[Internal ERROR] 'start_row': " + std::to_string(start_row) +
" should be less than or equal to 'end_row': " + std::to_string(end_row));
CHECK_FAIL_RETURN_UNEXPECTED(
end_row <= static_cast<int>(blob_data_size_.size()) && end_row <= static_cast<int>(raw_data_size_.size()),
"Invalid data, end row: " + std::to_string(end_row) + " should be less than blob data size: " +
std::to_string(blob_data_size_.size()) + " and raw data size: " + std::to_string(raw_data_size_.size()) + ".");
"[Internal ERROR] 'end_row': " + std::to_string(end_row) + " should be less than 'blob_data_size': " +
std::to_string(blob_data_size_.size()) + " and 'raw_data_size': " + std::to_string(raw_data_size_.size()) + ".");
for (int i = start_row; i < end_row; ++i) {
// n_byte_blob(0) indicate appendBlobPage
if (n_byte_blob == 0 || n_byte_blob + blob_data_size_[i] > page_size_ ||
@@ -780,7 +802,7 @@ Status ShardWriter::AppendBlobPage(const int &shard_id, const std::vector<std::v
auto &io_seekp = file_streams_[shard_id]->seekp(page_size_ * page_id + header_size_ + bytes_page, std::ios::beg);
if (!io_seekp.good() || io_seekp.fail() || io_seekp.bad()) {
file_streams_[shard_id]->close();
RETURN_STATUS_UNEXPECTED("Failed to seekg file.");
RETURN_STATUS_UNEXPECTED("[Internal ERROR] Failed to seekg file.");
}
(void)FlushBlobChunk(file_streams_[shard_id], blob_data, blob_row);
@@ -808,7 +830,7 @@ Status ShardWriter::NewBlobPage(const int &shard_id, const std::vector<std::vect
auto &io_seekp = file_streams_[shard_id]->seekp(page_size_ * (page_id + 1) + header_size_, std::ios::beg);
if (!io_seekp.good() || io_seekp.fail() || io_seekp.bad()) {
file_streams_[shard_id]->close();
RETURN_STATUS_UNEXPECTED("Failed to seekg file.");
RETURN_STATUS_UNEXPECTED("[Internal ERROR] Failed to seekg file.");
}
(void)FlushBlobChunk(file_streams_[shard_id], blob_data, blob_row);
@@ -847,32 +869,32 @@ Status ShardWriter::ShiftRawPage(const int &shard_id, const std::vector<std::pai
// Read last row group from previous raw data page
CHECK_FAIL_RETURN_UNEXPECTED(
shard_id >= 0 && shard_id < file_streams_.size(),
"Invalid data, shard_id should be in range [0, " + std::to_string(file_streams_.size()) + ").");
"[Internal ERROR] 'shard_id' should be in range [0, " + std::to_string(file_streams_.size()) + ").");
auto &io_seekg = file_streams_[shard_id]->seekg(
page_size_ * last_raw_page_id + header_size_ + last_row_group_id_offset, std::ios::beg);
if (!io_seekg.good() || io_seekg.fail() || io_seekg.bad()) {
file_streams_[shard_id]->close();
RETURN_STATUS_UNEXPECTED("Failed to seekg file.");
RETURN_STATUS_UNEXPECTED("[Internal ERROR] Failed to seekg file.");
}
auto &io_read = file_streams_[shard_id]->read(reinterpret_cast<char *>(&buf[0]), buf.size());
if (!io_read.good() || io_read.fail() || io_read.bad()) {
file_streams_[shard_id]->close();
RETURN_STATUS_UNEXPECTED("Failed to read file.");
RETURN_STATUS_UNEXPECTED("[Internal ERROR] Failed to read file.");
}
// Merge into new row group at new raw data page
auto &io_seekp = file_streams_[shard_id]->seekp(page_size_ * (page_id + 1) + header_size_, std::ios::beg);
if (!io_seekp.good() || io_seekp.fail() || io_seekp.bad()) {
file_streams_[shard_id]->close();
RETURN_STATUS_UNEXPECTED("Failed to seekg file.");
RETURN_STATUS_UNEXPECTED("[Internal ERROR] Failed to seekg file.");
}
auto &io_handle = file_streams_[shard_id]->write(reinterpret_cast<char *>(&buf[0]), buf.size());
if (!io_handle.good() || io_handle.fail() || io_handle.bad()) {
file_streams_[shard_id]->close();
RETURN_STATUS_UNEXPECTED("Failed to write file.");
RETURN_STATUS_UNEXPECTED("[Internal ERROR] Failed to write file.");
}
last_raw_page->DeleteLastGroupId();
(void)shard_header_->SetPage(last_raw_page);
@@ -935,7 +957,7 @@ Status ShardWriter::AppendRawPage(const int &shard_id, const std::vector<std::pa
file_streams_[shard_id]->seekp(page_size_ * last_raw_page_id + header_size_ + n_bytes, std::ios::beg);
if (!io_seekp.good() || io_seekp.fail() || io_seekp.bad()) {
file_streams_[shard_id]->close();
RETURN_STATUS_UNEXPECTED("Failed to seekg file.");
RETURN_STATUS_UNEXPECTED("[Internal ERROR] Failed to seekg file.");
}
if (chunk_id > 0) {
@@ -958,7 +980,7 @@ Status ShardWriter::FlushBlobChunk(const std::shared_ptr<std::fstream> &out,
const std::pair<int, int> &blob_row) {
CHECK_FAIL_RETURN_UNEXPECTED(
blob_row.first <= blob_row.second && blob_row.second <= static_cast<int>(blob_data.size()) && blob_row.first >= 0,
"Invalid data, blob_row: " + std::to_string(blob_row.first) + ", " + std::to_string(blob_row.second) +
"[Internal ERROR] 'blob_row': " + std::to_string(blob_row.first) + ", " + std::to_string(blob_row.second) +
" is invalid.");
for (int j = blob_row.first; j < blob_row.second; ++j) {
// Write the size of blob
@@ -966,7 +988,7 @@ Status ShardWriter::FlushBlobChunk(const std::shared_ptr<std::fstream> &out,
auto &io_handle = out->write(reinterpret_cast<char *>(&line_len), kInt64Len);
if (!io_handle.good() || io_handle.fail() || io_handle.bad()) {
out->close();
RETURN_STATUS_UNEXPECTED("Failed to write file.");
RETURN_STATUS_UNEXPECTED("[Internal ERROR] Failed to write file.");
}
// Write the data of blob
@@ -974,7 +996,7 @@ Status ShardWriter::FlushBlobChunk(const std::shared_ptr<std::fstream> &out,
auto &io_handle_data = out->write(reinterpret_cast<char *>(&line[0]), line_len);
if (!io_handle_data.good() || io_handle_data.fail() || io_handle_data.bad()) {
out->close();
RETURN_STATUS_UNEXPECTED("Failed to write file.");
RETURN_STATUS_UNEXPECTED("[Internal ERROR] Failed to write file.");
}
}
return Status::OK();
@@ -990,7 +1012,7 @@ Status ShardWriter::FlushRawChunk(const std::shared_ptr<std::fstream> &out,
auto &io_handle = out->write(reinterpret_cast<char *>(&line_len), kInt64Len);
if (!io_handle.good() || io_handle.fail() || io_handle.bad()) {
out->close();
RETURN_STATUS_UNEXPECTED("Failed to write file.");
RETURN_STATUS_UNEXPECTED("[Internal ERROR] Failed to write file.");
}
}
// Write the data of multi schemas
@@ -999,7 +1021,7 @@ Status ShardWriter::FlushRawChunk(const std::shared_ptr<std::fstream> &out,
auto &io_handle = out->write(reinterpret_cast<char *>(&line[0]), line.size());
if (!io_handle.good() || io_handle.fail() || io_handle.bad()) {
out->close();
RETURN_STATUS_UNEXPECTED("Failed to write file.");
RETURN_STATUS_UNEXPECTED("[Internal ERROR] Failed to write file.");
}
}
}
@@ -1041,32 +1063,32 @@ Status ShardWriter::WriteShardHeader() {
// Write header data to multi files
CHECK_FAIL_RETURN_UNEXPECTED(
shard_count_ <= static_cast<int>(file_streams_.size()) && shard_count_ <= static_cast<int>(shard_header.size()),
"Invalid data, shard count should be less than or equal to file size: " + std::to_string(file_streams_.size()) +
", and header size: " + std::to_string(shard_header.size()) + ".");
"[Internal ERROR] 'shard_count_' should be less than or equal to 'file_streams_' size: " +
std::to_string(file_streams_.size()) + ", and 'shard_header' size: " + std::to_string(shard_header.size()) + ".");
if (shard_count_ <= kMaxShardCount) {
for (int shard_id = 0; shard_id < shard_count_; ++shard_id) {
auto &io_seekp = file_streams_[shard_id]->seekp(0, std::ios::beg);
if (!io_seekp.good() || io_seekp.fail() || io_seekp.bad()) {
file_streams_[shard_id]->close();
RETURN_STATUS_UNEXPECTED("Failed to seekp file.");
RETURN_STATUS_UNEXPECTED("[Internal ERROR] Failed to seekp file.");
}
std::vector<uint8_t> bin_header(shard_header[shard_id].begin(), shard_header[shard_id].end());
uint64_t line_len = bin_header.size();
if (line_len + kInt64Len > header_size_) {
file_streams_[shard_id]->close();
RETURN_STATUS_UNEXPECTED("shard header is too big.");
RETURN_STATUS_UNEXPECTED("[Internal ERROR] shard header is too big.");
}
auto &io_handle = file_streams_[shard_id]->write(reinterpret_cast<char *>(&line_len), kInt64Len);
if (!io_handle.good() || io_handle.fail() || io_handle.bad()) {
file_streams_[shard_id]->close();
RETURN_STATUS_UNEXPECTED("Failed to write file.");
RETURN_STATUS_UNEXPECTED("[Internal ERROR] Failed to write file.");
}
auto &io_handle_header = file_streams_[shard_id]->write(reinterpret_cast<char *>(&bin_header[0]), line_len);
if (!io_handle_header.good() || io_handle_header.fail() || io_handle_header.bad()) {
file_streams_[shard_id]->close();
RETURN_STATUS_UNEXPECTED("Failed to write file.");
RETURN_STATUS_UNEXPECTED("[Internal ERROR] Failed to write file.");
}
file_streams_[shard_id]->close();
}
@@ -1100,7 +1122,7 @@ Status ShardWriter::SerializeRawData(std::map<uint64_t, std::vector<json>> &raw_
// Set obstacles to prevent the main thread from running
thread_set[x].join();
}
CHECK_FAIL_RETURN_SYNTAX_ERROR(flag_ != true, "Error raised in FillArray function.");
CHECK_FAIL_RETURN_SYNTAX_ERROR(flag_ != true, "[Internal ERROR] Error raised in FillArray function.");
return Status::OK();
}
@@ -1111,9 +1133,10 @@ Status ShardWriter::SetRawDataSize(const std::vector<std::vector<uint8_t>> &bin_
bin_raw_data.begin() + (i * schema_count_), bin_raw_data.begin() + (i * schema_count_) + schema_count_, 0,
[](uint64_t accumulator, const std::vector<uint8_t> &row) { return accumulator + kInt64Len + row.size(); });
}
CHECK_FAIL_RETURN_SYNTAX_ERROR(
*std::max_element(raw_data_size_.begin(), raw_data_size_.end()) <= page_size_,
"Invalid data, Page size: " + std::to_string(page_size_) + " is too small to save a raw row!");
CHECK_FAIL_RETURN_SYNTAX_ERROR(*std::max_element(raw_data_size_.begin(), raw_data_size_.end()) <= page_size_,
"Invalid data, Page size: " + std::to_string(page_size_) +
" is too small to save a raw row. Please try to use the mindrecord api "
"'set_page_size(1<<25)' to enable 32MB page size.");
return Status::OK();
}
@@ -1121,16 +1144,17 @@ Status ShardWriter::SetBlobDataSize(const std::vector<std::vector<uint8_t>> &blo
blob_data_size_ = std::vector<uint64_t>(row_count_);
(void)std::transform(blob_data.begin(), blob_data.end(), blob_data_size_.begin(),
[](const std::vector<uint8_t> &row) { return kInt64Len + row.size(); });
CHECK_FAIL_RETURN_SYNTAX_ERROR(
*std::max_element(blob_data_size_.begin(), blob_data_size_.end()) <= page_size_,
"Invalid data, Page size: " + std::to_string(page_size_) + " is too small to save a blob row!");
CHECK_FAIL_RETURN_SYNTAX_ERROR(*std::max_element(blob_data_size_.begin(), blob_data_size_.end()) <= page_size_,
"Invalid data, Page size: " + std::to_string(page_size_) +
" is too small to save a blob row. Please try to use the mindrecord api "
"'set_page_size(1<<25)' to enable 32MB page size.");
return Status::OK();
}
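Both checks point at the same remedy: enlarging the page when a single raw or blob row cannot fit. A sketch, assuming the FileWriter.set_page_size API that the message names:

from mindspore.mindrecord import FileWriter

writer = FileWriter("big_rows.mindrecord", shard_num=1)
# 1 << 25 bytes = 32MB per page, so any single row up to that size fits.
writer.set_page_size(1 << 25)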
Status ShardWriter::SetLastRawPage(const int &shard_id, std::shared_ptr<Page> &last_raw_page) {
// Get last raw page
auto last_raw_page_id = shard_header_->GetLastPageIdByType(shard_id, kPageTypeRaw);
CHECK_FAIL_RETURN_SYNTAX_ERROR(last_raw_page_id >= 0, "Invalid data, last_raw_page_id: " +
CHECK_FAIL_RETURN_SYNTAX_ERROR(last_raw_page_id >= 0, "[Internal ERROR] 'last_raw_page_id': " +
std::to_string(last_raw_page_id) + " should be non-negative.");
RETURN_IF_NOT_OK(shard_header_->GetPage(shard_id, last_raw_page_id, &last_raw_page));
return Status::OK();
@@ -1139,7 +1163,7 @@ Status ShardWriter::SetLastRawPage(const int &shard_id, std::shared_ptr<Page> &l
Status ShardWriter::SetLastBlobPage(const int &shard_id, std::shared_ptr<Page> &last_blob_page) {
// Get last blob page
auto last_blob_page_id = shard_header_->GetLastPageIdByType(shard_id, kPageTypeBlob);
CHECK_FAIL_RETURN_SYNTAX_ERROR(last_blob_page_id >= 0, "Invalid data, last_blob_page_id: " +
CHECK_FAIL_RETURN_SYNTAX_ERROR(last_blob_page_id >= 0, "[Internal ERROR] 'last_blob_page_id': " +
std::to_string(last_blob_page_id) + " should be non-negative.");
RETURN_IF_NOT_OK(shard_header_->GetPage(shard_id, last_blob_page_id, &last_blob_page));
return Status::OK();
@@ -61,20 +61,27 @@ Status ShardHeader::InitializeHeader(const std::vector<json> &headers, bool load
Status ShardHeader::CheckFileStatus(const std::string &path) {
auto realpath = FileUtils::GetRealPath(path.data());
CHECK_FAIL_RETURN_UNEXPECTED(realpath.has_value(), "Failed to get real path, path: " + path);
CHECK_FAIL_RETURN_UNEXPECTED(
realpath.has_value(),
"Invalid file, failed to get the realpath of mindrecord files. Please check file path: " + path);
std::ifstream fin(realpath.value(), std::ios::in | std::ios::binary);
CHECK_FAIL_RETURN_UNEXPECTED(fin, "Failed to open file, file path: " + path);
CHECK_FAIL_RETURN_UNEXPECTED(fin,
"Invalid file, failed to open files for loading mindrecord files. Please check file "
"path, permission and open file limit: " +
path);
// fetch file size
auto &io_seekg = fin.seekg(0, std::ios::end);
if (!io_seekg.good() || io_seekg.fail() || io_seekg.bad()) {
fin.close();
RETURN_STATUS_UNEXPECTED("Failed to seekg file, file path: " + path);
RETURN_STATUS_UNEXPECTED("[Internal ERROR] Failed to seekg file, file path: " + path);
}
size_t file_size = fin.tellg();
if (file_size < kMinFileSize) {
fin.close();
RETURN_STATUS_UNEXPECTED("Invalid file content, file " + path + " size is smaller than the lower limit.");
RETURN_STATUS_UNEXPECTED("Invalid file, the size of mindrecord file: " + std::to_string(file_size) +
" is smaller than the lower limit: " + std::to_string(kMinFileSize) +
".\nPlease use 'FileWriter' to generate valid mindrecord files.");
}
fin.close();
return Status::OK();
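CheckFileStatus rejects anything below kMinFileSize, which is why the message steers users to FileWriter: a hand-written stub (the tests below write 'just for test' into the file) can never carry a valid header. A sketch of the failing pattern:

from mindspore.mindrecord import FileReader

# A few arbitrary bytes are smaller than the header's lower size limit, so
# opening the file raises "Invalid file, the size of mindrecord file: ...".
with open("fake.mindrecord", "w") as f:
    f.write("just for test")
FileReader("fake.mindrecord")  # raises RuntimeError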
@@ -86,18 +93,23 @@ Status ShardHeader::ValidateHeader(const std::string &path, std::shared_ptr<json
// read header size
json json_header;
std::ifstream fin(common::SafeCStr(path), std::ios::in | std::ios::binary);
CHECK_FAIL_RETURN_UNEXPECTED(fin.is_open(), "Failed to open file, file path: " + path);
CHECK_FAIL_RETURN_UNEXPECTED(fin.is_open(),
"Invalid file, failed to open files for loading mindrecord files. Please check file "
"path, permission and open file limit: " +
path);
uint64_t header_size = 0;
auto &io_read = fin.read(reinterpret_cast<char *>(&header_size), kInt64Len);
if (!io_read.good() || io_read.fail() || io_read.bad()) {
fin.close();
RETURN_STATUS_UNEXPECTED("Failed to read file, file path: " + path);
RETURN_STATUS_UNEXPECTED("[Internal ERROR] Failed to read file, file path: " + path);
}
if (header_size > kMaxHeaderSize) {
fin.close();
RETURN_STATUS_UNEXPECTED("Invalid file content, incorrect file or file header is exceeds the upper limit.");
RETURN_STATUS_UNEXPECTED(
"Invalid file, the size of mindrecord file header is larger than the upper limit.\nPlease use 'FileWriter' to "
"generate valid mindrecord files.");
}
// read header content
@@ -105,7 +117,7 @@ Status ShardHeader::ValidateHeader(const std::string &path, std::shared_ptr<json
auto &io_read_content = fin.read(reinterpret_cast<char *>(&header_content[0]), header_size);
if (!io_read_content.good() || io_read_content.fail() || io_read_content.bad()) {
fin.close();
RETURN_STATUS_UNEXPECTED("Failed to read file, file path: " + path);
RETURN_STATUS_UNEXPECTED("[Internal ERROR] Failed to read file, file path: " + path);
}
fin.close();
@@ -114,7 +126,8 @@ Status ShardHeader::ValidateHeader(const std::string &path, std::shared_ptr<json
try {
json_header = json::parse(raw_header_content);
} catch (json::parse_error &e) {
RETURN_STATUS_UNEXPECTED("Json parse failed: " + std::string(e.what()));
RETURN_STATUS_UNEXPECTED("[Internal ERROR] Failed to parse the metadata in JSON format in the mindrecord files: " +
std::string(e.what()));
}
*header_ptr = std::make_shared<json>(json_header);
return Status::OK();
@@ -165,7 +178,7 @@ Status ShardHeader::BuildDataset(const std::vector<std::string> &file_paths, boo
}
if (thread_status) {
thread_status = false;
RETURN_STATUS_UNEXPECTED("Error occurred in GetHeadersOneTask thread.");
RETURN_STATUS_UNEXPECTED("[Internal ERROR] Error raised in GetHeadersOneTask function.");
}
RETURN_IF_NOT_OK(InitializeHeader(headers, load_dataset));
return Status::OK();
@@ -186,8 +199,8 @@ void ShardHeader::GetHeadersOneTask(int start, int end, std::vector<json> &heade
(*header)["shard_addresses"] = realAddresses;
if (std::find(kSupportedVersion.begin(), kSupportedVersion.end(), (*header)["version"]) ==
kSupportedVersion.end()) {
MS_LOG(ERROR) << "Invalid version, file version " << (*header)["version"].dump() << " can not match lib version "
<< kVersion << ".";
MS_LOG(ERROR) << "Invalid file, the version of mindrecord files: " << (*header)["version"].dump()
<< " is not supported.\nPlease use 'FileWriter' to generate valid mindrecord files.";
thread_status = true;
return;
}
@@ -205,8 +218,8 @@ Status ShardHeader::InitByFiles(const std::vector<std::string> &file_paths) {
shard_addresses_ = std::move(file_names);
shard_count_ = file_paths.size();
CHECK_FAIL_RETURN_UNEXPECTED(shard_count_ != 0 && (shard_count_ <= kMaxShardCount),
"Invalid input, The number of MindRecord files " + std::to_string(shard_count_) +
"is not int range (0, " + std::to_string(kMaxShardCount) + "].");
"[Internal ERROR] 'shard_count_': " + std::to_string(shard_count_) +
" is not in range (0, " + std::to_string(kMaxShardCount) + "].");
pages_.resize(shard_count_);
return Status::OK();
}
@@ -225,9 +238,10 @@ Status ShardHeader::ParseIndexFields(const json &index_fields) {
Status ShardHeader::ParsePage(const json &pages, int shard_index, bool load_dataset) {
// set shard_index when load_dataset is false
CHECK_FAIL_RETURN_UNEXPECTED(shard_count_ <= kMaxFileCount, "Invalid input, The number of MindRecord files " +
std::to_string(shard_count_) + "is not int range (0, " +
std::to_string(kMaxFileCount) + "].");
CHECK_FAIL_RETURN_UNEXPECTED(shard_count_ <= kMaxFileCount,
"Invalid file, the number of mindrecord files: " + std::to_string(shard_count_) +
" is not in range (0, " + std::to_string(kMaxFileCount) +
"].\nPlease use 'FileWriter' to generate fewer mindrecord files.");
if (pages_.empty()) {
pages_.resize(shard_count_);
}
@@ -261,7 +275,7 @@ Status ShardHeader::ParseStatistics(const json &statistics) {
for (auto &statistic : statistics) {
CHECK_FAIL_RETURN_UNEXPECTED(
statistic.find("desc") != statistic.end() && statistic.find("statistics") != statistic.end(),
"Failed to deserialize statistics, statistic info: " + statistics.dump());
"[Internal ERROR] Failed to deserialize statistics: " + statistics.dump());
std::string statistic_description = statistic["desc"].get<std::string>();
json statistic_body = statistic["statistics"];
std::shared_ptr<Statistics> parsed_statistic = Statistics::Build(statistic_description, statistic_body);
@@ -276,7 +290,7 @@ Status ShardHeader::ParseSchema(const json &schemas) {
// change how we get schemaBody once design is finalized
CHECK_FAIL_RETURN_UNEXPECTED(schema.find("desc") != schema.end() && schema.find("blob_fields") != schema.end() &&
schema.find("schema") != schema.end(),
"Failed to deserialize schema, schema info: " + schema.dump());
"[Internal ERROR] Failed to deserialize schema: " + schema.dump());
std::string schema_description = schema["desc"].get<std::string>();
std::vector<std::string> blob_fields = schema["blob_fields"].get<std::vector<std::string>>();
json schema_body = schema["schema"];
@@ -373,7 +387,7 @@ Status ShardHeader::GetPage(const int &shard_id, const int &page_id, std::shared
return Status::OK();
}
page_ptr = nullptr;
RETURN_STATUS_UNEXPECTED("Failed to get Page, 'page_id': " + std::to_string(page_id));
RETURN_STATUS_UNEXPECTED("[Internal ERROR] Failed to get Page, 'page_id': " + std::to_string(page_id));
}
Status ShardHeader::SetPage(const std::shared_ptr<Page> &new_page) {
@@ -383,7 +397,7 @@ Status ShardHeader::SetPage(const std::shared_ptr<Page> &new_page) {
pages_[shard_id][page_id] = new_page;
return Status::OK();
}
RETURN_STATUS_UNEXPECTED("Failed to set Page, 'page_id': " + std::to_string(page_id));
RETURN_STATUS_UNEXPECTED("[Internal ERROR] Failed to set Page, 'page_id': " + std::to_string(page_id));
}
Status ShardHeader::AddPage(const std::shared_ptr<Page> &new_page) {
@@ -393,7 +407,7 @@ Status ShardHeader::AddPage(const std::shared_ptr<Page> &new_page) {
pages_[shard_id].push_back(new_page);
return Status::OK();
}
RETURN_STATUS_UNEXPECTED("Failed to add Page, 'page_id': " + std::to_string(page_id));
RETURN_STATUS_UNEXPECTED("[Internal ERROR] Failed to add Page, 'page_id': " + std::to_string(page_id));
}
int64_t ShardHeader::GetLastPageId(const int &shard_id) {
@@ -419,7 +433,10 @@ int ShardHeader::GetLastPageIdByType(const int &shard_id, const std::string &pag
Status ShardHeader::GetPageByGroupId(const int &group_id, const int &shard_id, std::shared_ptr<Page> *page_ptr) {
RETURN_UNEXPECTED_IF_NULL(page_ptr);
CHECK_FAIL_RETURN_UNEXPECTED(shard_id < static_cast<int>(pages_.size()), "Shard id is more than sum of shards.");
CHECK_FAIL_RETURN_UNEXPECTED(shard_id < static_cast<int>(pages_.size()),
"[Internal ERROR] 'shard_id': " + std::to_string(shard_id) +
" should be smaller than the size of 'pages_': " + std::to_string(pages_.size()) +
".");
for (uint64_t i = pages_[shard_id].size(); i >= 1; i--) {
auto page = pages_[shard_id][i - 1];
if (page->GetPageType() == kPageTypeBlob && page->GetPageTypeID() == group_id) {
@@ -428,17 +445,17 @@ Status ShardHeader::GetPageByGroupId(const int &group_id, const int &shard_id, s
}
}
page_ptr = nullptr;
RETURN_STATUS_UNEXPECTED("Failed to get Page, 'group_id': " + std::to_string(group_id));
RETURN_STATUS_UNEXPECTED("[Internal ERROR] Failed to get Page, 'group_id': " + std::to_string(group_id));
}
int ShardHeader::AddSchema(std::shared_ptr<Schema> schema) {
if (schema == nullptr) {
MS_LOG(ERROR) << "The pointer of schema is null.";
MS_LOG(ERROR) << "[Internal ERROR] The pointer of schema is NULL.";
return -1;
}
if (!schema_.empty()) {
MS_LOG(ERROR) << "The schema can not be added twice.";
MS_LOG(ERROR) << "The schema is added repeatedly. Please remove the redundant 'add_schema' function.";
return -1;
}
@@ -474,11 +491,16 @@ std::shared_ptr<Index> ShardHeader::InitIndexPtr() {
Status ShardHeader::CheckIndexField(const std::string &field, const json &schema) {
// check field name is or is not valid
CHECK_FAIL_RETURN_UNEXPECTED(schema.find(field) != schema.end(),
"Invalid input, field [" + field + "] can not found in schema.");
"Invalid input, 'index_fields': " + field + " can not be found in schema: " +
schema.dump() + ".\nPlease use 'add_index' function to add proper 'index_fields'.");
CHECK_FAIL_RETURN_UNEXPECTED(schema[field]["type"] != "Bytes",
"Invalid input, byte type field [" + field + "] can not set as an index field.");
"Invalid input, type of 'index_fields': " + field +
" is bytes and can not be set as an 'index_fields'.\nPlease use 'add_index' function to "
"add the other 'index_fields'.");
CHECK_FAIL_RETURN_UNEXPECTED(schema.find(field) == schema.end() || schema[field].find("shape") == schema[field].end(),
"Invalid input, array type field [" + field + "] can not set as an index field.");
"Invalid input, type of 'index_fields': " + field +
" is array and can not be set as an 'index_fields'.\nPlease use 'add_index' function to "
"add the other 'index_fields'.");
return Status::OK();
}
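The three checks mirror the constraints on add_index at the Python layer: an index field must exist in the schema, and bytes or array (shape) fields cannot be indexed. A sketch under those assumptions:

from mindspore.mindrecord import FileWriter

writer = FileWriter("indexed.mindrecord", shard_num=1)
writer.add_schema({"file_name": {"type": "string"},
                   "label": {"type": "int32"},
                   "mask": {"type": "int64", "shape": [-1]},
                   "data": {"type": "bytes"}}, "demo_schema")
writer.add_index(["file_name", "label"])  # OK: scalar string/int fields
# writer.add_index(["data"])  -> rejected: bytes field can not be an index field
# writer.add_index(["mask"])  -> rejected: array (shape) field can not be indexed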
@@ -486,7 +508,8 @@ Status ShardHeader::AddIndexFields(const std::vector<std::string> &fields) {
if (fields.empty()) {
return Status::OK();
}
CHECK_FAIL_RETURN_UNEXPECTED(!GetSchemas().empty(), "Invalid data, schema is empty.");
CHECK_FAIL_RETURN_UNEXPECTED(!GetSchemas().empty(),
"Invalid data, schema is empty. Please use 'add_schema' function to add schema first.");
// create index Object
std::shared_ptr<Index> index = InitIndexPtr();
for (const auto &schemaPtr : schema_) {
@@ -499,8 +522,9 @@ Status ShardHeader::AddIndexFields(const std::vector<std::string> &fields) {
field_set.insert(item.second);
}
for (const auto &field : fields) {
CHECK_FAIL_RETURN_UNEXPECTED(field_set.find(field) == field_set.end(),
"Invalid data, the same index field [" + field + "] can not added twice.");
CHECK_FAIL_RETURN_UNEXPECTED(
field_set.find(field) == field_set.end(),
"The 'index_fields': " + field + " is added repeatedly. Please remove the redundant 'add_index' function.");
// check field name is or is not valid
RETURN_IF_NOT_OK(CheckIndexField(field, schema));
field_set.insert(field);
@@ -517,7 +541,7 @@ Status ShardHeader::GetAllSchemaID(std::set<uint64_t> &bucket_count) {
for (const auto &schema : schema_) {
auto schema_id = schema->GetSchemaID();
CHECK_FAIL_RETURN_UNEXPECTED(bucket_count.find(schema_id) == bucket_count.end(),
"Invalid data, duplicate schema exist, schema id: " + std::to_string(schema_id));
"[Internal ERROR] duplicate schema exist, schema id: " + std::to_string(schema_id));
bucket_count.insert(schema_id);
}
return Status::OK();
@@ -539,19 +563,21 @@ Status ShardHeader::AddIndexFields(std::vector<std::pair<uint64_t, std::string>>
}
for (const auto &field : fields) {
CHECK_FAIL_RETURN_UNEXPECTED(field_set.find(field) == field_set.end(),
"Invalid data, the same index field [" + field.second + "] can not added twice.");
"The 'index_fields': " + field.second +
" is added repeatedly. Please remove the redundant 'add_index' function.");
uint64_t schema_id = field.first;
std::string field_name = field.second;
// check schemaId is or is not valid
CHECK_FAIL_RETURN_UNEXPECTED(bucket_count.find(schema_id) != bucket_count.end(),
"Invalid data, schema id [" + std::to_string(schema_id) + "] is invalid.");
"[Internal ERROR] 'schema_id': " + std::to_string(schema_id) + " can not be found.");
// check field name is or is not valid
std::shared_ptr<Schema> schema_ptr;
RETURN_IF_NOT_OK(GetSchemaByID(schema_id, &schema_ptr));
json schema = schema_ptr->GetSchema().at("schema");
CHECK_FAIL_RETURN_UNEXPECTED(schema.find(field_name) != schema.end(),
"Invalid data, field [" + field_name + "] is not found in schema.");
"Invalid input, 'index_fields': " + field_name + " can not be found in schema: " +
schema.dump() + ".\nPlease use 'add_index' function to add proper 'index_fields'.");
RETURN_IF_NOT_OK(CheckIndexField(field_name, schema));
field_set.insert(field);
// add field into index
@@ -580,7 +606,7 @@ Status ShardHeader::GetSchemaByID(int64_t schema_id, std::shared_ptr<Schema> *sc
RETURN_UNEXPECTED_IF_NULL(schema_ptr);
int64_t schema_size = schema_.size();
CHECK_FAIL_RETURN_UNEXPECTED(schema_id >= 0 && schema_id < schema_size,
"Invalid data, schema id [" + std::to_string(schema_id) + "] is not in range [0, " +
"[Internal ERROR] 'schema_id': " + std::to_string(schema_id) + " is not in range [0, " +
std::to_string(schema_size) + ").");
*schema_ptr = schema_.at(schema_id);
return Status::OK();
@@ -590,18 +616,20 @@ Status ShardHeader::GetStatisticByID(int64_t statistic_id, std::shared_ptr<Stati
RETURN_UNEXPECTED_IF_NULL(statistics_ptr);
int64_t statistics_size = statistics_.size();
CHECK_FAIL_RETURN_UNEXPECTED(statistic_id >= 0 && statistic_id < statistics_size,
"Invalid data, statistic id [" + std::to_string(statistic_id) +
"] is not in range [0, " + std::to_string(statistics_size) + ").");
"[Internal ERROR] 'statistic_id': " + std::to_string(statistic_id) +
" is not in range [0, " + std::to_string(statistics_size) + ").");
*statistics_ptr = statistics_.at(statistic_id);
return Status::OK();
}
Status ShardHeader::PagesToFile(const std::string dump_file_name) {
auto realpath = FileUtils::GetRealPath(dump_file_name.data());
CHECK_FAIL_RETURN_UNEXPECTED(realpath.has_value(), "Failed to get real path, path: " + dump_file_name);
CHECK_FAIL_RETURN_UNEXPECTED(realpath.has_value(),
"[Internal ERROR] Failed to get the realpath of Pages file, path: " + dump_file_name);
// write header content to file, dump whatever is in the file before
std::ofstream page_out_handle(realpath.value(), std::ios_base::trunc | std::ios_base::out);
CHECK_FAIL_RETURN_UNEXPECTED(page_out_handle.good(), "Failed to open page file, path: " + dump_file_name);
CHECK_FAIL_RETURN_UNEXPECTED(page_out_handle.good(),
"[Internal ERROR] Failed to open Pages file, path: " + dump_file_name);
auto pages = SerializePage();
for (const auto &shard_pages : pages) {
page_out_handle << shard_pages << "\n";
@@ -615,11 +643,12 @@ Status ShardHeader::FileToPages(const std::string dump_file_name) {
v.clear();
}
auto realpath = FileUtils::GetRealPath(dump_file_name.data());
CHECK_FAIL_RETURN_UNEXPECTED(realpath.has_value(), "Failed to get real path, path: " + dump_file_name);
CHECK_FAIL_RETURN_UNEXPECTED(realpath.has_value(),
"[Internal ERROR] Failed to get the realpath of Pages file, path: " + dump_file_name);
// attempt to open the file contains the page in json
std::ifstream page_in_handle(realpath.value());
CHECK_FAIL_RETURN_UNEXPECTED(page_in_handle.good(),
"Invalid file, page file does not exist, path: " + dump_file_name);
"[Internal ERROR] Pages file does not exist, path: " + dump_file_name);
std::string line;
while (std::getline(page_in_handle, line)) {
RETURN_IF_NOT_OK(ParsePage(json::parse(line), -1, true));
@@ -633,7 +662,8 @@ Status ShardHeader::Initialize(const std::shared_ptr<ShardHeader> *header_ptr, c
uint64_t &schema_id) {
RETURN_UNEXPECTED_IF_NULL(header_ptr);
auto schema_ptr = Schema::Build("mindrecord", schema);
CHECK_FAIL_RETURN_UNEXPECTED(schema_ptr != nullptr, "Failed to build schema: " + schema.dump() + ".");
CHECK_FAIL_RETURN_UNEXPECTED(schema_ptr != nullptr,
"[Internal ERROR] Failed to build schema: " + schema.dump() + ".");
schema_id = (*header_ptr)->AddSchema(schema_ptr);
// create index
std::vector<std::pair<uint64_t, std::string>> id_index_fields;
@@ -100,8 +100,8 @@ def test_invalid_mindrecord():
f.write('just for test')
columns_list = ["data", "file_name", "label"]
num_readers = 4
with pytest.raises(RuntimeError, match="Unexpected error. Invalid file "
"content, incorrect file or file header is exceeds the upper limit."):
with pytest.raises(RuntimeError, match="Unexpected error. Invalid file, the size of mindrecord file header "
"is larger than the upper limit."):
data_set = ds.MindDataset(file_name, columns_list, num_readers)
for _ in data_set.create_dict_iterator(num_epochs=1, output_numpy=True):
pass
@@ -123,7 +123,7 @@ def test_cifar100_to_mindrecord_directory(fixture_file):
when destination path is directory.
"""
with pytest.raises(RuntimeError,
match="Invalid file, Mindrecord files already existed in path:"):
match="Invalid file, mindrecord files already exist. Please check file path:"):
cifar100_transformer = Cifar100ToMR(CIFAR100_DIR,
CIFAR100_DIR)
cifar100_transformer.transform()
@@ -134,7 +134,7 @@ def test_cifar100_to_mindrecord_filename_equals_cifar100(fixture_file):
when destination path equals source path.
"""
with pytest.raises(RuntimeError,
match="Invalid file, Mindrecord files already existed in path:"):
match="Invalid file, mindrecord files already exist. Please check file path:"):
cifar100_transformer = Cifar100ToMR(CIFAR100_DIR,
CIFAR100_DIR + "/train")
cifar100_transformer.transform()
@@ -135,7 +135,7 @@ def test_cifar10_to_mindrecord_directory(fixture_file):
when destination path is directory.
"""
with pytest.raises(RuntimeError,
match="Unexpected error. Invalid file, Mindrecord files already existed in path:"):
match="Unexpected error. Invalid file, mindrecord files already exist. Please check file path:"):
cifar10_transformer = Cifar10ToMR(CIFAR10_DIR, CIFAR10_DIR)
cifar10_transformer.transform()
@@ -146,7 +146,7 @@ def test_cifar10_to_mindrecord_filename_equals_cifar10():
when destination path equals source path.
"""
with pytest.raises(RuntimeError,
match="Unexpected error. Invalid file, Mindrecord files already existed in path:"):
match="Unexpected error. Invalid file, mindrecord files already exist. Please check file path:"):
cifar10_transformer = Cifar10ToMR(CIFAR10_DIR,
CIFAR10_DIR + "/data_batch_0")
cifar10_transformer.transform()
@@ -1224,7 +1224,7 @@ def test_cv_file_overwrite_exception_01():
"label": {"type": "int64"}, "data": {"type": "bytes"}}
writer.add_schema(cv_schema_json, "img_schema")
writer.write_raw_data(data)
assert 'Unexpected error. Invalid file, Mindrecord files already existed in path:' in str(err.value)
assert 'Unexpected error. Invalid file, mindrecord files already exist. Please check file path:' in str(err.value)
remove_multi_files(mindrecord_file_name, FILES_NUM)
def test_cv_file_overwrite_exception_02():
@@ -1243,5 +1243,5 @@ def test_cv_file_overwrite_exception_02():
"label": {"type": "int64"}, "data": {"type": "bytes"}}
writer.add_schema(cv_schema_json, "img_schema")
writer.write_raw_data(data)
assert 'Unexpected error. Invalid file, Mindrecord files already existed in path:' in str(err.value)
assert 'Unexpected error. Invalid file, mindrecord files already exist. Please check file path:' in str(err.value)
remove_multi_files(mindrecord_file_name, FILES_NUM)
@@ -234,7 +234,7 @@ def test_invalid_mindrecord():
f.write(dummy)
with pytest.raises(RuntimeError) as err:
FileReader(file_name)
assert "Unexpected error. Invalid file content, incorrect file or file header" in str(err.value)
assert "Invalid file, the size of mindrecord file header is larger than the upper limit." in str(err.value)
remove_file(file_name)
def test_invalid_db():
@@ -264,7 +264,7 @@ def test_overwrite_invalid_mindrecord():
f.write('just for test')
with pytest.raises(RuntimeError) as err:
create_cv_mindrecord(1, file_name)
assert 'Unexpected error. Invalid file, Mindrecord files already existed in path:' in str(err.value)
assert 'Unexpected error. Invalid file, mindrecord files already exist. Please check file path:' in str(err.value)
remove_file(file_name)
def test_overwrite_invalid_db():
@@ -278,7 +278,7 @@ def test_overwrite_invalid_db():
f.write('just for test')
with pytest.raises(RuntimeError) as err:
create_cv_mindrecord(1, file_name)
assert 'Unexpected error. Invalid file, Mindrecord files already existed in path:' in str(err.value)
assert 'Unexpected error. Invalid file, mindrecord files already exist. Please check file path:' in str(err.value)
remove_file(file_name)
def test_read_after_close():
@@ -560,7 +560,8 @@ def test_write_with_invalid_data():
mindrecord_file_name = os.environ.get('PYTEST_CURRENT_TEST').split(':')[-1].split(' ')[0]
# field: file_name => filename
| with pytest.raises(RuntimeError, match="Unexpected error. Invalid data, schema count should be positive."): | |||
| with pytest.raises(RuntimeError, match="Unexpected error. Invalid data, " \ | |||
| "the number of schema should be positive but got:"): | |||
| remove_one_file(mindrecord_file_name) | |||
| remove_one_file(mindrecord_file_name + ".db") | |||
@@ -594,43 +595,9 @@ def test_write_with_invalid_data():
writer.write_raw_data(data)
writer.commit()
# field: mask => masks
with pytest.raises(RuntimeError, match="Unexpected error. Invalid data, schema count should be positive."):
remove_one_file(mindrecord_file_name)
remove_one_file(mindrecord_file_name + ".db")
data = [{"file_name": "001.jpg", "label": 43, "score": 0.8, "masks": np.array([3, 6, 9], dtype=np.int64),
"segments": np.array([[5.0, 1.6], [65.2, 8.3]], dtype=np.float32),
"data": bytes("image bytes abc", encoding='UTF-8')},
{"file_name": "002.jpg", "label": 91, "score": 5.4, "masks": np.array([1, 4, 7], dtype=np.int64),
"segments": np.array([[5.1, 9.1], [2.0, 65.4]], dtype=np.float32),
"data": bytes("image bytes def", encoding='UTF-8')},
{"file_name": "003.jpg", "label": 61, "score": 6.4, "masks": np.array([7, 6, 3], dtype=np.int64),
"segments": np.array([[0.0, 5.6], [3.0, 16.3]], dtype=np.float32),
"data": bytes("image bytes ghi", encoding='UTF-8')},
{"file_name": "004.jpg", "label": 29, "score": 8.1, "masks": np.array([2, 8, 0], dtype=np.int64),
"segments": np.array([[5.9, 7.2], [4.0, 89.0]], dtype=np.float32),
"data": bytes("image bytes jkl", encoding='UTF-8')},
{"file_name": "005.jpg", "label": 78, "score": 7.7, "masks": np.array([3, 1, 2], dtype=np.int64),
"segments": np.array([[0.6, 8.1], [5.3, 49.3]], dtype=np.float32),
"data": bytes("image bytes mno", encoding='UTF-8')},
{"file_name": "006.jpg", "label": 37, "score": 9.4, "masks": np.array([7, 6, 7], dtype=np.int64),
"segments": np.array([[4.2, 6.3], [8.9, 81.8]], dtype=np.float32),
"data": bytes("image bytes pqr", encoding='UTF-8')}
]
writer = FileWriter(mindrecord_file_name)
schema = {"file_name": {"type": "string"},
"label": {"type": "int32"},
"score": {"type": "float64"},
"mask": {"type": "int64", "shape": [-1]},
"segments": {"type": "float32", "shape": [2, 2]},
"data": {"type": "bytes"}}
writer.add_schema(schema, "data is so cool")
writer.write_raw_data(data)
writer.commit()
# field: data => image
with pytest.raises(RuntimeError, match="Unexpected error. Invalid data, schema count should be positive."):
| with pytest.raises(RuntimeError, match="Unexpected error. Invalid data, " \ | |||
| "the number of schema should be positive but got:"): | |||
| remove_one_file(mindrecord_file_name) | |||
| remove_one_file(mindrecord_file_name + ".db") | |||
| @@ -664,78 +631,9 @@ def test_write_with_invalid_data(): | |||
| writer.write_raw_data(data) | |||
| writer.commit() | |||
| # field: label => labels | |||
| with pytest.raises(RuntimeError, match="Unexpected error. Invalid data, schema count should be positive."): | |||
| remove_one_file(mindrecord_file_name) | |||
| remove_one_file(mindrecord_file_name + ".db") | |||
| data = [{"file_name": "001.jpg", "labels": 43, "score": 0.8, "mask": np.array([3, 6, 9], dtype=np.int64), | |||
| "segments": np.array([[5.0, 1.6], [65.2, 8.3]], dtype=np.float32), | |||
| "data": bytes("image bytes abc", encoding='UTF-8')}, | |||
| {"file_name": "002.jpg", "labels": 91, "score": 5.4, "mask": np.array([1, 4, 7], dtype=np.int64), | |||
| "segments": np.array([[5.1, 9.1], [2.0, 65.4]], dtype=np.float32), | |||
| "data": bytes("image bytes def", encoding='UTF-8')}, | |||
| {"file_name": "003.jpg", "labels": 61, "score": 6.4, "mask": np.array([7, 6, 3], dtype=np.int64), | |||
| "segments": np.array([[0.0, 5.6], [3.0, 16.3]], dtype=np.float32), | |||
| "data": bytes("image bytes ghi", encoding='UTF-8')}, | |||
| {"file_name": "004.jpg", "labels": 29, "score": 8.1, "mask": np.array([2, 8, 0], dtype=np.int64), | |||
| "segments": np.array([[5.9, 7.2], [4.0, 89.0]], dtype=np.float32), | |||
| "data": bytes("image bytes jkl", encoding='UTF-8')}, | |||
| {"file_name": "005.jpg", "labels": 78, "score": 7.7, "mask": np.array([3, 1, 2], dtype=np.int64), | |||
| "segments": np.array([[0.6, 8.1], [5.3, 49.3]], dtype=np.float32), | |||
| "data": bytes("image bytes mno", encoding='UTF-8')}, | |||
| {"file_name": "006.jpg", "labels": 37, "score": 9.4, "mask": np.array([7, 6, 7], dtype=np.int64), | |||
| "segments": np.array([[4.2, 6.3], [8.9, 81.8]], dtype=np.float32), | |||
| "data": bytes("image bytes pqr", encoding='UTF-8')} | |||
| ] | |||
| writer = FileWriter(mindrecord_file_name) | |||
| schema = {"file_name": {"type": "string"}, | |||
| "label": {"type": "int32"}, | |||
| "score": {"type": "float64"}, | |||
| "mask": {"type": "int64", "shape": [-1]}, | |||
| "segments": {"type": "float32", "shape": [2, 2]}, | |||
| "data": {"type": "bytes"}} | |||
| writer.add_schema(schema, "data is so cool") | |||
| writer.write_raw_data(data) | |||
| writer.commit() | |||
| # field: score => scores | |||
| with pytest.raises(RuntimeError, match="Unexpected error. Invalid data, schema count should be positive."): | |||
| remove_one_file(mindrecord_file_name) | |||
| remove_one_file(mindrecord_file_name + ".db") | |||
| data = [{"file_name": "001.jpg", "label": 43, "scores": 0.8, "mask": np.array([3, 6, 9], dtype=np.int64), | |||
| "segments": np.array([[5.0, 1.6], [65.2, 8.3]], dtype=np.float32), | |||
| "data": bytes("image bytes abc", encoding='UTF-8')}, | |||
| {"file_name": "002.jpg", "label": 91, "scores": 5.4, "mask": np.array([1, 4, 7], dtype=np.int64), | |||
| "segments": np.array([[5.1, 9.1], [2.0, 65.4]], dtype=np.float32), | |||
| "data": bytes("image bytes def", encoding='UTF-8')}, | |||
| {"file_name": "003.jpg", "label": 61, "scores": 6.4, "mask": np.array([7, 6, 3], dtype=np.int64), | |||
| "segments": np.array([[0.0, 5.6], [3.0, 16.3]], dtype=np.float32), | |||
| "data": bytes("image bytes ghi", encoding='UTF-8')}, | |||
| {"file_name": "004.jpg", "label": 29, "scores": 8.1, "mask": np.array([2, 8, 0], dtype=np.int64), | |||
| "segments": np.array([[5.9, 7.2], [4.0, 89.0]], dtype=np.float32), | |||
| "data": bytes("image bytes jkl", encoding='UTF-8')}, | |||
| {"file_name": "005.jpg", "label": 78, "scores": 7.7, "mask": np.array([3, 1, 2], dtype=np.int64), | |||
| "segments": np.array([[0.6, 8.1], [5.3, 49.3]], dtype=np.float32), | |||
| "data": bytes("image bytes mno", encoding='UTF-8')}, | |||
| {"file_name": "006.jpg", "label": 37, "scores": 9.4, "mask": np.array([7, 6, 7], dtype=np.int64), | |||
| "segments": np.array([[4.2, 6.3], [8.9, 81.8]], dtype=np.float32), | |||
| "data": bytes("image bytes pqr", encoding='UTF-8')} | |||
| ] | |||
| writer = FileWriter(mindrecord_file_name) | |||
| schema = {"file_name": {"type": "string"}, | |||
| "label": {"type": "int32"}, | |||
| "score": {"type": "float64"}, | |||
| "mask": {"type": "int64", "shape": [-1]}, | |||
| "segments": {"type": "float32", "shape": [2, 2]}, | |||
| "data": {"type": "bytes"}} | |||
| writer.add_schema(schema, "data is so cool") | |||
| writer.write_raw_data(data) | |||
| writer.commit() | |||
| # string type with int value | |||
| with pytest.raises(RuntimeError, match="Unexpected error. Invalid data, schema count should be positive."): | |||
| with pytest.raises(RuntimeError, match="Unexpected error. Invalid data, " \ | |||
| "the number of schema should be positive but got:"): | |||
| remove_one_file(mindrecord_file_name) | |||
| remove_one_file(mindrecord_file_name + ".db") | |||
| @@ -770,7 +668,8 @@ def test_write_with_invalid_data(): | |||
| writer.commit() | |||
| # field with int64 type, but the real data is string | |||
| with pytest.raises(RuntimeError, match="Unexpected error. Invalid data, schema count should be positive."): | |||
| with pytest.raises(RuntimeError, match="Unexpected error. Invalid data, " \ | |||
| "the number of schema should be positive but got:"): | |||
| remove_one_file(mindrecord_file_name) | |||
| remove_one_file(mindrecord_file_name + ".db") | |||
| @@ -805,7 +704,8 @@ def test_write_with_invalid_data(): | |||
| writer.commit() | |||
| # bytes field is string | |||
| with pytest.raises(RuntimeError, match="Unexpected error. Invalid data, schema count should be positive."): | |||
| with pytest.raises(RuntimeError, match="Unexpected error. Invalid data, " \ | |||
| "the number of schema should be positive but got:"): | |||
| remove_one_file(mindrecord_file_name) | |||
| remove_one_file(mindrecord_file_name + ".db") | |||
| @@ -840,7 +740,8 @@ def test_write_with_invalid_data(): | |||
| writer.commit() | |||
| # field is not numpy type | |||
| with pytest.raises(RuntimeError, match="Unexpected error. Invalid data, schema count should be positive."): | |||
| with pytest.raises(RuntimeError, match="Unexpected error. Invalid data, " \ | |||
| "the number of schema should be positive but got:"): | |||
| remove_one_file(mindrecord_file_name) | |||
| remove_one_file(mindrecord_file_name + ".db") | |||
| @@ -875,7 +776,8 @@ def test_write_with_invalid_data(): | |||
| writer.commit() | |||
| # not enough field | |||
| with pytest.raises(RuntimeError, match="Unexpected error. Invalid data, schema count should be positive."): | |||
| with pytest.raises(RuntimeError, match="Unexpected error. Invalid data, " \ | |||
| "the number of schema should be positive but got:"): | |||
| remove_one_file(mindrecord_file_name) | |||
| remove_one_file(mindrecord_file_name + ".db") | |||