Merge pull request !1308 from h.farahat/string_tensor2tags/v0.3.0-alpha
| @@ -1,6 +1,10 @@ | |||
| ms_protobuf_generate(EXAMPLE_SRCS EXAMPLE_HDRS example.proto) | |||
| ms_protobuf_generate(FEATURE_SRCS FEATURE_HDRS feature.proto) | |||
| file(GLOB_RECURSE _CURRENT_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc") | |||
| set_property(SOURCE ${_CURRENT_SRC_FILES} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_MD) | |||
| add_library(core OBJECT | |||
| ${EXAMPLE_SRCS} | |||
| ${FEATURE_SRCS} | |||
| client.cc | |||
| config_manager.cc | |||
| cv_tensor.cc | |||
| @@ -9,4 +13,5 @@ add_library(core OBJECT | |||
| tensor.cc | |||
| tensor_shape.cc | |||
| ) | |||
| add_dependencies(core mindspore::protobuf) | |||
| target_include_directories(core PRIVATE ${pybind11_INCLUDE_DIRS}) | |||
| @@ -25,14 +25,14 @@ namespace dataset { | |||
| uint8_t DataType::SizeInBytes() const { | |||
| if (type_ < DataType::NUM_OF_TYPES) | |||
| return SIZE_IN_BYTES[type_]; | |||
| return kTypeInfo[type_].sizeInBytes_; | |||
| else | |||
| return 0; | |||
| } | |||
| py::dtype DataType::AsNumpyType() const { | |||
| if (type_ < DataType::NUM_OF_TYPES) | |||
| return py::dtype(PYBIND_TYPES[type_]); | |||
| return py::dtype(kTypeInfo[type_].pybindType_); | |||
| else | |||
| return py::dtype("unknown"); | |||
| } | |||
| @@ -40,7 +40,7 @@ py::dtype DataType::AsNumpyType() const { | |||
| uint8_t DataType::AsCVType() const { | |||
| uint8_t res = kCVInvalidType; | |||
| if (type_ < DataType::NUM_OF_TYPES) { | |||
| res = CV_TYPES[type_]; | |||
| res = kTypeInfo[type_].cvType_; | |||
| } | |||
| if (res == kCVInvalidType) { | |||
| @@ -108,7 +108,7 @@ DataType::DataType(const std::string &type_str) { | |||
| std::string DataType::ToString() const { | |||
| if (type_ < DataType::NUM_OF_TYPES) | |||
| return TO_STRINGS[type_]; | |||
| return kTypeInfo[type_].name_; | |||
| else | |||
| return "unknown"; | |||
| } | |||
| @@ -149,7 +149,7 @@ DataType DataType::FromNpArray(const py::array &arr) { | |||
| std::string DataType::GetPybindFormat() const { | |||
| std::string res; | |||
| if (type_ < DataType::NUM_OF_TYPES) { | |||
| res = PYBIND_FORMAT_DESCRIPTOR[type_]; | |||
| res = kTypeInfo[type_].pybindFormatDescriptor_; | |||
| } | |||
| if (res.empty()) { | |||
| @@ -51,56 +51,31 @@ class DataType { | |||
| NUM_OF_TYPES | |||
| }; | |||
| inline static constexpr uint8_t SIZE_IN_BYTES[] = {0, // DE_UNKNOWN | |||
| 1, // DE_BOOL | |||
| 1, // DE_INT8 | |||
| 1, // DE_UINT8 | |||
| 2, // DE_INT16 | |||
| 2, // DE_UINT16 | |||
| 4, // DE_INT32 | |||
| 4, // DE_UINT32 | |||
| 8, // DE_INT64 | |||
| 8, // DE_UINT64 | |||
| 2, // DE_FLOAT16 | |||
| 4, // DE_FLOAT32 | |||
| 8, // DE_FLOAT64 | |||
| 0}; // DE_STRING | |||
| inline static const char *TO_STRINGS[] = {"unknown", "bool", "int8", "uint8", "int16", "uint16", "int32", | |||
| "uint32", "int64", "uint64", "float16", "float32", "float64", "string"}; | |||
| inline static const char *PYBIND_TYPES[] = {"object", "bool", "int8", "uint8", "int16", "uint16", "int32", | |||
| "uint32", "int64", "uint64", "float16", "float32", "double", "bytes"}; | |||
| inline static const std::string PYBIND_FORMAT_DESCRIPTOR[] = {"", // DE_UNKNOWN | |||
| py::format_descriptor<bool>::format(), // DE_BOOL | |||
| py::format_descriptor<int8_t>::format(), // DE_INT8 | |||
| py::format_descriptor<uint8_t>::format(), // DE_UINT8 | |||
| py::format_descriptor<int16_t>::format(), // DE_INT16 | |||
| py::format_descriptor<uint16_t>::format(), // DE_UINT16 | |||
| py::format_descriptor<int32_t>::format(), // DE_INT32 | |||
| py::format_descriptor<uint32_t>::format(), // DE_UINT32 | |||
| py::format_descriptor<int64_t>::format(), // DE_INT64 | |||
| py::format_descriptor<uint64_t>::format(), // DE_UINT64 | |||
| "e", // DE_FLOAT16 | |||
| py::format_descriptor<float>::format(), // DE_FLOAT32 | |||
| py::format_descriptor<double>::format(), // DE_FLOAT64 | |||
| "S"}; // DE_STRING | |||
| inline static constexpr uint8_t CV_TYPES[] = {kCVInvalidType, // DE_UNKNOWN | |||
| CV_8U, // DE_BOOL | |||
| CV_8S, // DE_INT8 | |||
| CV_8U, // DE_UINT8 | |||
| CV_16S, // DE_INT16 | |||
| CV_16U, // DE_UINT16 | |||
| CV_32S, // DE_INT32 | |||
| kCVInvalidType, // DE_UINT32 | |||
| kCVInvalidType, // DE_INT64 | |||
| kCVInvalidType, // DE_UINT64 | |||
| CV_16F, // DE_FLOAT16 | |||
| CV_32F, // DE_FLOAT32 | |||
| CV_64F, // DE_FLOAT64 | |||
| kCVInvalidType}; // DE_STRING | |||
| struct TypeInfo { | |||
| const char *name_; // name to be represent the type while printing | |||
| const uint8_t sizeInBytes_; // number of bytes needed for this type | |||
| const char *pybindType_; // Python matching type, used in get_output_types | |||
| const std::string pybindFormatDescriptor_; // pybind format used for numpy types | |||
| const uint8_t cvType_; // OpenCv matching type | |||
| }; | |||
| static inline const TypeInfo kTypeInfo[] = { | |||
| // name, sizeInBytes, pybindTypem formatDescriptor, openCV | |||
| {"unknown", 0, "object", "", kCVInvalidType}, // DE_UNKNOWN | |||
| {"bool", 1, "bool", py::format_descriptor<bool>::format(), CV_8U}, // DE_BOOL | |||
| {"int8", 1, "int8", py::format_descriptor<int8_t>::format(), CV_8S}, // DE_INT8 | |||
| {"uint8", 1, "uint8", py::format_descriptor<uint8_t>::format(), CV_8U}, // DE_UINT8 | |||
| {"int16", 2, "int16", py::format_descriptor<int16_t>::format(), CV_16S}, // DE_INT16 | |||
| {"uint16", 2, "uint16", py::format_descriptor<uint16_t>::format(), CV_16U}, // DE_UINT16 | |||
| {"int32", 4, "int32", py::format_descriptor<int32_t>::format(), CV_32S}, // DE_INT32 | |||
| {"uint32", 4, "uint32", py::format_descriptor<uint32_t>::format(), kCVInvalidType}, // DE_UINT32 | |||
| {"int64", 8, "int64", py::format_descriptor<int64_t>::format(), kCVInvalidType}, // DE_INT64 | |||
| {"uint64", 8, "uint64", py::format_descriptor<uint64_t>::format(), kCVInvalidType}, // DE_UINT64 | |||
| {"float16", 2, "float16", "e", CV_16F}, // DE_FLOAT16 | |||
| {"float32", 4, "float32", py::format_descriptor<float>::format(), CV_32F}, // DE_FLOAT32 | |||
| {"float64", 8, "double", py::format_descriptor<double>::format(), CV_64F}, // DE_FLOAT64 | |||
| {"string", 0, "bytes", "S", kCVInvalidType} // DE_STRING | |||
| }; | |||
| // No arg constructor to create an unknown shape | |||
| DataType() : type_(DE_UNKNOWN) {} | |||
| @@ -57,18 +57,40 @@ Tensor::Tensor(const TensorShape &shape, const DataType &type) : shape_(shape), | |||
| } | |||
| Tensor::Tensor(const TensorShape &shape, const DataType &type, const unsigned char *data) : Tensor(shape, type) { | |||
| if (type.IsNumeric()) { | |||
| // If the data pointer was given, then we can also populate the tensor with data | |||
| if (data != nullptr) { | |||
| // Given the shape/type of this tensor, compute the data size and copy in the input bytes. | |||
| int64_t byte_size = this->SizeInBytes(); | |||
| Status s = this->AllocateBuffer(byte_size); // Allocates data_ inside itself | |||
| if (s.IsOk() && data_ != nullptr) { | |||
| int ret_code = memcpy_s(data_, byte_size, data, byte_size); | |||
| if (ret_code != 0) { | |||
| MS_LOG(ERROR) << "Failed to copy data into Tensor!"; | |||
| } | |||
| } else { | |||
| MS_LOG(ERROR) << "Failed to create memory for Tensor!"; | |||
| } | |||
| } | |||
| } else { | |||
| MS_LOG(ERROR) << "Type should be numeric to use this constructor."; | |||
| } | |||
| } | |||
| Tensor::Tensor(const TensorShape &shape, const DataType &type, const unsigned char *data, const dsize_t &length) | |||
| : Tensor(shape, type) { | |||
| // If the data pointer was given, then we can also populate the tensor with data | |||
| if (data != nullptr) { | |||
| // Given the shape/type of this tensor, compute the data size and copy in the input bytes. | |||
| int64_t byte_size = this->SizeInBytes(); | |||
| static_cast<void>(this->GetMutableBuffer()); // Allocates data_ inside itself | |||
| // Allocates data_ inside itself | |||
| Status s = AllocateBuffer(length); | |||
| if (s.IsError()) { | |||
| MS_LOG(ERROR) << "Failed to create memory for Tensor!"; | |||
| } | |||
| if (data_ != nullptr) { | |||
| int ret_code = memcpy_s(data_, byte_size, data, byte_size); | |||
| int ret_code = memcpy_s(data_, length, data, length); | |||
| if (ret_code != 0) { | |||
| MS_LOG(ERROR) << "Failed to copy data into Tensor!"; | |||
| } | |||
| } else { | |||
| MS_LOG(ERROR) << "Failed to create memory for Tensor!"; | |||
| } | |||
| } | |||
| } | |||
| @@ -98,32 +120,79 @@ Tensor::Tensor(const std::vector<std::string> &strings, const TensorShape &shape | |||
| auto length_sum = [](dsize_t sum, const std::string &s) { return s.length() + sum; }; | |||
| dsize_t total_length = std::accumulate(strings.begin(), strings.end(), 0, length_sum); | |||
| dsize_t num_bytes = (kOffsetSize + 1) * shape_.NumOfElements() + total_length; | |||
| // total bytes needed = offset array + strings | |||
| // offset array needs to store one offset var per element + 1 extra to get the length of the last string. | |||
| // strings will be null-terminated --> need 1 extra byte per element | |||
| dsize_t num_bytes = (kOffsetSize + 1) * shape_.NumOfElements() + kOffsetSize + total_length; | |||
| data_ = data_allocator_->allocate(num_bytes); | |||
| auto offset_arr = reinterpret_cast<offset_t *>(data_); | |||
| uchar *buf = GetStringsBuffer(); | |||
| offset_t offset = -1; | |||
| offset_t offset = buf - data_; // the first string will start here | |||
| uint32_t i = 0; | |||
| for (const auto &str : strings) { | |||
| // insert the end index of the string | |||
| // end index of a string is the end index of previous string + the length (including \0) | |||
| offset = offset + str.length() + 1; | |||
| // insert the start index of the string. | |||
| offset_arr[i++] = offset; | |||
| // total bytes are reduced by kOffsetSize | |||
| num_bytes -= kOffsetSize; | |||
| // insert actual string | |||
| memcpy_s(buf, num_bytes, str.c_str(), str.length() + 1); | |||
| buf += str.length() + 1; | |||
| int ret_code = memcpy_s(data_ + offset, num_bytes, common::SafeCStr(str), str.length() + 1); | |||
| if (ret_code != 0) MS_LOG(ERROR) << "Cannot copy string into Tensor"; | |||
| // next string will be stored right after the current one. | |||
| offset = offset + str.length() + 1; | |||
| // total bytes are reduced by the length of the string | |||
| num_bytes -= str.length() + 1; | |||
| } | |||
| this->data_end_ = buf; | |||
| // store one more offset value so we can get the length of the last string | |||
| // length[last_element] = offset_arr[last_element + 1] - offset_arr[last_element] | |||
| offset_arr[i] = offset; | |||
| this->data_end_ = data_ + offset_arr[i]; | |||
| DS_ASSERT(num_bytes == 0); | |||
| if (shape.known()) Tensor::Reshape(shape); | |||
| } | |||
| Tensor::Tensor(const dataengine::BytesList &bytes_list, const TensorShape &shape) | |||
| : Tensor(TensorShape({static_cast<dsize_t>(bytes_list.value_size())}), DataType(DataType::DE_STRING)) { | |||
| // total bytes needed = offset array + strings | |||
| // offset array needs to store one offset var per element + 1 extra to get the length of the last string. | |||
| // strings will be null-terminated --> need 1 extra byte per element | |||
| dsize_t num_bytes = (kOffsetSize)*shape_.NumOfElements() + kOffsetSize + bytes_list.ByteSizeLong(); | |||
| data_ = data_allocator_->allocate(num_bytes); | |||
| auto offset_arr = reinterpret_cast<offset_t *>(data_); | |||
| uchar *buf = GetStringsBuffer(); | |||
| offset_t offset = buf - data_; // the first string will start here | |||
| uint32_t i = 0; | |||
| for (; i < bytes_list.value_size(); i++) { | |||
| const std::string &str = bytes_list.value(i); | |||
| // insert the start index of the string. | |||
| offset_arr[i] = offset; | |||
| // total bytes are reduced by kOffsetSize | |||
| num_bytes -= kOffsetSize; | |||
| // insert actual string | |||
| int ret_code = memcpy_s(data_ + offset, num_bytes, common::SafeCStr(str), str.length() + 1); | |||
| if (ret_code != 0) { | |||
| MS_LOG(ERROR) << "Cannot copy string into Tensor"; | |||
| } | |||
| // next string will be stored right after the current one. | |||
| offset = offset + str.length() + 1; | |||
| // total bytes are reduced by the length of the string | |||
| num_bytes -= str.length() + 1; | |||
| } | |||
| // store one more offset value so we can get the length of the last string | |||
| // length[last_element] = offset_arr[last_element + 1] - offset_arr[last_element] | |||
| offset_arr[i] = offset; | |||
| data_end_ = data_ + offset_arr[i]; | |||
| DS_ASSERT(num_bytes == 0); | |||
| if (shape.known()) Tensor::Reshape(shape); | |||
| } | |||
| Status Tensor::CreateTensor(std::shared_ptr<Tensor> *ptr, TensorImpl tensor_impl, const TensorShape &shape, | |||
| DataType type, const unsigned char *data) { | |||
| if (!shape.known()) { | |||
| @@ -152,20 +221,17 @@ Status Tensor::CreateTensor(std::shared_ptr<Tensor> *ptr, TensorImpl tensor_impl | |||
| } | |||
| return Status::OK(); // returns base-class shared_ptr | |||
| } | |||
| std::string to(std::string x) { return x; } | |||
| Status Tensor::CreateTensorFromNumpyString(std::shared_ptr<Tensor> *ptr, py::array arr) { | |||
| std::vector<dsize_t> shape; | |||
| for (dsize_t i = 0; i < arr.ndim(); i++) { | |||
| shape.push_back(static_cast<dsize_t>(arr.shape()[i])); | |||
| } | |||
| arr.resize({arr.size()}); | |||
| auto itr = arr.begin(); | |||
| arr.resize({arr.size()}); // flatten the py::array so we can iterate once | |||
| std::vector<std::string> strings; | |||
| for (; itr != arr.end(); itr++) { | |||
| std::string s = to(py::cast<py::bytes>(*itr)); | |||
| strings.push_back(s); | |||
| } | |||
| arr.resize(shape); | |||
| std::for_each(arr.begin(), arr.end(), [&strings](const auto &s) { strings.emplace_back(py::cast<py::bytes>(s)); }); | |||
| arr.resize(shape); // resize arr back to the original shape | |||
| return CreateTensor(ptr, strings, TensorShape{shape}); | |||
| } | |||
| @@ -190,8 +256,9 @@ Status Tensor::CreateTensor(std::shared_ptr<Tensor> *ptr, py::array arr) { | |||
| std::shared_ptr<MemoryPool> global_pool = GlobalContext::Instance()->mem_pool(); | |||
| (*ptr)->data_allocator_ = std::make_unique<Allocator<unsigned char>>(global_pool); | |||
| static_cast<void>((*ptr)->GetMutableBuffer()); | |||
| int64_t byte_size = (*ptr)->SizeInBytes(); | |||
| RETURN_IF_NOT_OK((*ptr)->AllocateBuffer(byte_size)); | |||
| unsigned char *data = static_cast<unsigned char *>(arr.request().ptr); | |||
| if ((*ptr)->data_ == nullptr) { | |||
| RETURN_STATUS_UNEXPECTED("Failed to create memory for Tensor."); | |||
| @@ -232,6 +299,13 @@ Status Tensor::CreateTensor(std::shared_ptr<Tensor> *ptr, const std::vector<std: | |||
| return Status::OK(); | |||
| } | |||
| Status Tensor::CreateTensor(std::shared_ptr<Tensor> *ptr, const dataengine::BytesList &bytes_list, | |||
| const TensorShape &shape) { | |||
| const TensorAlloc *alloc = GlobalContext::Instance()->tensor_allocator(); | |||
| *ptr = std::allocate_shared<Tensor>(*alloc, bytes_list, shape); | |||
| return Status::OK(); | |||
| } | |||
| // Memcpy the given strided array's used part to consecutive memory | |||
| // Consider a 3-d array | |||
| // A[(i * shape[1] + j) * shape[2] + k] = B[i][j][k] = C[i * strides[0] + j * strides[1] + k * strides[2]] | |||
| @@ -370,25 +444,20 @@ void Tensor::Print(std::ostream &out) const { | |||
| out << "[Data area is null]"; | |||
| } | |||
| } | |||
| // Name: ToFlatIndex() | |||
| // Description: convert a vector style index to number, used to access memory internal use only | |||
| Status Tensor::ToFlatIndex(const std::vector<dsize_t> &index, dsize_t *flat_index) const { | |||
| if (!shape_.IsValidIndex(index)) { | |||
| std::string err = "Not a valid index"; | |||
| RETURN_STATUS_UNEXPECTED(err); | |||
| } | |||
| *flat_index = 0; | |||
| for (size_t k = 0; k < index.size(); k++) { | |||
| dsize_t product = 1; | |||
| for (size_t l = k + 1; l < index.size(); l++) { | |||
| product *= shape_[l]; | |||
| Status Tensor::AllocateBuffer(const dsize_t &length) { | |||
| if (data_ == nullptr) { | |||
| if (data_allocator_ != nullptr) { | |||
| data_ = data_allocator_->allocate(length); | |||
| RETURN_UNEXPECTED_IF_NULL(data_); | |||
| data_end_ = data_ + length; | |||
| } else { | |||
| data_ = static_cast<unsigned char *>(malloc(length)); | |||
| data_end_ = data_ + length; | |||
| RETURN_UNEXPECTED_IF_NULL(data_); | |||
| } | |||
| *flat_index += index[k] * product; | |||
| } | |||
| return Status::OK(); | |||
| } | |||
| const unsigned char *Tensor::GetBuffer() const { | |||
| // This version cannot modify anything. data_ could possibly be null. | |||
| return data_; | |||
| @@ -404,17 +473,11 @@ unsigned char *Tensor::GetMutableBuffer() { | |||
| } else { | |||
| // If the data area is not created, then identify the memory size based | |||
| // on the shape and type and allocate it. | |||
| if (data_allocator_ != nullptr) { | |||
| data_ = data_allocator_->allocate(this->SizeInBytes()); | |||
| data_end_ = data_ + SizeInBytes(); | |||
| if (this->AllocateBuffer(this->SizeInBytes()).IsOk()) { | |||
| return data_; | |||
| } else { | |||
| data_ = static_cast<unsigned char *>(malloc(this->SizeInBytes())); | |||
| data_end_ = data_ + SizeInBytes(); | |||
| if (data_ == nullptr) { | |||
| return nullptr; | |||
| } | |||
| return nullptr; | |||
| } | |||
| return data_; | |||
| } | |||
| } | |||
| @@ -444,7 +507,7 @@ Status Tensor::GetItemPtr(T **ptr, const std::vector<dsize_t> &index) const { | |||
| RETURN_STATUS_UNEXPECTED(err); | |||
| } | |||
| dsize_t flat_idx; | |||
| RETURN_IF_NOT_OK(ToFlatIndex(index, &flat_idx)); | |||
| RETURN_IF_NOT_OK(shape_.ToFlatIndex(index, &flat_idx)); | |||
| *ptr = reinterpret_cast<T *>(data_ + flat_idx * type_.SizeInBytes()); | |||
| return Status::OK(); | |||
| @@ -461,7 +524,7 @@ Status Tensor::GetItemPtr(uchar **ptr, const std::vector<dsize_t> &index, offset | |||
| RETURN_STATUS_UNEXPECTED(err); | |||
| } | |||
| dsize_t flat_idx; | |||
| RETURN_IF_NOT_OK(ToFlatIndex(index, &flat_idx)); | |||
| RETURN_IF_NOT_OK(shape_.ToFlatIndex(index, &flat_idx)); | |||
| offset_t length_temp = 0; | |||
| RETURN_IF_NOT_OK(GetStringAt(flat_idx, ptr, &length_temp)); | |||
| if (length != nullptr) *length = length_temp; | |||
| @@ -481,7 +544,7 @@ Status Tensor::StartAddrOfIndex(std::vector<dsize_t> ind, uchar **start_addr_of_ | |||
| std::vector<dsize_t> r(t_shape.begin() + ind.size(), t_shape.end()); | |||
| *remaining = TensorShape(r); | |||
| ind.resize(this->Rank(), 0); // same as -> while (ind.size() < this->Rank()) ind.push_back(0); | |||
| RETURN_IF_NOT_OK(ToFlatIndex(ind, &flat_ind)); | |||
| RETURN_IF_NOT_OK(shape_.ToFlatIndex(ind, &flat_ind)); | |||
| // check if GetBuffer() returns null, we should flag this as an error, this sanity check will only | |||
| // be true is the tensor failed to allocate memory. | |||
| if (GetMutableBuffer() == nullptr) { | |||
| @@ -588,10 +651,10 @@ Status Tensor::GetItemAt(std::string_view *o, const std::vector<dsize_t> &index) | |||
| RETURN_UNEXPECTED_IF_NULL(o); | |||
| CHECK_FAIL_RETURN_UNEXPECTED(type_ == DataType::DE_STRING, "Type is not DE_STRING"); | |||
| uchar *buf = nullptr; | |||
| uchar *start = nullptr; | |||
| offset_t length = 0; | |||
| RETURN_IF_NOT_OK(GetItemPtr(&buf, index, &length)); | |||
| std::string_view sv{reinterpret_cast<const char *>(buf), length}; | |||
| RETURN_IF_NOT_OK(GetItemPtr(&start, index, &length)); | |||
| std::string_view sv{reinterpret_cast<const char *>(start)}; | |||
| o->swap(sv); | |||
| return Status::OK(); | |||
| } | |||
| @@ -778,13 +841,11 @@ Status Tensor::GetStringAt(dsize_t index, uchar **string_start, offset_t *length | |||
| RETURN_UNEXPECTED_IF_NULL(string_start); | |||
| RETURN_UNEXPECTED_IF_NULL(length); | |||
| auto *offset_ptr = reinterpret_cast<offset_t *>(data_); // offsets starts here | |||
| offset_t end = offset_ptr[index]; | |||
| offset_t start = 0; | |||
| if (index != 0) start = offset_ptr[index - 1] + 1; // string starts at where the previous string ends + 1 | |||
| uchar *buf = GetStringsBuffer(); // string data starts here | |||
| *string_start = buf + start; | |||
| *length = end - start; | |||
| offset_t start = offset_ptr[index]; | |||
| *string_start = data_ + start; | |||
| *length = offset_ptr[index + 1] - start - 1; // -1 to skip the \0 from the string length | |||
| return Status::OK(); | |||
| } | |||
| } // namespace dataset | |||
| } // namespace mindspore | |||
| @@ -35,6 +35,7 @@ | |||
| #include "dataset/util/allocator.h" | |||
| #include "dataset/util/de_error.h" | |||
| #include "dataset/util/status.h" | |||
| #include "proto/example.pb.h" | |||
| namespace py = pybind11; | |||
| namespace mindspore { | |||
| @@ -64,6 +65,8 @@ class Tensor { | |||
| // @param data unsigned char*, pointer to the data. | |||
| Tensor(const TensorShape &shape, const DataType &type, const unsigned char *data); | |||
| Tensor(const TensorShape &shape, const DataType &type, const unsigned char *data, const dsize_t &length); | |||
| Tensor(const Tensor &other) = delete; | |||
| Tensor &operator=(const Tensor &other) = delete; | |||
| @@ -72,6 +75,8 @@ class Tensor { | |||
| Tensor &operator=(Tensor &&other) noexcept; | |||
| Status AllocateBuffer(const dsize_t &length); | |||
| // type of offest values to store strings information | |||
| using offset_t = uint32_t; | |||
| // const of the size of the offset variable | |||
| @@ -84,15 +89,24 @@ class Tensor { | |||
| // Construct a tensor from a list of strings. Reshape the tensor with `shape` if given, otherwise assume the shape is | |||
| // the size of the vector `strings`. | |||
| // The memory layout of a Tensor of strings consists of the Offset_array followed by the strings. | |||
| // OFFSET1, OFFSET2, ... String1, String2, ... | |||
| // The value of each offset is the end index of the corresponding string | |||
| // Thr offset array will store one extra value to find the length of the last string. | |||
| // OFFSET1, OFFSET2, ..., OFFSETn+1, STRING1, STRING2, ..., STRINGn | |||
| // The value of each offset is the start index of the corresponding string | |||
| // Offsets is of type offest_t | |||
| // strings will ne null-terminated | |||
| // example: Tensor(['abc', 'de'], shape={2}, type=DE_STRING) | |||
| // 3 6 a b c \0 d e \0 | |||
| // |----------------------------------------------------------------| | |||
| // | OFFSET ARRAY | STRINGS | | |||
| // | bytes 0-3 | bytes 3-6 | bytes 7-10 | bytes 11-14 | bytes 15-17 | | |||
| // | 11 | 15 | 18 | abc\0 | de\0 | | |||
| // |----------------------------------------------------------------| | |||
| explicit Tensor(const std::vector<std::string> &strings, | |||
| const TensorShape &shape = TensorShape::CreateUnknownRankShape()); | |||
| // Same as Tensor(vector<string>) but the input is protobuf bytelist | |||
| explicit Tensor(const dataengine::BytesList &bytes_list, | |||
| const TensorShape &shape = TensorShape::CreateUnknownRankShape()); | |||
| // A static factory method to create the given flavour of derived Tensor | |||
| // Returns the base class reference for the Tensor. | |||
| // @param ptr output argument to hold the created Tensor of given tensor_impl | |||
| @@ -121,6 +135,9 @@ class Tensor { | |||
| static Status CreateTensor(std::shared_ptr<Tensor> *ptr, const std::vector<std::string> &strings, | |||
| const TensorShape &shape = TensorShape::CreateUnknownRankShape()); | |||
| static Status CreateTensor(std::shared_ptr<Tensor> *ptr, const dataengine::BytesList &bytes_list, | |||
| const TensorShape &shape); | |||
| // Copy raw data of a array based on shape and strides to the destination pointer | |||
| // @param dst Pointer to the destination array where the content is to be copied | |||
| // @param src Pointer to the source of strided array to be copied | |||
| @@ -166,7 +183,7 @@ class Tensor { | |||
| // @param value of type `T` | |||
| template <typename T> | |||
| Status SetItemAt(const std::vector<dsize_t> &index, const T &value) { | |||
| static_cast<void>(GetMutableBuffer()); | |||
| RETURN_IF_NOT_OK(AllocateBuffer(SizeInBytes())); | |||
| T *ptr = nullptr; | |||
| RETURN_IF_NOT_OK(GetItemPtr<T>(&ptr, index)); | |||
| *ptr = value; | |||
| @@ -203,7 +220,7 @@ class Tensor { | |||
| template <typename T> | |||
| Status Fill(const T &value) { | |||
| CHECK_FAIL_RETURN_UNEXPECTED(type_ != DataType::DE_STRING, "Cannot use fill on tensor of strings."); | |||
| static_cast<void>(GetMutableBuffer()); | |||
| RETURN_IF_NOT_OK(AllocateBuffer(SizeInBytes())); | |||
| int64_t cellSize = type_.SizeInBytes(); | |||
| if ((data_ != nullptr) && type_.IsCompatible<T>()) { | |||
| for (dsize_t i = 0; i < Size(); i++) { | |||
| @@ -418,32 +435,28 @@ class Tensor { | |||
| using pointer = std::string_view *; | |||
| using reference = std::string_view &; | |||
| explicit TensorIterator(uchar *offset = nullptr, const uchar *buf = nullptr, dsize_t index = 0) { | |||
| offset_ = reinterpret_cast<offset_t *>(offset); | |||
| buf_ = reinterpret_cast<const char *>(buf); | |||
| explicit TensorIterator(uchar *data = nullptr, dsize_t index = 0) { | |||
| data_ = reinterpret_cast<const char *>(data); | |||
| index_ = index; | |||
| } | |||
| TensorIterator(const TensorIterator<std::string_view, DUMMY> &raw_iterator) { | |||
| offset_ = raw_iterator.offset_; | |||
| buf_ = raw_iterator.buf_; | |||
| data_ = raw_iterator.data_; | |||
| index_ = raw_iterator.index_; | |||
| } | |||
| ~TensorIterator() = default; | |||
| bool operator==(const TensorIterator<std::string_view> &rhs) { | |||
| return buf_ == rhs.buf_ && offset_ == rhs.offset_ && index_ == rhs.index_; | |||
| } | |||
| bool operator==(const TensorIterator<std::string_view> &rhs) { return data_ == rhs.data_ && index_ == rhs.index_; } | |||
| bool operator!=(const TensorIterator<std::string_view> &rhs) { return !(*this == rhs); } | |||
| operator bool() const { return offset_ != nullptr; } | |||
| operator bool() const { return data_ != nullptr; } | |||
| std::string_view operator*() const { | |||
| offset_t start = 0; | |||
| if (index_ != 0) start = offset_[index_ - 1] + 1; | |||
| return std::string_view{buf_ + start}; | |||
| auto offset_ = reinterpret_cast<const offset_t *>(data_); | |||
| offset_t start = offset_[index_]; | |||
| return std::string_view{data_ + start}; | |||
| } | |||
| TensorIterator<std::string_view> &operator+=(const dsize_t &inc) { | |||
| @@ -496,8 +509,7 @@ class Tensor { | |||
| protected: | |||
| dsize_t index_; | |||
| offset_t *offset_; | |||
| const char *buf_; | |||
| const char *data_; | |||
| }; | |||
| // Return a TensorIterator that points to the start of the Tensor. | |||
| @@ -518,11 +530,6 @@ class Tensor { | |||
| } | |||
| protected: | |||
| // Returns the location of the item assuming row major memory layout. | |||
| // @param index | |||
| // @return | |||
| Status ToFlatIndex(const std::vector<dsize_t> &index, dsize_t *flat_index) const; | |||
| // A function that prints Tensor recursively, first called by print | |||
| // @param out | |||
| // @param cur_dim | |||
| @@ -559,7 +566,7 @@ class Tensor { | |||
| // Skip the offsets and returns the start of the buffer where the real strings is stored. Caller needs to check if the | |||
| // tensor's type is a string, otherwise undefined address would be returned. | |||
| // @return address of the first string of the tensor. | |||
| uchar *GetStringsBuffer() const { return data_ + kOffsetSize * shape_.NumOfElements(); } | |||
| uchar *GetStringsBuffer() const { return data_ + kOffsetSize * shape_.NumOfElements() + kOffsetSize; } | |||
| // all access to shape_ should be via shape | |||
| TensorShape shape_; | |||
| @@ -573,14 +580,8 @@ class Tensor { | |||
| unsigned char *data_end_ = nullptr; | |||
| }; | |||
| template <> | |||
| inline Tensor::TensorIterator<std::string_view> Tensor::begin<std::string_view>() { | |||
| uchar *buf = GetStringsBuffer(); | |||
| return TensorIterator<std::string_view>(data_, buf); | |||
| } | |||
| template <> | |||
| inline Tensor::TensorIterator<std::string_view> Tensor::end<std::string_view>() { | |||
| uchar *buf = GetStringsBuffer(); | |||
| return TensorIterator<std::string_view>(data_, buf, shape_.NumOfElements()); | |||
| return TensorIterator<std::string_view>(data_, shape_.NumOfElements()); | |||
| } | |||
| } // namespace dataset | |||
| } // namespace mindspore | |||
| @@ -40,16 +40,7 @@ dsize_t TensorShape::NumOfElements() const { | |||
| if (!known()) { | |||
| return 0; | |||
| } | |||
| dsize_t num = 1; | |||
| for (auto i : raw_shape_) { | |||
| if (multi_ok(num, i)) { | |||
| num *= i; | |||
| } else { | |||
| // dsize_t can wrap since it is signed int, we double check here | |||
| MS_LOG(ERROR) << "Tensor shape larger than maximum allowed value!"; | |||
| } | |||
| } | |||
| return num; | |||
| return strides_[0]; | |||
| } | |||
| void TensorShape::Print(std::ostream &out) const { | |||
| @@ -72,20 +63,23 @@ void TensorShape::Print(std::ostream &out) const { | |||
| } | |||
| TensorShape::TensorShape(const std::initializer_list<dsize_t> &list) | |||
| : raw_shape_(*GlobalContext::Instance()->int_allocator()) { | |||
| : raw_shape_(*GlobalContext::Instance()->int_allocator()), strides_(*GlobalContext::Instance()->int_allocator()) { | |||
| AddListToShape(list); | |||
| } | |||
| TensorShape::TensorShape(const std::vector<dsize_t> &list) : raw_shape_(*GlobalContext::Instance()->int_allocator()) { | |||
| TensorShape::TensorShape(const std::vector<dsize_t> &list) | |||
| : raw_shape_(*GlobalContext::Instance()->int_allocator()), strides_(*GlobalContext::Instance()->int_allocator()) { | |||
| AddListToShape(list); | |||
| } | |||
| TensorShape::TensorShape(const TensorShape &shape) : raw_shape_(*GlobalContext::Instance()->int_allocator()) { | |||
| TensorShape::TensorShape(const TensorShape &shape) | |||
| : raw_shape_(*GlobalContext::Instance()->int_allocator()), strides_(*GlobalContext::Instance()->int_allocator()) { | |||
| AddListToShape(shape.AsVector()); | |||
| known_ = shape.known_; // override with the input shape in case of unknown-rank tensor shape. | |||
| } | |||
| TensorShape::TensorShape(py::list l) : raw_shape_(*GlobalContext::Instance()->int_allocator()) { | |||
| TensorShape::TensorShape(py::list l) | |||
| : raw_shape_(*GlobalContext::Instance()->int_allocator()), strides_(*GlobalContext::Instance()->int_allocator()) { | |||
| std::vector<dsize_t> list_c; | |||
| for (auto &i : l) { | |||
| if (!i.is_none()) { | |||
| @@ -97,6 +91,18 @@ TensorShape::TensorShape(py::list l) : raw_shape_(*GlobalContext::Instance()->in | |||
| AddListToShape(list_c); | |||
| } | |||
| TensorShape::TensorShape(cv::MatSize cv_size, uint32_t type) | |||
| : raw_shape_(*GlobalContext::Instance()->int_allocator()), strides_(*GlobalContext::Instance()->int_allocator()) { | |||
| for (int i = 0; i < cv_size.dims(); i++) { | |||
| raw_shape_.push_back(cv_size[i]); | |||
| } | |||
| auto channels = static_cast<uint8_t>(1 + (type >> static_cast<uint8_t>(CV_CN_SHIFT))); | |||
| if (channels != 1) { | |||
| raw_shape_.push_back(channels); | |||
| } | |||
| known_ = true; | |||
| } | |||
| TensorShape TensorShape::CreateUnknownRankShape() { | |||
| TensorShape s({}); | |||
| s.known_ = false; | |||
| @@ -109,17 +115,6 @@ TensorShape TensorShape::InsertDim(dsize_t axis, dsize_t dim) const { | |||
| return TensorShape(tmp); | |||
| } | |||
| TensorShape::TensorShape(cv::MatSize cv_size, uint32_t type) : raw_shape_(*GlobalContext::Instance()->int_allocator()) { | |||
| for (int i = 0; i < cv_size.dims(); i++) { | |||
| raw_shape_.push_back(cv_size[i]); | |||
| } | |||
| auto channels = static_cast<uint8_t>(1 + (type >> static_cast<uint8_t>(CV_CN_SHIFT))); | |||
| if (channels != 1) { | |||
| raw_shape_.push_back(channels); | |||
| } | |||
| known_ = true; | |||
| } | |||
| std::vector<dsize_t> TensorShape::AsVector() const { | |||
| return std::vector<dsize_t>(raw_shape_.begin(), raw_shape_.end()); | |||
| } | |||
| @@ -139,23 +134,28 @@ bool TensorShape::IsValidIndex(const std::vector<dsize_t> &index) const { | |||
| template <typename T> | |||
| void TensorShape::AddListToShape(const T &list) { | |||
| raw_shape_.resize(list.size()); | |||
| strides_.resize(list.size() + 1); | |||
| strides_[list.size()] = 1; | |||
| known_ = true; | |||
| dsize_t num = 1; | |||
| dsize_t size = 0; | |||
| for (const auto &itr : list) { | |||
| if (itr > 0) { | |||
| if (num > std::numeric_limits<int64_t>::max() / itr) { | |||
| auto itr = std::rbegin(list); // iterate over the list in reverse order | |||
| auto s = list.size() - 1; // to compute strides while adding dims | |||
| for (; itr != std::rend(list); itr++, s--) { | |||
| dsize_t dim = *itr; | |||
| if (dim > 0) { | |||
| if (strides_[s + 1] > std::numeric_limits<int64_t>::max() / dim) { | |||
| MS_LOG(ERROR) << "Invalid shape data, overflow occurred!"; | |||
| known_ = false; | |||
| raw_shape_.clear(); | |||
| return; | |||
| } | |||
| num *= itr; | |||
| strides_[s] = dim * strides_[s + 1]; | |||
| } | |||
| if (itr < 0) { | |||
| if (dim < 0) { | |||
| known_ = false; | |||
| } | |||
| if (itr > kDeMaxDim) { | |||
| if (dim > kDeMaxDim) { | |||
| std::stringstream ss; | |||
| ss << "Invalid shape data, dim (" << size << ") is larger than the maximum dim size(" << kDeMaxDim << ")!"; | |||
| MS_LOG(ERROR) << ss.str().c_str(); | |||
| @@ -163,7 +163,7 @@ void TensorShape::AddListToShape(const T &list) { | |||
| raw_shape_.clear(); | |||
| return; | |||
| } | |||
| raw_shape_.push_back(itr); | |||
| raw_shape_[s] = dim; | |||
| size++; | |||
| } | |||
| if (size > kDeMaxRank) { | |||
| @@ -215,17 +215,18 @@ TensorShape TensorShape::Squeeze() const { | |||
| } | |||
| return TensorShape(new_shape); | |||
| } | |||
| std::vector<dsize_t> TensorShape::Strides() { | |||
| std::vector<dsize_t> strides(Rank()); | |||
| dsize_t count = NumOfElements(); | |||
| for (dsize_t i = 0; i < Rank(); i++) { | |||
| if (raw_shape_[i] != 0) | |||
| count /= raw_shape_[i]; | |||
| else | |||
| count = 0; | |||
| strides[i] = count; | |||
| std::vector<dsize_t> TensorShape::Strides() const { return std::vector<dsize_t>{strides_.begin() + 1, strides_.end()}; } | |||
| // Name: ToFlatIndex() | |||
| // Description: convert a vector style index to number, used to access memory internal use only | |||
| Status TensorShape::ToFlatIndex(const std::vector<dsize_t> &index, dsize_t *flat_index) const { | |||
| *flat_index = 0; | |||
| for (size_t k = 0; k < index.size(); k++) { | |||
| *flat_index += index[k] * strides_[k + 1]; // skip the first element of strides_ which is numOfElements | |||
| } | |||
| return strides; | |||
| CHECK_FAIL_RETURN_UNEXPECTED(*flat_index < NumOfElements(), "Not a valid index"); | |||
| return Status::OK(); | |||
| } | |||
| } // namespace dataset | |||
| } // namespace mindspore | |||
| @@ -156,13 +156,20 @@ class TensorShape { | |||
| TensorShape Squeeze() const; | |||
| std::vector<dsize_t> Strides(); | |||
| std::vector<dsize_t> Strides() const; | |||
| // Returns the location of the item assuming row major memory layout. | |||
| // @param index | |||
| // @return | |||
| Status ToFlatIndex(const std::vector<dsize_t> &index, dsize_t *flat_index) const; | |||
| private: | |||
| // True if known and valid shape, false otherwise | |||
| bool known_; | |||
| // Vector to keep the dims of the shape. | |||
| std::vector<dsize_t, IntAlloc> raw_shape_; | |||
| // Vector to keep the strides of the shape. The size is rank+1 | |||
| std::vector<dsize_t, IntAlloc> strides_; | |||
| // Internal utility function to iterate over a list, check if the dim is valid and then insert it into the shape. | |||
| // @tparam T list | |||
| @@ -1,5 +1,3 @@ | |||
| ms_protobuf_generate(EXAMPLE_SRCS EXAMPLE_HDRS example.proto) | |||
| ms_protobuf_generate(FEATURE_SRCS FEATURE_HDRS feature.proto) | |||
| add_subdirectory(sampler) | |||
| file(GLOB_RECURSE _CURRENT_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc") | |||
| set_property(SOURCE ${_CURRENT_SRC_FILES} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_MD) | |||
| @@ -15,13 +13,9 @@ add_library(engine-datasetops-source OBJECT | |||
| image_folder_op.cc | |||
| mnist_op.cc | |||
| voc_op.cc | |||
| ${EXAMPLE_SRCS} | |||
| ${FEATURE_SRCS} | |||
| manifest_op.cc | |||
| cifar_op.cc | |||
| random_data_op.cc | |||
| celeba_op.cc | |||
| text_file_op.cc | |||
| ) | |||
| add_dependencies(engine-datasetops-source mindspore::protobuf) | |||
| ) | |||
| @@ -127,8 +127,10 @@ Status MindRecordOp::Init() { | |||
| std::string type_str = mindrecord::ColumnDataTypeNameNormalized[col_data_types[i]]; | |||
| DataType t_dtype = DataType(type_str); // valid types: {"bytes", "string", "int32", "int64", "float32", "float64"} | |||
| if (col_data_types[i] == mindrecord::ColumnBytes || col_data_types[i] == mindrecord::ColumnString) { // rank = 1 | |||
| if (col_data_types[i] == mindrecord::ColumnBytes) { // rank = 1 | |||
| col_desc = ColDescriptor(colname, t_dtype, TensorImpl::kFlexible, 1); | |||
| } else if (col_data_types[i] == mindrecord::ColumnString) { // rank = 0 | |||
| col_desc = ColDescriptor(colname, t_dtype, TensorImpl::kFlexible, 0); | |||
| } else if (col_shapes[i].size() > 0) { | |||
| std::vector<dsize_t> vec(col_shapes[i].size()); // temporary vector to hold shape | |||
| (void)std::copy(col_shapes[i].begin(), col_shapes[i].end(), vec.begin()); | |||
| @@ -309,7 +311,10 @@ Status MindRecordOp::LoadTensorRow(TensorRow *tensor_row, const std::vector<uint | |||
| // Set shape | |||
| auto num_elements = n_bytes / column_data_type_size; | |||
| if (column.hasShape()) { | |||
| if (type == DataType::DE_STRING) { | |||
| std::string s{data, data + n_bytes}; | |||
| RETURN_IF_NOT_OK(Tensor::CreateTensor(&tensor, {s}, TensorShape::CreateScalar())); | |||
| } else if (column.hasShape()) { | |||
| auto new_shape = TensorShape(column.shape()); | |||
| RETURN_IF_NOT_OK(column.MaterializeTensorShape(static_cast<int32_t>(num_elements), &new_shape)); | |||
| RETURN_IF_NOT_OK(Tensor::CreateTensor(&tensor, column.tensorImpl(), new_shape, type, data)); | |||
| @@ -63,7 +63,8 @@ Status Sampler::CreateSamplerTensor(std::shared_ptr<Tensor> *sample_ids, int64_t | |||
| } | |||
| TensorShape shape(std::vector<dsize_t>(1, num_elements)); | |||
| RETURN_IF_NOT_OK(Tensor::CreateTensor(sample_ids, col_desc_->tensorImpl(), shape, col_desc_->type())); | |||
| (void)(*sample_ids)->GetMutableBuffer(); // allocate memory in case user forgets! | |||
| RETURN_IF_NOT_OK( | |||
| (*sample_ids)->AllocateBuffer((*sample_ids)->SizeInBytes())); // allocate memory in case user forgets! | |||
| return Status::OK(); | |||
| } | |||
| @@ -724,18 +724,26 @@ Status TFReaderOp::LoadBytesList(const ColDescriptor ¤t_col, const dataeng | |||
| // kBytesList can map to the following DE types ONLY! | |||
| // DE_UINT8, DE_INT8 | |||
| // Must be single byte type for each element! | |||
| if (current_col.type() != DataType::DE_UINT8 && current_col.type() != DataType::DE_INT8) { | |||
| if (current_col.type() != DataType::DE_UINT8 && current_col.type() != DataType::DE_INT8 && | |||
| current_col.type() != DataType::DE_STRING) { | |||
| std::string err_msg = "Invalid datatype for Tensor at column: " + current_col.name(); | |||
| RETURN_STATUS_UNEXPECTED(err_msg); | |||
| } | |||
| const dataengine::BytesList &bytes_list = column_values_list.bytes_list(); | |||
| *num_elements = bytes_list.value_size(); | |||
| if (current_col.type() == DataType::DE_STRING) { | |||
| TensorShape shape = TensorShape::CreateScalar(); | |||
| RETURN_IF_NOT_OK(current_col.MaterializeTensorShape(*num_elements, &shape)); | |||
| RETURN_IF_NOT_OK(Tensor::CreateTensor(tensor, bytes_list, shape)); | |||
| return Status::OK(); | |||
| } | |||
| uint64_t max_size = 0; | |||
| for (uint32_t i = 0; i < bytes_list.value_size(); ++i) max_size = std::max(max_size, bytes_list.value(i).size()); | |||
| *num_elements = bytes_list.value_size(); | |||
| int64_t pad_size = max_size; | |||
| // if user provides a shape in the form of [-1, d1, 2d, ... , dn], we need to pad to d1 * d2 * ... * dn | |||
| @@ -879,7 +887,7 @@ Status TFReaderOp::LoadIntList(const ColDescriptor ¤t_col, const dataengin | |||
| RETURN_IF_NOT_OK(Tensor::CreateTensor(tensor, current_col.tensorImpl(), current_shape, current_col.type())); | |||
| // Tensors are lazily allocated, this eagerly allocates memory for the tensor. | |||
| (void)(*tensor)->GetMutableBuffer(); | |||
| RETURN_IF_NOT_OK((*tensor)->AllocateBuffer((*tensor)->SizeInBytes())); | |||
| int64_t i = 0; | |||
| auto it = (*tensor)->begin<T>(); | |||
| @@ -162,7 +162,7 @@ void CastFrom(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *out | |||
| Status TypeCast(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, const DataType &data_type) { | |||
| RETURN_IF_NOT_OK(Tensor::CreateTensor(output, TensorImpl::kFlexible, input->shape(), data_type)); | |||
| static_cast<void>((*output)->GetMutableBuffer()); | |||
| RETURN_IF_NOT_OK((*output)->AllocateBuffer((*output)->SizeInBytes())); | |||
| switch (input->type().value()) { | |||
| case DataType::DE_BOOL: | |||
| CastFrom<bool>(input, output); | |||
| @@ -211,7 +211,7 @@ Status ToFloat16(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> * | |||
| // initiate new tensor for type cast | |||
| DataType new_type = DataType("float16"); | |||
| RETURN_IF_NOT_OK(Tensor::CreateTensor(output, TensorImpl::kFlexible, input->shape(), new_type)); | |||
| static_cast<void>((*output)->GetMutableBuffer()); | |||
| RETURN_IF_NOT_OK((*output)->AllocateBuffer((*output)->SizeInBytes())); | |||
| auto in_itr = input->begin<float>(); | |||
| auto out_itr = (*output)->begin<float16>(); | |||
| @@ -64,7 +64,8 @@ Status Flip(std::shared_ptr<Tensor> input, std::shared_ptr<Tensor> *output, int | |||
| std::shared_ptr<CVTensor> output_cv = std::make_shared<CVTensor>(input_cv->shape(), input_cv->type()); | |||
| RETURN_UNEXPECTED_IF_NULL(output_cv); | |||
| (void)output_cv->GetMutableBuffer(); | |||
| RETURN_IF_NOT_OK(output_cv->AllocateBuffer(output_cv->SizeInBytes())); | |||
| if (input_cv->mat().data) { | |||
| try { | |||
| cv::flip(input_cv->mat(), output_cv->mat(), flip_code); | |||
| @@ -51,7 +51,7 @@ enum ColumnDataType { | |||
| // mapping as {"bytes", "string", "int32", "int64", "float32", "float64"}; | |||
| const uint32_t ColumnDataTypeSize[kDataTypes] = {1, 1, 4, 8, 4, 8}; | |||
// Normalized DE type name per ColumnDataType (parallel to ColumnDataTypeSize).
// Index 1 (ColumnString) maps to "string" so string columns materialize as
// string tensors rather than raw uint8 byte buffers.
// Fix: both the old ("uint8") and new ("string") initializer rows were left
// in place by the merge, creating conflicting duplicate lines; keep the
// string-aware version.
const std::vector<std::string> ColumnDataTypeNameNormalized = {"uint8", "string", "int32",
                                                               "int64", "float32", "float64"};
| const std::unordered_map<std::string, ColumnDataType> ColumnDataTypeMap = { | |||
| @@ -48,6 +48,7 @@ def mstype_to_detype(type_): | |||
| mstype.float16: cde.DataType("float16"), | |||
| mstype.float32: cde.DataType("float32"), | |||
| mstype.float64: cde.DataType("float64"), | |||
| mstype.string: cde.DataType("string"), | |||
| }[type_] | |||
| @@ -26,7 +26,7 @@ from . import datasets | |||
| INT32_MAX = 2147483647 | |||
# All DE type names the schema validator accepts; "string" is included to
# support string tensors.
# Fix: merge residue left both the old and new final rows in the literal,
# duplicating entries; keep the single row that includes "string".
valid_detype = [
    "bool", "int8", "int16", "int32", "int64", "uint8", "uint16",
    "uint32", "uint64", "float16", "float32", "float64", "string"
]
| @@ -32,47 +32,47 @@ class MindDataTestDatatype : public UT::Common { | |||
| TEST_F(MindDataTestDatatype, TestSizes) { | |||
| uint8_t x = DataType::SIZE_IN_BYTES[DataType::DE_BOOL]; | |||
| uint8_t x = DataType::kTypeInfo[DataType::DE_BOOL].sizeInBytes_; | |||
| DataType d = DataType(DataType::DE_BOOL); | |||
| ASSERT_EQ(x, 1); | |||
| ASSERT_EQ(d.SizeInBytes(), x); | |||
| x = DataType::SIZE_IN_BYTES[DataType::DE_INT8]; | |||
| x = DataType::kTypeInfo[DataType::DE_INT8].sizeInBytes_; | |||
| d = DataType(DataType::DE_INT8); | |||
| ASSERT_EQ(x, 1); | |||
| ASSERT_EQ(d.SizeInBytes(), x); | |||
| x = DataType::SIZE_IN_BYTES[DataType::DE_UINT8]; | |||
| x = DataType::kTypeInfo[DataType::DE_UINT8].sizeInBytes_; | |||
| d = DataType(DataType::DE_UINT8); | |||
| ASSERT_EQ(x, 1); | |||
| ASSERT_EQ(d.SizeInBytes(), x); | |||
| x = DataType::SIZE_IN_BYTES[DataType::DE_INT16]; | |||
| x = DataType::kTypeInfo[DataType::DE_INT16].sizeInBytes_; | |||
| d = DataType(DataType::DE_INT16); | |||
| ASSERT_EQ(x, 2); | |||
| ASSERT_EQ(d.SizeInBytes(), x); | |||
| x = DataType::SIZE_IN_BYTES[DataType::DE_UINT16]; | |||
| x = DataType::kTypeInfo[DataType::DE_UINT16].sizeInBytes_; | |||
| d = DataType(DataType::DE_UINT16); | |||
| ASSERT_EQ(x, 2); | |||
| ASSERT_EQ(d.SizeInBytes(), x); | |||
| x = DataType::SIZE_IN_BYTES[DataType::DE_INT32]; | |||
| x = DataType::kTypeInfo[DataType::DE_INT32].sizeInBytes_; | |||
| d = DataType(DataType::DE_INT32); | |||
| ASSERT_EQ(x, 4); | |||
| ASSERT_EQ(d.SizeInBytes(), x); | |||
| x = DataType::SIZE_IN_BYTES[DataType::DE_UINT32]; | |||
| x = DataType::kTypeInfo[DataType::DE_UINT32].sizeInBytes_; | |||
| d = DataType(DataType::DE_UINT32); | |||
| ASSERT_EQ(x, 4); | |||
| ASSERT_EQ(d.SizeInBytes(), x); | |||
| x = DataType::SIZE_IN_BYTES[DataType::DE_INT64]; | |||
| x = DataType::kTypeInfo[DataType::DE_INT64].sizeInBytes_; | |||
| d = DataType(DataType::DE_INT64); | |||
| ASSERT_EQ(x, 8); | |||
| ASSERT_EQ(d.SizeInBytes(), x); | |||
| x = DataType::SIZE_IN_BYTES[DataType::DE_UINT64]; | |||
| x = DataType::kTypeInfo[DataType::DE_UINT64].sizeInBytes_; | |||
| d = DataType(DataType::DE_UINT64); | |||
| ASSERT_EQ(x, 8); | |||
| ASSERT_EQ(d.SizeInBytes(), x); | |||
| x = DataType::SIZE_IN_BYTES[DataType::DE_FLOAT32]; | |||
| x = DataType::kTypeInfo[DataType::DE_FLOAT32].sizeInBytes_; | |||
| d = DataType(DataType::DE_FLOAT32); | |||
| ASSERT_EQ(x, 4); | |||
| ASSERT_EQ(d.SizeInBytes(), x); | |||
| x = DataType::SIZE_IN_BYTES[DataType::DE_FLOAT64]; | |||
| x = DataType::kTypeInfo[DataType::DE_FLOAT64].sizeInBytes_; | |||
| d = DataType(DataType::DE_FLOAT64); | |||
| ASSERT_EQ(x, 8); | |||
| ASSERT_EQ(d.SizeInBytes(), x); | |||
| @@ -14,9 +14,7 @@ | |||
| * limitations under the License. | |||
| */ | |||
| #include "common/common.h" | |||
| #include "common/cvop_common.h" | |||
| #include "dataset/kernels/data/one_hot_op.h" | |||
| #include "dataset/core/cv_tensor.h" | |||
| #include "utils/log_adapter.h" | |||
| using namespace mindspore::dataset; | |||
| @@ -24,9 +22,9 @@ using mindspore::MsLogLevel::INFO; | |||
| using mindspore::ExceptionType::NoExceptionType; | |||
| using mindspore::LogStream; | |||
| class MindDataTestOneHotOp : public UT::CVOP::CVOpCommon { | |||
| class MindDataTestOneHotOp : public UT::Common { | |||
| protected: | |||
| MindDataTestOneHotOp() : CVOpCommon() {} | |||
| MindDataTestOneHotOp() {} | |||
| }; | |||
| TEST_F(MindDataTestOneHotOp, TestOp) { | |||
| @@ -65,14 +65,14 @@ TEST_F(MindDataTestStringTensorDE, Basics) { | |||
| TEST_F(MindDataTestStringTensorDE, Basics2) { | |||
| std::shared_ptr<Tensor> t = | |||
| std::make_shared<Tensor>(std::vector<std::string>{"abc", "defg", "hi", "klmno", "123", "789"}, TensorShape({2, 3})); | |||
| ASSERT_TRUE(t->SizeInBytes() == 6 * 5 + 20); | |||
| std::vector<uint32_t> offsets = {3, 8, 11, 17, 21, 25}; | |||
| ASSERT_TRUE(t->SizeInBytes() == 6 * 5 + 20 + 4); | |||
| std::vector<uint32_t> offsets = {0, 4, 9, 12, 18, 22, 26}; | |||
| uint32_t ctr = 0; | |||
| for (auto i : offsets) { | |||
| ASSERT_TRUE(*(reinterpret_cast<uint32_t *>(t->GetMutableBuffer() + ctr)) == i); | |||
| ASSERT_TRUE(*(reinterpret_cast<uint32_t *>(t->GetMutableBuffer() + ctr)) == i + 28); | |||
| ctr += 4; | |||
| } | |||
| const char *buf = reinterpret_cast<char *>(t->GetMutableBuffer()) + 6 * 4; | |||
| const char *buf = reinterpret_cast<char *>(t->GetMutableBuffer()) + 6 * 4 + 4; | |||
| std::vector<uint32_t> starts = {0, 4, 9, 12, 18, 22}; | |||
| uint32_t index = 0; | |||
| @@ -90,14 +90,14 @@ TEST_F(MindDataTestStringTensorDE, Empty) { | |||
| std::shared_ptr<Tensor> t = std::make_shared<Tensor>(strings, TensorShape({2, 3})); | |||
| // abc_defg___123__ | |||
| // 0123456789012345 | |||
| ASSERT_TRUE(t->SizeInBytes() == 6 * 5 + 10); | |||
| std::vector<uint32_t> offsets = {3, 8, 9, 10, 14, 15}; | |||
| ASSERT_TRUE(t->SizeInBytes() == 6 * 5 + 10 + 4); | |||
| std::vector<uint32_t> offsets = {0, 4, 9, 10, 11, 15, 16}; | |||
| uint32_t ctr = 0; | |||
| for (auto i : offsets) { | |||
| ASSERT_TRUE(*(reinterpret_cast<uint32_t *>(t->GetMutableBuffer() + ctr)) == i); | |||
| ASSERT_TRUE(*(reinterpret_cast<uint32_t *>(t->GetMutableBuffer() + ctr)) == i + 28); | |||
| ctr += 4; | |||
| } | |||
| const char *buf = reinterpret_cast<char *>(t->GetMutableBuffer()) + 6 * 4; | |||
| const char *buf = reinterpret_cast<char *>(t->GetMutableBuffer()) + 6 * 4 + 4; | |||
| std::vector<uint32_t> starts = {0, 4, 9, 10, 11, 15}; | |||
| uint32_t index = 0; | |||
| @@ -41,6 +41,7 @@ class MindDataTestTensorDE : public UT::Common { | |||
| TEST_F(MindDataTestTensorDE, Basics) { | |||
| std::shared_ptr<Tensor> t = std::make_shared<Tensor>(TensorShape({2, 3}), DataType(DataType::DE_UINT64)); | |||
| ASSERT_TRUE((t->AllocateBuffer(t->SizeInBytes())).IsOk()); | |||
| ASSERT_EQ(t->shape(), TensorShape({2, 3})); | |||
| ASSERT_EQ(t->type(), DataType::DE_UINT64); | |||
| ASSERT_EQ(t->SizeInBytes(), 2 * 3 * 8); | |||
| @@ -0,0 +1,18 @@ | |||
| { | |||
| "datasetType": "TF", | |||
| "numRows": 3, | |||
| "columns": { | |||
| "line": { | |||
| "type": "string", | |||
| "rank": 0 | |||
| }, | |||
| "words": { | |||
| "type": "string", | |||
| "rank": 1 | |||
| }, | |||
| "chinese": { | |||
| "type": "string", | |||
| "rank": 0 | |||
| } | |||
| } | |||
| } | |||
| @@ -584,7 +584,7 @@ def test_cv_minddataset_reader_basic_tutorial_5_epoch(add_and_remove_cv_file): | |||
| def test_cv_minddataset_reader_basic_tutorial_5_epoch_with_batch(add_and_remove_cv_file): | |||
| """tutorial for cv minderdataset.""" | |||
| columns_list = ["data", "file_name", "label"] | |||
| columns_list = ["data", "label"] | |||
| num_readers = 4 | |||
| data_set = ds.MindDataset(CV_FILE_NAME + "0", columns_list, num_readers) | |||
| @@ -948,8 +948,7 @@ def test_write_with_multi_bytes_and_array_and_read_by_MindDataset(): | |||
| data_value_to_list = [] | |||
| for item in data: | |||
| new_data = {} | |||
| new_data['file_name'] = np.asarray( | |||
| list(bytes(item["file_name"], encoding='utf-8')), dtype=np.uint8) | |||
| new_data['file_name'] = np.asarray(item["file_name"], dtype='S') | |||
| new_data['label'] = np.asarray(list([item["label"]]), dtype=np.int32) | |||
| new_data['image1'] = np.asarray(list(item["image1"]), dtype=np.uint8) | |||
| new_data['image2'] = np.asarray(list(item["image2"]), dtype=np.uint8) | |||
| @@ -1153,8 +1152,7 @@ def test_write_with_multi_bytes_and_MindDataset(): | |||
| data_value_to_list = [] | |||
| for item in data: | |||
| new_data = {} | |||
| new_data['file_name'] = np.asarray( | |||
| list(bytes(item["file_name"], encoding='utf-8')), dtype=np.uint8) | |||
| new_data['file_name'] = np.asarray(item["file_name"], dtype='S') | |||
| new_data['label'] = np.asarray(list([item["label"]]), dtype=np.int32) | |||
| new_data['image1'] = np.asarray(list(item["image1"]), dtype=np.uint8) | |||
| new_data['image2'] = np.asarray(list(item["image2"]), dtype=np.uint8) | |||
| @@ -27,6 +27,7 @@ import mindspore.dataset as ds | |||
| import mindspore.dataset.transforms.vision.c_transforms as vision | |||
| from mindspore import log as logger | |||
| from mindspore.dataset.transforms.vision import Inter | |||
| from mindspore.dataset.transforms.text import as_text | |||
| from mindspore.mindrecord import FileWriter | |||
| FILES_NUM = 4 | |||
| @@ -72,7 +73,7 @@ def test_cv_minddataset_pk_sample_no_column(add_and_remove_cv_file): | |||
| for item in data_set.create_dict_iterator(): | |||
| logger.info("-------------- cv reader basic: {} ------------------------".format(num_iter)) | |||
| logger.info("-------------- item[file_name]: \ | |||
| {}------------------------".format("".join([chr(x) for x in item["file_name"]]))) | |||
| {}------------------------".format(as_text(item["file_name"]))) | |||
| logger.info("-------------- item[label]: {} ----------------------------".format(item["label"])) | |||
| num_iter += 1 | |||
| @@ -92,7 +93,7 @@ def test_cv_minddataset_pk_sample_basic(add_and_remove_cv_file): | |||
| logger.info("-------------- item[data]: \ | |||
| {}------------------------".format(item["data"][:10])) | |||
| logger.info("-------------- item[file_name]: \ | |||
| {}------------------------".format("".join([chr(x) for x in item["file_name"]]))) | |||
| {}------------------------".format(as_text(item["file_name"]))) | |||
| logger.info("-------------- item[label]: {} ----------------------------".format(item["label"])) | |||
| num_iter += 1 | |||
| @@ -110,7 +111,7 @@ def test_cv_minddataset_pk_sample_shuffle(add_and_remove_cv_file): | |||
| for item in data_set.create_dict_iterator(): | |||
| logger.info("-------------- cv reader basic: {} ------------------------".format(num_iter)) | |||
| logger.info("-------------- item[file_name]: \ | |||
| {}------------------------".format("".join([chr(x) for x in item["file_name"]]))) | |||
| {}------------------------".format(as_text(item["file_name"]))) | |||
| logger.info("-------------- item[label]: {} ----------------------------".format(item["label"])) | |||
| num_iter += 1 | |||
| @@ -127,7 +128,7 @@ def test_cv_minddataset_pk_sample_out_of_range(add_and_remove_cv_file): | |||
| for item in data_set.create_dict_iterator(): | |||
| logger.info("-------------- cv reader basic: {} ------------------------".format(num_iter)) | |||
| logger.info("-------------- item[file_name]: \ | |||
| {}------------------------".format("".join([chr(x) for x in item["file_name"]]))) | |||
| {}------------------------".format(as_text(item["file_name"]))) | |||
| logger.info("-------------- item[label]: {} ----------------------------".format(item["label"])) | |||
| num_iter += 1 | |||
| @@ -17,17 +17,15 @@ import numpy as np | |||
| import pytest | |||
| import mindspore.dataset as ds | |||
| import mindspore.common.dtype as mstype | |||
| # pylint: disable=comparison-with-itself | |||
def test_basic():
    """Round-trip a numpy bytes ('S') array through a DE Tensor.

    Builds a 2x3 array of byte strings, wraps it in a cde.Tensor, converts it
    back with as_array(), and checks the result is element-wise identical.
    """
    # Fix: removed leftover debug residue — commented-out experiments and a
    # tautological `assert all(y == y)` that could never fail.
    x = np.array([["ab", "cde", "121"], ["x", "km", "789"]], dtype='S')
    n = cde.Tensor(x)
    arr = n.as_array()
    np.testing.assert_array_equal(x, arr)
| def compare(strings): | |||
| @@ -60,7 +58,125 @@ def test_batching_strings(): | |||
| assert "[Batch ERROR] Batch does not support" in str(info) | |||
def test_map():
    """Map a python callable over a string column of a GeneratorDataset.

    The generator yields a single row whose one column is a 1-element bytes
    array; the mapped `split` op decodes it and splits on whitespace, and the
    output is compared against the expected bytes array.
    """
    def gen():
        # One row, one column: a length-1 'S' (bytes) array.
        yield np.array(["ab cde 121"], dtype='S'),
    data = ds.GeneratorDataset(gen, column_names=["col"])
    def split(b):
        # b arrives as a numpy bytes array; .item() extracts the scalar bytes.
        splits = b.item().decode("utf8").split()
        return np.array(splits, dtype='S')
    data = data.map(input_columns=["col"], operations=split)
    expected = np.array(["ab", "cde", "121"], dtype='S')
    for d in data:
        np.testing.assert_array_equal(d[0], expected)
def as_str(arr):
    """Element-wise decode a numpy array of UTF-8 byte strings into str objects."""
    return np.vectorize(lambda raw: raw.decode("utf8"))(arr)
# Ground-truth contents (row by row) of the text TFRecord / MindRecord test
# data, used by the test_tfrecord* and test_mindrecord cases below.
line = np.array(["This is a text file.",
                 "Be happy every day.",
                 "Good luck to everyone."])
words = np.array([["This", "text", "file", "a"],
                  ["Be", "happy", "day", "b"],
                  ["女", "", "everyone", "c"]])  # includes non-ASCII and empty strings
chinese = np.array(["今天天气太好了我们一起去外面玩吧",
                    "男默女泪",
                    "江州市长江大桥参加了长江大桥的通车仪式"])
def test_tfrecord1():
    """Read string columns from a TFRecord using a Schema built with string type names."""
    s = ds.Schema()
    s.add_column("line", "string", [])      # scalar string per row
    s.add_column("words", "string", [-1])   # variable-length 1-D string array
    s.add_column("chinese", "string", [])   # scalar string per row
    data = ds.TFRecordDataset("../data/dataset/testTextTFRecord/text.tfrecord", shuffle=False, schema=s)
    for i, d in enumerate(data.create_dict_iterator()):
        # Shapes must match the ground-truth fixtures, and decoded contents
        # must be element-wise equal.
        assert d["line"].shape == line[i].shape
        assert d["words"].shape == words[i].shape
        assert d["chinese"].shape == chinese[i].shape
        np.testing.assert_array_equal(line[i], as_str(d["line"]))
        np.testing.assert_array_equal(words[i], as_str(d["words"]))
        np.testing.assert_array_equal(chinese[i], as_str(d["chinese"]))
def test_tfrecord2():
    """Read string columns from a TFRecord using a JSON schema file instead of a built Schema."""
    data = ds.TFRecordDataset("../data/dataset/testTextTFRecord/text.tfrecord", shuffle=False,
                              schema='../data/dataset/testTextTFRecord/datasetSchema.json')
    for i, d in enumerate(data.create_dict_iterator()):
        # Same ground-truth comparison as test_tfrecord1, schema loaded from file.
        assert d["line"].shape == line[i].shape
        assert d["words"].shape == words[i].shape
        assert d["chinese"].shape == chinese[i].shape
        np.testing.assert_array_equal(line[i], as_str(d["line"]))
        np.testing.assert_array_equal(words[i], as_str(d["words"]))
        np.testing.assert_array_equal(chinese[i], as_str(d["chinese"]))
def test_tfrecord3():
    """Read string columns using mstype-based Schema, reshaping 'words' via the schema.

    The [-1, 2] shape materializes the 4-element word list as a 2x2 string
    tensor, so the ground truth is reshaped the same way before comparing.
    """
    s = ds.Schema()
    s.add_column("line", mstype.string, [])
    s.add_column("words", mstype.string, [-1, 2])  # unknown leading dim, 2 per row -> 2x2
    s.add_column("chinese", mstype.string, [])
    data = ds.TFRecordDataset("../data/dataset/testTextTFRecord/text.tfrecord", shuffle=False, schema=s)
    for i, d in enumerate(data.create_dict_iterator()):
        assert d["line"].shape == line[i].shape
        assert d["words"].shape == words[i].reshape([2, 2]).shape
        assert d["chinese"].shape == chinese[i].shape
        np.testing.assert_array_equal(line[i], as_str(d["line"]))
        np.testing.assert_array_equal(words[i].reshape([2, 2]), as_str(d["words"]))
        np.testing.assert_array_equal(chinese[i], as_str(d["chinese"]))
def create_text_mindrecord():
    """Fixture generator (not run by CI).

    Method to create a mindrecord file with string data; used once to generate
    testTextMindRecord/test.mindrecord and kept so the fixture can be rebuilt.
    """
    from mindspore.mindrecord import FileWriter
    mindrecord_file_name = "test.mindrecord"
    # Rows mirror the `line` / `chinese` ground-truth arrays above.
    data = [{"english": "This is a text file.",
             "chinese": "今天天气太好了我们一起去外面玩吧"},
            {"english": "Be happy every day.",
             "chinese": "男默女泪"},
            {"english": "Good luck to everyone.",
             "chinese": "江州市长江大桥参加了长江大桥的通车仪式"},
            ]
    writer = FileWriter(mindrecord_file_name)
    schema = {"english": {"type": "string"},
              "chinese": {"type": "string"},
              }
    writer.add_schema(schema)
    writer.write_raw_data(data)
    writer.commit()
def test_mindrecord():
    """Read string columns back from the pre-generated text mindrecord fixture."""
    data = ds.MindDataset("../data/dataset/testTextMindRecord/test.mindrecord", shuffle=False)
    for i, d in enumerate(data.create_dict_iterator()):
        # "english" rows mirror `line`; "chinese" rows mirror `chinese`.
        assert d["english"].shape == line[i].shape
        assert d["chinese"].shape == chinese[i].shape
        np.testing.assert_array_equal(line[i], as_str(d["english"]))
        np.testing.assert_array_equal(chinese[i], as_str(d["chinese"]))
if __name__ == '__main__':
    # Fix: merge residue duplicated several calls as commented-out dead copies
    # of lines that were already live; run every test defined in this module
    # exactly once.
    test_generator()
    test_basic()
    test_batching_strings()
    test_map()
    test_tfrecord1()
    test_tfrecord2()
    test_tfrecord3()
    test_mindrecord()