From e8ca243364c0d8d7d08b04c6cb099c6d54184bd0 Mon Sep 17 00:00:00 2001 From: hesham Date: Fri, 8 May 2020 15:36:06 -0400 Subject: [PATCH] - Add DE_STRING - Replace switch/case by indexing - Add test case - Add constructors - Add getItem string - Fix bugs - Add more tests - Tensor iterator - asNumpy - TextFileDataset - Tensor(Numpy) - Super > 2D - Add more test cases for GeneratorDataset - Change StartAddr to GetBuffer and GetMutableBuffer - Raise an error if batch is used with strings Clean-up work --- .../ccsrc/dataset/api/python_bindings.cc | 5 + mindspore/ccsrc/dataset/core/cv_tensor.cc | 12 +- mindspore/ccsrc/dataset/core/data_type.cc | 221 +++--------------- mindspore/ccsrc/dataset/core/data_type.h | 78 ++++++- mindspore/ccsrc/dataset/core/tensor.cc | 175 ++++++++++++-- mindspore/ccsrc/dataset/core/tensor.h | 218 +++++++++++++++-- mindspore/ccsrc/dataset/core/tensor_shape.cc | 12 + mindspore/ccsrc/dataset/core/tensor_shape.h | 2 + .../dataset/engine/datasetops/batch_op.cc | 8 +- .../engine/datasetops/device_queue_op.cc | 9 +- .../engine/datasetops/source/celeba_op.cc | 2 +- .../engine/datasetops/source/cifar_op.cc | 4 +- .../datasetops/source/image_folder_op.cc | 2 +- .../engine/datasetops/source/manifest_op.cc | 2 +- .../engine/datasetops/source/mnist_op.cc | 2 +- .../datasetops/source/random_data_op.cc | 2 +- .../engine/datasetops/source/random_data_op.h | 1 - .../source/sampler/distributed_sampler.cc | 2 +- .../datasetops/source/sampler/pk_sampler.cc | 2 +- .../source/sampler/random_sampler.cc | 2 +- .../datasetops/source/sampler/sampler.cc | 2 +- .../source/sampler/sequential_sampler.cc | 2 +- .../source/sampler/subset_random_sampler.cc | 2 +- .../source/sampler/weighted_random_sampler.cc | 2 +- .../engine/datasetops/source/text_file_op.cc | 5 +- .../engine/datasetops/source/tf_reader_op.cc | 4 +- .../engine/datasetops/source/voc_op.cc | 2 +- .../ccsrc/dataset/engine/tdt/tdt_plugin.cc | 2 +- .../ccsrc/dataset/kernels/data/data_utils.cc | 4 +- .../dataset/kernels/image/image_utils.cc | 10 +- .../image/random_crop_decode_resize_op.cc | 4 +- tests/ut/cpp/dataset/CMakeLists.txt | 1 + tests/ut/cpp/dataset/common/cvop_common.cc | 2 +- tests/ut/cpp/dataset/datatype_test.cc | 22 +- tests/ut/cpp/dataset/image_folder_op_test.cc | 2 +- tests/ut/cpp/dataset/map_op_test.cc | 8 +- .../random_crop_decode_resize_op_test.cc | 8 +- .../cpp/dataset/stand_alone_samplers_test.cc | 2 +- tests/ut/cpp/dataset/tensor_string_test.cc | 153 ++++++++++++ tests/ut/cpp/dataset/tensor_test.cc | 16 +- .../dataset/test_datasets_textfileop.py | 8 +- tests/ut/python/dataset/test_flat_map.py | 6 +- tests/ut/python/dataset/test_tensor_string.py | 65 ++++++ 43 files changed, 781 insertions(+), 312 deletions(-) create mode 100644 tests/ut/cpp/dataset/tensor_string_test.cc create mode 100644 tests/ut/python/dataset/test_tensor_string.py diff --git a/mindspore/ccsrc/dataset/api/python_bindings.cc b/mindspore/ccsrc/dataset/api/python_bindings.cc index 9b525191e0..ffedc8570e 100644 --- a/mindspore/ccsrc/dataset/api/python_bindings.cc +++ b/mindspore/ccsrc/dataset/api/python_bindings.cc @@ -237,6 +237,11 @@ void bindTensor(py::module *m) { .def("type", &Tensor::type) .def("as_array", [](py::object &t) { auto &tensor = py::cast(t); + if (tensor.type() == DataType::DE_STRING) { + py::array res; + tensor.GetDataAsNumpyStrings(&res); + return res; + } py::buffer_info info; THROW_IF_ERROR(Tensor::GetBufferInfo(tensor, &info)); return py::array(pybind11::dtype(info), info.shape, info.strides, info.ptr, t); diff --git 
a/mindspore/ccsrc/dataset/core/cv_tensor.cc b/mindspore/ccsrc/dataset/core/cv_tensor.cc index b09751128d..16921e8b2d 100644 --- a/mindspore/ccsrc/dataset/core/cv_tensor.cc +++ b/mindspore/ccsrc/dataset/core/cv_tensor.cc @@ -24,15 +24,15 @@ namespace mindspore { namespace dataset { CVTensor::CVTensor(const TensorShape &shape, const DataType &type) : Tensor(shape, type) { - (void)this->MatInit(StartAddr(), shape_, type_, &mat_); + (void)this->MatInit(GetMutableBuffer(), shape_, type_, &mat_); } CVTensor::CVTensor(const TensorShape &shape, const DataType &type, const uchar *data) : Tensor(shape, type, data) { - (void)this->MatInit(StartAddr(), shape_, type_, &mat_); + (void)this->MatInit(GetMutableBuffer(), shape_, type_, &mat_); } CVTensor::CVTensor(std::shared_ptr tensor) : Tensor(std::move(*tensor)) { - (void)this->MatInit(StartAddr(), shape_, type_, &mat_); + (void)this->MatInit(GetMutableBuffer(), shape_, type_, &mat_); } std::pair, int> CVTensor::IsValidImage(const TensorShape &shape, const DataType &type) { @@ -83,19 +83,19 @@ Status CVTensor::MatInit(uchar *data, const TensorShape &shape, const DataType & Status CVTensor::Reshape(const TensorShape &shape) { RETURN_IF_NOT_OK(Tensor::Reshape(shape)); - RETURN_IF_NOT_OK(this->MatInit(StartAddr(), shape_, type_, &mat_)); + RETURN_IF_NOT_OK(this->MatInit(GetMutableBuffer(), shape_, type_, &mat_)); return Status::OK(); } Status CVTensor::ExpandDim(const dsize_t &axis) { RETURN_IF_NOT_OK(Tensor::ExpandDim(axis)); - RETURN_IF_NOT_OK(this->MatInit(StartAddr(), shape_, type_, &mat_)); + RETURN_IF_NOT_OK(this->MatInit(GetMutableBuffer(), shape_, type_, &mat_)); return Status::OK(); } void CVTensor::Squeeze() { Tensor::Squeeze(); - (void)this->MatInit(StartAddr(), shape_, type_, &mat_); + (void)this->MatInit(GetMutableBuffer(), shape_, type_, &mat_); } } // namespace dataset } // namespace mindspore diff --git a/mindspore/ccsrc/dataset/core/data_type.cc b/mindspore/ccsrc/dataset/core/data_type.cc index 5451e2bbda..4420f78e2d 100644 --- a/mindspore/ccsrc/dataset/core/data_type.cc +++ b/mindspore/ccsrc/dataset/core/data_type.cc @@ -15,116 +15,40 @@ */ #include "dataset/core/data_type.h" -#include - #include "utils/log_adapter.h" -#include "dataset/core/constants.h" #include "dataset/core/pybind_support.h" #include "dataset/util/de_error.h" namespace mindspore { namespace dataset { + uint8_t DataType::SizeInBytes() const { - switch (type_) { - case DataType::DE_BOOL: - case DataType::DE_INT8: - case DataType::DE_UINT8: - return 1; - case DataType::DE_INT16: - case DataType::DE_UINT16: - case DataType::DE_FLOAT16: - return 2; - case DataType::DE_INT32: - case DataType::DE_UINT32: - case DataType::DE_FLOAT32: - return 4; - case DataType::DE_INT64: - case DataType::DE_UINT64: - case DataType::DE_FLOAT64: - return 8; - default: - return 0; - } + if (type_ < DataType::NUM_OF_TYPES) + return SIZE_IN_BYTES[type_]; + else + return 0; } py::dtype DataType::AsNumpyType() const { - std::string s; - switch (type_) { - case DataType::DE_BOOL: - s = "bool"; - break; - case DataType::DE_INT8: - s = "int8"; - break; - case DataType::DE_UINT8: - s = "uint8"; - break; - case DataType::DE_INT16: - s = "int16"; - break; - case DataType::DE_UINT16: - s = "uint16"; - break; - case DataType::DE_INT32: - s = "int32"; - break; - case DataType::DE_UINT32: - s = "uint32"; - break; - case DataType::DE_INT64: - s = "int64"; - break; - case DataType::DE_UINT64: - s = "uint64"; - break; - case DataType::DE_FLOAT16: - s = "float16"; - break; - case DataType::DE_FLOAT32: - s = 
"float32"; - break; - case DataType::DE_FLOAT64: - s = "double"; - break; - case DataType::DE_UNKNOWN: - s = "unknown"; - break; - default: - s = "unknown"; - break; - } - return py::dtype(s); + if (type_ < DataType::NUM_OF_TYPES) + return py::dtype(PYBIND_TYPES[type_]); + else + return py::dtype("unknown"); } uint8_t DataType::AsCVType() const { - switch (type_) { - case DataType::DE_BOOL: - return CV_8U; - case DataType::DE_INT8: - return CV_8S; - case DataType::DE_UINT8: - return CV_8U; - case DataType::DE_INT16: - return CV_16S; - case DataType::DE_UINT16: - return CV_16U; - case DataType::DE_INT32: - return CV_32S; - case DataType::DE_FLOAT16: - return CV_16F; - case DataType::DE_FLOAT32: - return CV_32F; - case DataType::DE_FLOAT64: - return CV_64F; - case DataType::DE_UINT32: - case DataType::DE_INT64: - case DataType::DE_UINT64: - default: - MS_LOG(ERROR) << "Cannot convert to OpenCV type. Return invalid type!"; - return kCVInvalidType; + uint8_t res = kCVInvalidType; + if (type_ < DataType::NUM_OF_TYPES) { + res = CV_TYPES[type_]; } -} + + if (res == kCVInvalidType) { + MS_LOG(ERROR) << "Cannot convert to OpenCV type. Return invalid type!"; + } + + return res; +} // namespace dataset DataType DataType::FromCVType(int cv_type) { auto depth = static_cast(cv_type) & static_cast(CV_MAT_DEPTH_MASK); @@ -176,72 +100,17 @@ DataType::DataType(const std::string &type_str) { type_ = DE_FLOAT32; else if (type_str == "float64") type_ = DE_FLOAT64; + else if (type_str == "string") + type_ = DE_STRING; else type_ = DE_UNKNOWN; } std::string DataType::ToString() const { - switch (type_) { - case DataType::DE_BOOL: - return "bool"; - case DataType::DE_INT8: - return "int8"; - case DataType::DE_UINT8: - return "uint8"; - case DataType::DE_INT16: - return "int16"; - case DataType::DE_UINT16: - return "uint16"; - case DataType::DE_INT32: - return "int32"; - case DataType::DE_UINT32: - return "uint32"; - case DataType::DE_INT64: - return "int64"; - case DataType::DE_UINT64: - return "uint64"; - case DataType::DE_FLOAT16: - return "float16"; - case DataType::DE_FLOAT32: - return "float32"; - case DataType::DE_FLOAT64: - return "float64"; - case DataType::DE_UNKNOWN: - return "unknown"; - default: - return "unknown"; - } -} - -DataType DataType::FromNpType(const py::dtype &type) { - if (type.is(py::dtype("bool"))) { - return DataType(DataType::DE_BOOL); - } else if (type.is(py::dtype("int8"))) { - return DataType(DataType::DE_INT8); - } else if (type.is(py::dtype("uint8"))) { - return DataType(DataType::DE_UINT8); - } else if (type.is(py::dtype("int16"))) { - return DataType(DataType::DE_INT16); - } else if (type.is(py::dtype("uint16"))) { - return DataType(DataType::DE_UINT16); - } else if (type.is(py::dtype("int32"))) { - return DataType(DataType::DE_INT32); - } else if (type.is(py::dtype("uint32"))) { - return DataType(DataType::DE_UINT32); - } else if (type.is(py::dtype("int64"))) { - return DataType(DataType::DE_INT64); - } else if (type.is(py::dtype("uint64"))) { - return DataType(DataType::DE_UINT64); - } else if (type.is(py::dtype("float16"))) { - return DataType(DataType::DE_FLOAT16); - } else if (type.is(py::dtype("float32"))) { - return DataType(DataType::DE_FLOAT32); - } else if (type.is(py::dtype("double"))) { - return DataType(DataType::DE_FLOAT64); - } else { - MS_LOG(ERROR) << "Cannot convert from numpy type. 
Unknown data type is returned!"; - return DataType(DataType::DE_UNKNOWN); - } + if (type_ < DataType::NUM_OF_TYPES) + return TO_STRINGS[type_]; + else + return "unknown"; } DataType DataType::FromNpArray(const py::array &arr) { @@ -269,6 +138,8 @@ DataType DataType::FromNpArray(const py::array &arr) { return DataType(DataType::DE_FLOAT32); } else if (py::isinstance>(arr)) { return DataType(DataType::DE_FLOAT64); + } else if (arr.dtype().kind() == 'S') { + return DataType(DataType::DE_STRING); } else { MS_LOG(ERROR) << "Cannot convert from numpy type. Unknown data type is returned!"; return DataType(DataType::DE_UNKNOWN); @@ -276,36 +147,16 @@ DataType DataType::FromNpArray(const py::array &arr) { } std::string DataType::GetPybindFormat() const { - switch (type_) { - case DataType::DE_BOOL: - return py::format_descriptor::format(); - case DataType::DE_INT8: - return py::format_descriptor::format(); - case DataType::DE_UINT8: - return py::format_descriptor::format(); - case DataType::DE_INT16: - return py::format_descriptor::format(); - case DataType::DE_UINT16: - return py::format_descriptor::format(); - case DataType::DE_INT32: - return py::format_descriptor::format(); - case DataType::DE_UINT32: - return py::format_descriptor::format(); - case DataType::DE_INT64: - return py::format_descriptor::format(); - case DataType::DE_UINT64: - return py::format_descriptor::format(); - case DataType::DE_FLOAT16: - // Eigen 3.3.7 doesn't support py::format_descriptor::format() - return "e"; - case DataType::DE_FLOAT32: - return py::format_descriptor::format(); - case DataType::DE_FLOAT64: - return py::format_descriptor::format(); - default: - MS_LOG(ERROR) << "Cannot convert from data type to pybind format descriptor!"; - return ""; + std::string res; + if (type_ < DataType::NUM_OF_TYPES) { + res = PYBIND_FORMAT_DESCRIPTOR[type_]; + } + + if (res.empty()) { + MS_LOG(ERROR) << "Cannot convert from data type to pybind format descriptor!"; } + return res; } + } // namespace dataset } // namespace mindspore diff --git a/mindspore/ccsrc/dataset/core/data_type.h b/mindspore/ccsrc/dataset/core/data_type.h index 70a9ffdedf..eb4bc24c77 100644 --- a/mindspore/ccsrc/dataset/core/data_type.h +++ b/mindspore/ccsrc/dataset/core/data_type.h @@ -16,18 +16,25 @@ #ifndef DATASET_CORE_DATA_TYPE_H_ #define DATASET_CORE_DATA_TYPE_H_ +#include + #include + #include "pybind11/numpy.h" #include "pybind11/pybind11.h" + +#include "dataset/core/constants.h" #include "dataset/core/pybind_support.h" namespace py = pybind11; namespace mindspore { namespace dataset { + // Class that represents basic data types in DataEngine. 
class DataType { public: enum Type : uint8_t { + DE_UNKNOWN = 0, DE_BOOL, DE_INT8, DE_UINT8, @@ -40,20 +47,60 @@ class DataType { DE_FLOAT16, DE_FLOAT32, DE_FLOAT64, - DE_UNKNOWN + DE_STRING, + NUM_OF_TYPES }; - static constexpr uint8_t DE_BOOL_SIZE = 1; - static constexpr uint8_t DE_UINT8_SIZE = 1; - static constexpr uint8_t DE_INT8_SIZE = 1; - static constexpr uint8_t DE_UINT16_SIZE = 2; - static constexpr uint8_t DE_INT16_SIZE = 2; - static constexpr uint8_t DE_UINT32_SIZE = 4; - static constexpr uint8_t DE_INT32_SIZE = 4; - static constexpr uint8_t DE_INT64_SIZE = 8; - static constexpr uint8_t DE_UINT64_SIZE = 8; - static constexpr uint8_t DE_FLOAT32_SIZE = 4; - static constexpr uint8_t DE_FLOAT64_SIZE = 8; + inline static constexpr uint8_t SIZE_IN_BYTES[] = {0, // DE_UNKNOWN + 1, // DE_BOOL + 1, // DE_INT8 + 1, // DE_UINT8 + 2, // DE_INT16 + 2, // DE_UINT16 + 4, // DE_INT32 + 4, // DE_UINT32 + 8, // DE_INT64 + 8, // DE_UINT64 + 2, // DE_FLOAT16 + 4, // DE_FLOAT32 + 8, // DE_FLOAT64 + 0}; // DE_STRING + + inline static const char *TO_STRINGS[] = {"unknown", "bool", "int8", "uint8", "int16", "uint16", "int32", + "uint32", "int64", "uint64", "float16", "float32", "float64", "string"}; + + inline static const char *PYBIND_TYPES[] = {"object", "bool", "int8", "uint8", "int16", "uint16", "int32", + "uint32", "int64", "uint64", "float16", "float32", "double", "bytes"}; + + inline static const std::string PYBIND_FORMAT_DESCRIPTOR[] = {"", // DE_UNKNOWN + py::format_descriptor::format(), // DE_BOOL + py::format_descriptor::format(), // DE_INT8 + py::format_descriptor::format(), // DE_UINT8 + py::format_descriptor::format(), // DE_INT16 + py::format_descriptor::format(), // DE_UINT16 + py::format_descriptor::format(), // DE_INT32 + py::format_descriptor::format(), // DE_UINT32 + py::format_descriptor::format(), // DE_INT64 + py::format_descriptor::format(), // DE_UINT64 + "e", // DE_FLOAT16 + py::format_descriptor::format(), // DE_FLOAT32 + py::format_descriptor::format(), // DE_FLOAT64 + "S"}; // DE_STRING + + inline static constexpr uint8_t CV_TYPES[] = {kCVInvalidType, // DE_UNKNOWN + CV_8U, // DE_BOOL + CV_8S, // DE_INT8 + CV_8U, // DE_UINT8 + CV_16S, // DE_INT16 + CV_16U, // DE_UINT16 + CV_32S, // DE_INT32 + kCVInvalidType, // DE_UINT32 + kCVInvalidType, // DE_INT64 + kCVInvalidType, // DE_UINT64 + CV_16F, // DE_FLOAT16 + CV_32F, // DE_FLOAT32 + CV_64F, // DE_FLOAT64 + kCVInvalidType}; // DE_STRING // No arg constructor to create an unknown shape DataType() : type_(DE_UNKNOWN) {} @@ -160,6 +207,8 @@ class DataType { bool IsBool() const { return type_ == DataType::DE_BOOL; } + bool IsNumeric() const { return type_ != DataType::DE_STRING; } + Type value() const { return type_; } private: @@ -226,6 +275,11 @@ inline bool DataType::IsCompatible() const { return type_ == DataType::DE_UINT8; } +template <> +inline bool DataType::IsCompatible() const { + return type_ == DataType::DE_STRING; +} + template <> inline bool DataType::IsLooselyCompatible() const { return type_ == DataType::DE_BOOL; diff --git a/mindspore/ccsrc/dataset/core/tensor.cc b/mindspore/ccsrc/dataset/core/tensor.cc index 3f41f27726..083fe85364 100644 --- a/mindspore/ccsrc/dataset/core/tensor.cc +++ b/mindspore/ccsrc/dataset/core/tensor.cc @@ -15,6 +15,7 @@ */ #include "dataset/core/tensor.h" +#include #include #include #include @@ -60,7 +61,7 @@ Tensor::Tensor(const TensorShape &shape, const DataType &type, const unsigned ch if (data != nullptr) { // Given the shape/type of this tensor, compute the data size and copy in the 
input bytes. int64_t byte_size = this->SizeInBytes(); - static_cast(this->StartAddr()); // Allocates data_ inside itself + static_cast(this->GetMutableBuffer()); // Allocates data_ inside itself if (data_ != nullptr) { int ret_code = memcpy_s(data_, byte_size, data, byte_size); if (ret_code != 0) { @@ -75,7 +76,7 @@ Tensor::Tensor(const TensorShape &shape, const DataType &type, const unsigned ch Tensor::Tensor(Tensor &&other) noexcept : shape_(other.shape()), type_(other.type()), - data_(other.StartAddr()), + data_(other.GetMutableBuffer()), data_allocator_(std::move(other.data_allocator_)) { other.Invalidate(); } @@ -84,7 +85,7 @@ Tensor &Tensor::operator=(Tensor &&other) noexcept { if (&other != this) { shape_ = other.shape(); type_ = other.type(); - data_ = other.StartAddr(); + data_ = other.GetMutableBuffer(); data_end_ = other.data_end_; data_allocator_ = std::move(other.data_allocator_); other.Invalidate(); @@ -92,6 +93,37 @@ Tensor &Tensor::operator=(Tensor &&other) noexcept { return *this; } +Tensor::Tensor(const std::vector &strings, const TensorShape &shape) + : Tensor(TensorShape({static_cast(strings.size())}), DataType(DataType::DE_STRING)) { + auto length_sum = [](dsize_t sum, const std::string &s) { return s.length() + sum; }; + dsize_t total_length = std::accumulate(strings.begin(), strings.end(), 0, length_sum); + + dsize_t num_bytes = (kOffsetSize + 1) * shape_.NumOfElements() + total_length; + + data_ = data_allocator_->allocate(num_bytes); + + auto offset_arr = reinterpret_cast(data_); + uchar *buf = GetStringsBuffer(); + + offset_t offset = -1; + uint32_t i = 0; + for (const auto &str : strings) { + // insert the end index of the string + // end index of a string is the end index of previous string + the length (including \0) + offset = offset + str.length() + 1; + offset_arr[i++] = offset; + // total bytes are reduced by kOffsetSize + num_bytes -= kOffsetSize; + // insert actual string + memcpy_s(buf, num_bytes, str.c_str(), str.length() + 1); + buf += str.length() + 1; + num_bytes -= str.length() + 1; + } + this->data_end_ = buf; + DS_ASSERT(num_bytes == 0); + if (shape.known()) Tensor::Reshape(shape); +} + Status Tensor::CreateTensor(std::shared_ptr *ptr, TensorImpl tensor_impl, const TensorShape &shape, DataType type, const unsigned char *data) { if (!shape.known()) { @@ -120,8 +152,28 @@ Status Tensor::CreateTensor(std::shared_ptr *ptr, TensorImpl tensor_impl } return Status::OK(); // returns base-class shared_ptr } +std::string to(std::string x) { return x; } +Status Tensor::CreateTensorFromNumpyString(std::shared_ptr *ptr, py::array arr) { + std::vector shape; + for (dsize_t i = 0; i < arr.ndim(); i++) { + shape.push_back(static_cast(arr.shape()[i])); + } + arr.resize({arr.size()}); + auto itr = arr.begin(); + std::vector strings; + for (; itr != arr.end(); itr++) { + std::string s = to(py::cast(*itr)); + strings.push_back(s); + } + arr.resize(shape); + + return CreateTensor(ptr, strings, TensorShape{shape}); +} Status Tensor::CreateTensor(std::shared_ptr *ptr, py::array arr) { + if (DataType::FromNpArray(arr) == DataType::DE_STRING) { + return CreateTensorFromNumpyString(ptr, arr); + } const TensorAlloc *alloc = GlobalContext::Instance()->tensor_allocator(); *ptr = std::allocate_shared(*alloc, TensorShape({}), DataType(DataType::DE_UNKNOWN)); @@ -138,7 +190,7 @@ Status Tensor::CreateTensor(std::shared_ptr *ptr, py::array arr) { std::shared_ptr global_pool = GlobalContext::Instance()->mem_pool(); (*ptr)->data_allocator_ = std::make_unique>(global_pool); - 
static_cast((*ptr)->StartAddr()); + static_cast((*ptr)->GetMutableBuffer()); int64_t byte_size = (*ptr)->SizeInBytes(); unsigned char *data = static_cast(arr.request().ptr); if ((*ptr)->data_ == nullptr) { @@ -173,6 +225,13 @@ Status Tensor::CreateTensor(std::shared_ptr *ptr, py::array arr) { return Status::OK(); // returns base-class shared_ptr } +Status Tensor::CreateTensor(std::shared_ptr *ptr, const std::vector &strings, + const TensorShape &shape) { + const TensorAlloc *alloc = GlobalContext::Instance()->tensor_allocator(); + *ptr = std::allocate_shared(*alloc, strings, shape); + return Status::OK(); +} + // Memcpy the given strided array's used part to consecutive memory // Consider a 3-d array // A[(i * shape[1] + j) * shape[2] + k] = B[i][j][k] = C[i * strides[0] + j * strides[1] + k * strides[2]] @@ -264,6 +323,12 @@ void Tensor::PrintItemAt(const std::vector &index, std::ostream &out) c CASE_PRINT(DataType::DE_FLOAT64, double); + case DataType::DE_STRING: { + std::string_view o{""}; + GetItemAt(&o, index); + out << "\"" << o << "\""; + break; + } default: { out << "?"; break; @@ -324,12 +389,12 @@ Status Tensor::ToFlatIndex(const std::vector &index, dsize_t *flat_inde return Status::OK(); } -const unsigned char *Tensor::StartAddr() const { +const unsigned char *Tensor::GetBuffer() const { // This version cannot modify anything. data_ could possibly be null. return data_; } -unsigned char *Tensor::StartAddr() { +unsigned char *Tensor::GetMutableBuffer() { if (!shape_.known() || type_ == DataType::DE_UNKNOWN) { return nullptr; } @@ -381,6 +446,25 @@ Status Tensor::GetItemPtr(T **ptr, const std::vector &index) const { dsize_t flat_idx; RETURN_IF_NOT_OK(ToFlatIndex(index, &flat_idx)); *ptr = reinterpret_cast(data_ + flat_idx * type_.SizeInBytes()); + + return Status::OK(); + } else { + std::string err = "data type not compatible"; + RETURN_STATUS_UNEXPECTED(err); + } +} + +Status Tensor::GetItemPtr(uchar **ptr, const std::vector &index, offset_t *length) const { + if (type_ == DataType::DE_STRING) { + if (data_ == nullptr) { + std::string err = "Data is not allocated yet"; + RETURN_STATUS_UNEXPECTED(err); + } + dsize_t flat_idx; + RETURN_IF_NOT_OK(ToFlatIndex(index, &flat_idx)); + offset_t length_temp = 0; + RETURN_IF_NOT_OK(GetStringAt(flat_idx, ptr, &length_temp)); + if (length != nullptr) *length = length_temp; return Status::OK(); } else { std::string err = "data type not compatible"; @@ -389,23 +473,27 @@ Status Tensor::GetItemPtr(T **ptr, const std::vector &index) const { } Status Tensor::StartAddrOfIndex(std::vector ind, uchar **start_addr_of_index, TensorShape *remaining) { + if (type() == DataType::DE_STRING) { + RETURN_STATUS_UNEXPECTED("StartAddrOfIndex does not support string tensors yet."); + } dsize_t flat_ind; std::vector t_shape = shape().AsVector(); std::vector r(t_shape.begin() + ind.size(), t_shape.end()); *remaining = TensorShape(r); ind.resize(this->Rank(), 0); // same as -> while (ind.size() < this->Rank()) ind.push_back(0); RETURN_IF_NOT_OK(ToFlatIndex(ind, &flat_ind)); - // check if StartAddr() returns null, we should flag this as an error, this sanity check will only + // check if GetBuffer() returns null, we should flag this as an error, this sanity check will only // be true is the tensor failed to allocate memory. 
- if (StartAddr() == nullptr) { - RETURN_STATUS_UNEXPECTED("Invalid StartAddr in Tensor, got nullptr"); + if (GetMutableBuffer() == nullptr) { + RETURN_STATUS_UNEXPECTED("Invalid GetBuffer in Tensor, got nullptr"); } - *start_addr_of_index = StartAddr() + flat_ind * this->type().SizeInBytes(); + *start_addr_of_index = GetMutableBuffer() + flat_ind * this->type().SizeInBytes(); return Status::OK(); } Status Tensor::InsertTensor(const std::vector &ind, const std::shared_ptr &tensor) { std::string err_msg; + err_msg += (this->type() == DataType::DE_STRING) ? "[Tensor] Cannot batch tensors of type string\n" : ""; err_msg += (!this->shape().known() || !tensor->shape().known()) ? "[Tensor] unknown shape\n" : ""; err_msg += (ind.size() + tensor->Rank() != this->Rank()) ? "[Tensor] incorrect index\n" : ""; err_msg += tensor->type().SizeInBytes() != this->type().SizeInBytes() ? "[Tensor] incorrect datatype\n" : ""; @@ -418,7 +506,8 @@ Status Tensor::InsertTensor(const std::vector &ind, const std::shared_p RETURN_STATUS_UNEXPECTED(err_msg); } else { if (start_addr_of_ind != nullptr) { - int ret_code = memcpy_s(start_addr_of_ind, tensor->SizeInBytes(), tensor->StartAddr(), tensor->SizeInBytes()); + int ret_code = + memcpy_s(start_addr_of_ind, tensor->SizeInBytes(), tensor->GetMutableBuffer(), tensor->SizeInBytes()); if (ret_code == 0) { return Status::OK(); } else { @@ -446,21 +535,20 @@ Status Tensor::ExpandDim(const dsize_t &axis) { } std::vector Tensor::Strides() { - std::vector strides(Rank()); - dsize_t count = shape_.NumOfElements(); - for (dsize_t i = 0; i < Rank(); i++) { - count /= shape_[i]; - strides[i] = type_.SizeInBytes() * count; - } + std::vector strides = shape_.Strides(); + uint8_t size = type_.SizeInBytes(); + std::transform(strides.begin(), strides.end(), strides.begin(), [&size](const auto &c) { return c * size; }); return strides; } Status Tensor::GetBufferInfo(Tensor &t, py::buffer_info *out) { + CHECK_FAIL_RETURN_UNEXPECTED(t.type().IsNumeric(), "Cannot use GetBufferInfo on tensor of strings."); + std::string format_desc = t.type().GetPybindFormat(); if (format_desc.empty()) { RETURN_STATUS_UNEXPECTED("Cannot convert DE type tp pybind format"); } - *out = py::buffer_info(t.StartAddr(), /* Pointer to buffer */ + *out = py::buffer_info(t.GetMutableBuffer(), /* Pointer to buffer */ t.type().SizeInBytes(), /* Size of one scalar */ format_desc, /* Python struct-style format descriptor */ t.Rank(), /* Number of dimensions */ @@ -495,6 +583,18 @@ Status Tensor::GetItemAt(T *o, const std::vector &index) const { return Status::OK(); } +Status Tensor::GetItemAt(std::string_view *o, const std::vector &index) const { + RETURN_UNEXPECTED_IF_NULL(data_); + RETURN_UNEXPECTED_IF_NULL(o); + CHECK_FAIL_RETURN_UNEXPECTED(type_ == DataType::DE_STRING, "Type is not DE_STRING"); + + uchar *buf = nullptr; + offset_t length = 0; + RETURN_IF_NOT_OK(GetItemPtr(&buf, index, &length)); + std::string_view sv{reinterpret_cast(buf), length}; + o->swap(sv); + return Status::OK(); +} // return data as numpy, should return status Status Tensor::GetDataAsNumpy(py::array *data) { RETURN_UNEXPECTED_IF_NULL(data_); @@ -523,11 +623,36 @@ Status Tensor::GetDataAsNumpy(py::array *data) { *data = py::array_t(shape_.AsVector(), reinterpret_cast(data_)); } else if (type_ == DataType::DE_FLOAT64) { *data = py::array_t(shape_.AsVector(), reinterpret_cast(data_)); + } else if (type_ == DataType::DE_STRING) { + GetDataAsNumpyStrings(data); } else { RETURN_STATUS_UNEXPECTED("Got unexpected type when returning numpy"); } 
return Status::OK(); } +Status Tensor::GetDataAsNumpyStrings(py::array *data) { + auto itr = begin(); + uint64_t max = 0; + for (; itr != end(); itr++) { + max = std::max((*itr).length(), max); + } + uint64_t total_size = shape_.NumOfElements() * max; + char *tmp_data = reinterpret_cast(data_allocator_->allocate(total_size)); + if (tmp_data == nullptr) RETURN_STATUS_UNEXPECTED("Cannot create temp array."); + memset(tmp_data, 0, total_size); + + itr = begin(); + uint64_t i = 0; + for (; itr != end(); itr++) { + (void)memcpy_s(tmp_data + i * max, total_size, (*itr).data(), (*itr).length()); + i++; + } + auto strides = shape_.Strides(); + std::transform(strides.begin(), strides.end(), strides.begin(), [&max](const auto &s) { return s * max; }); + *data = py::array(py::dtype("S" + std::to_string(max)), shape_.AsVector(), strides, tmp_data); + data_allocator_->deallocate(reinterpret_cast(tmp_data)); + return Status::OK(); +} void Tensor::Squeeze() { shape_ = shape_.Squeeze(); } @@ -647,5 +772,19 @@ Status Tensor::GetFloatAt(T *o, const std::vector &index) const { } return Status::OK(); } +Status Tensor::GetStringAt(dsize_t index, uchar **string_start, offset_t *length) const { + CHECK_FAIL_RETURN_UNEXPECTED(type_ == DataType::DE_STRING, "Type is not string"); + RETURN_UNEXPECTED_IF_NULL(data_); + RETURN_UNEXPECTED_IF_NULL(string_start); + RETURN_UNEXPECTED_IF_NULL(length); + auto *offset_ptr = reinterpret_cast(data_); // offsets start here + offset_t end = offset_ptr[index]; + offset_t start = 0; + if (index != 0) start = offset_ptr[index - 1] + 1; // string starts where the previous string ends + 1 + uchar *buf = GetStringsBuffer(); // string data starts here + *string_start = buf + start; + *length = end - start; + return Status::OK(); +} } // namespace dataset } // namespace mindspore diff --git a/mindspore/ccsrc/dataset/core/tensor.h b/mindspore/ccsrc/dataset/core/tensor.h index 4a41d4bd20..1f3a2a40f8 100644 --- a/mindspore/ccsrc/dataset/core/tensor.h +++ b/mindspore/ccsrc/dataset/core/tensor.h @@ -47,8 +47,6 @@ using TensorRow = std::vector>; // A row is a set of using TensorTable = std::vector; // The table of tensors is a vector of rows using TensorQTable = std::deque; // A different flavour of tensor table, this one has queue functionality -// Tensor base class which holds the data in an unsigned char* buffer. - class Tensor { public: Tensor() = delete; @@ -74,6 +72,27 @@ class Tensor { Tensor &operator=(Tensor &&other) noexcept; + // type of offset values used to store string information + using offset_t = uint32_t; + // constant for the size of the offset type + static constexpr uint8_t kOffsetSize = sizeof(offset_t); + // Tensor base class which holds the data in an unsigned char* buffer. + + // Construct a scalar string Tensor + explicit Tensor(const std::string &str) : Tensor(std::vector{str}, TensorShape::CreateScalar()) {} + + // Construct a tensor from a list of strings. Reshape the tensor with `shape` if given, otherwise assume the shape is + // the size of the vector `strings`. + // The memory layout of a Tensor of strings consists of the Offset_array followed by the strings. + // OFFSET1, OFFSET2, ... String1, String2, ... 
+ // The value of each offset is the end index of the corresponding string + // Offsets is of type offest_t + // strings will ne null-terminated + // example: Tensor(['abc', 'de'], shape={2}, type=DE_STRING) + // 3 6 a b c \0 d e \0 + explicit Tensor(const std::vector &strings, + const TensorShape &shape = TensorShape::CreateUnknownRankShape()); + // A static factory method to create the given flavour of derived Tensor // Returns the base class reference for the Tensor. // @param ptr output argument to hold the created Tensor of given tensor_impl @@ -91,6 +110,17 @@ class Tensor { // @return Status Code static Status CreateTensor(std::shared_ptr *ptr, py::array arr); + // Helper function to create a tensor from Numpy of strings + static Status CreateTensorFromNumpyString(std::shared_ptr *ptr, py::array arr); + + // A static factory method to create a Tensor from a given list of strings. + // @param ptr output argument to hold the created Tensor + // @param strings elements of the tensor + // @param shape shape of the tensor + // @return Status Code + static Status CreateTensor(std::shared_ptr *ptr, const std::vector &strings, + const TensorShape &shape = TensorShape::CreateUnknownRankShape()); + // Copy raw data of a array based on shape and strides to the destination pointer // @param dst Pointer to the destination array where the content is to be copied // @param src Pointer to the source of strided array to be copied @@ -116,6 +146,11 @@ class Tensor { template Status GetItemAt(T *o, const std::vector &index) const; + // Get string located at `index`. + // @param index vector + // @return return std::string_view specified at index + Status GetItemAt(std::string_view *o, const std::vector &index) const; + template Status GetUnsignedIntAt(T *o, const std::vector &index) const; @@ -131,26 +166,44 @@ class Tensor { // @param value of type `T` template Status SetItemAt(const std::vector &index, const T &value) { - static_cast(StartAddr()); + static_cast(GetMutableBuffer()); T *ptr = nullptr; RETURN_IF_NOT_OK(GetItemPtr(&ptr, index)); *ptr = value; return Status::OK(); } - // fill tensor with Zeros + // set string item at location specified by index + // @param index + // @param value of type std::string + Status SetItemAt(const std::vector &index, const std::string &value) { + RETURN_UNEXPECTED_IF_NULL(data_); + uchar *ptr = nullptr; + offset_t length = 0; + RETURN_IF_NOT_OK(GetItemPtr(&ptr, index, &length)); + if (value.length() != length) { + RETURN_STATUS_UNEXPECTED("Length of the new string does not match the item."); + } + memcpy_s(reinterpret_cast(ptr), length, value.c_str(), length); + + return Status::OK(); + } + // fill tensor with Zeros. Does not support strings. Status Zero() { + CHECK_FAIL_RETURN_UNEXPECTED(type_ != DataType::DE_STRING, "Cannot use Zero on tensor of strings.."); dsize_t size = SizeInBytes(); - CHECK_FAIL_RETURN_UNEXPECTED(memset_sp(StartAddr(), size, 0, size) == 0, "Failed to fill tensor with zeroes."); + CHECK_FAIL_RETURN_UNEXPECTED(memset_sp(GetMutableBuffer(), size, 0, size) == 0, + "Failed to fill tensor with zeroes."); return Status::OK(); } - // Fill all elements in the Tensor with the given value of type `T` + // Fill all elements in the Tensor with the given value of type `T`. Does not support strings. 
// @tparam T // @param value template Status Fill(const T &value) { - static_cast(StartAddr()); + CHECK_FAIL_RETURN_UNEXPECTED(type_ != DataType::DE_STRING, "Cannot use fill on tensor of strings."); + static_cast(GetMutableBuffer()); int64_t cellSize = type_.SizeInBytes(); if ((data_ != nullptr) && type_.IsCompatible()) { for (dsize_t i = 0; i < Size(); i++) { @@ -177,7 +230,10 @@ class Tensor { dsize_t Size() const { return shape().NumOfElements(); } // @return the number of bytes this tensor is needs - dsize_t SizeInBytes() const { return Size() * type_.SizeInBytes(); } + dsize_t SizeInBytes() const { + if (data_end_ == nullptr) return type_.SizeInBytes() * shape_.NumOfElements(); + return data_end_ - data_; + } // @return the rank of the tensor dsize_t Rank() const { return shape().Rank(); } @@ -185,12 +241,12 @@ class Tensor { // Get the starting memory address as a constant for the data of the tensor. This potentially // drives an allocation if the data area. // @return const unsigned char* - const unsigned char *StartAddr() const; + const unsigned char *GetBuffer() const; // Get the starting memory address for the data of the tensor. This potentially // drives an allocation if the data area. // @return unsigned char* - unsigned char *StartAddr(); + unsigned char *GetMutableBuffer(); // Getter of the type // @return @@ -236,12 +292,12 @@ class Tensor { virtual void Squeeze(); - // Calculates the strides of the Tensor - // Ex: Tensor of shape <4,2,2> and type DE_UINT8 (1 byte) - // The strides will be {6,2,1}. - // Ex: Tensor of shape <4,2,2> and type DE_UINT32 (4 byte) - // The strides will be {24,8,4}. - // @return vector of integers + /// Calculates the strides of the Tensor + /// Ex: Tensor of shape <4,2,2> and type DE_UINT8 (1 byte) + /// The strides will be {6,2,1}. + /// Ex: Tensor of shape <4,2,2> and type DE_UINT32 (4 byte) + /// The strides will be {24,8,4}. + /// @return vector of integers std::vector Strides(); std::string ToString() { @@ -255,12 +311,14 @@ class Tensor { // @return Status code Status GetDataAsNumpy(py::array *data); + Status GetDataAsNumpyStrings(py::array *data); + static Status GetBufferInfo(Tensor &t, py::buffer_info *out); // TensorIterator is a linear iterator that can be used to iterate over the elements of the Tensor // The order elements is as the memory layout (i.e., row-major) [[1,2,3],[4,5,6] --> 1,2,3,4,5,6 // @tparam T type of values in the Tensor Iterator - template + template class TensorIterator { public: using iterator_category = std::random_access_iterator_tag; @@ -271,11 +329,14 @@ class Tensor { explicit TensorIterator(uchar *ptr = nullptr) { ptr_ = reinterpret_cast(ptr); } - TensorIterator(const TensorIterator &raw_iterator) = default; + TensorIterator(const TensorIterator &raw_iterator) { ptr_ = raw_iterator.ptr_; } ~TensorIterator() = default; - TensorIterator &operator=(const TensorIterator &rhs) = default; + TensorIterator &operator=(const TensorIterator &rhs) { + ptr_ = rhs.ptr_; + return *this; + } TensorIterator &operator=(T *rhs) { ptr_ = rhs; @@ -346,6 +407,99 @@ class Tensor { T *ptr_; }; + // Specialization of TensorIterator for strings. It returns std::string_view for every item. 
+ // @tparam DUMMY, used to mbe able to specialize the inner class + template + class TensorIterator { + public: + using iterator_category = std::random_access_iterator_tag; + using value_type = std::string_view; + using difference_type = ptrdiff_t; + using pointer = std::string_view *; + using reference = std::string_view &; + + explicit TensorIterator(uchar *offset = nullptr, const uchar *buf = nullptr, dsize_t index = 0) { + offset_ = reinterpret_cast(offset); + buf_ = reinterpret_cast(buf); + index_ = index; + } + + TensorIterator(const TensorIterator &raw_iterator) { + offset_ = raw_iterator.offset_; + buf_ = raw_iterator.buf_; + index_ = raw_iterator.index_; + } + + ~TensorIterator() = default; + + bool operator==(const TensorIterator &rhs) { + return buf_ == rhs.buf_ && offset_ == rhs.offset_ && index_ == rhs.index_; + } + + bool operator!=(const TensorIterator &rhs) { return !(*this == rhs); } + + operator bool() const { return offset_ != nullptr; } + + std::string_view operator*() const { + offset_t start = 0; + if (index_ != 0) start = offset_[index_ - 1] + 1; + return std::string_view{buf_ + start}; + } + + TensorIterator &operator+=(const dsize_t &inc) { + index_ += inc; + return *this; + } + + TensorIterator &operator-=(const dsize_t &inc) { + index_ -= inc; + return *this; + } + + TensorIterator &operator++() { + ++index_; + return *this; + } + + TensorIterator &operator--() { + --index_; + return *this; + } + + TensorIterator operator++(int) { + auto temp(*this); + ++index_; + return temp; + } + + TensorIterator operator--(int) { + auto temp(*this); + --index_; + return temp; + } + + TensorIterator operator+(const dsize_t &inc) { + auto oldPtr = index_; + index_ += inc; + auto temp(*this); + index_ = oldPtr; + return temp; + } + + TensorIterator operator-(const dsize_t &inc) { + auto oldPtr = index_; + index_ -= inc; + auto temp(*this); + index_ = oldPtr; + return temp; + } + + protected: + dsize_t index_; + offset_t *offset_; + const char *buf_; + }; + // Return a TensorIterator that points to the start of the Tensor. // It's the user responsibility to use the correct type that matches the Tensor type // @tparam T The type of values in the Tensor @@ -391,6 +545,22 @@ class Tensor { template Status GetItemPtr(T **, const std::vector &index) const; + // Get pointer to string located at `index` and the length of string + // @param index vector + // @return return a pointer to the string specified at index and the length of the string + Status GetItemPtr(uchar **, const std::vector &index, offset_t *length = nullptr) const; + + // Given a flat index of an item string, return the start and length of the item + // @param index flat index of the item + // @return start address of the ths string + // @return length of the string + Status GetStringAt(dsize_t index, uchar **string_start, offset_t *length) const; + + // Skip the offsets and returns the start of the buffer where the real strings is stored. Caller needs to check if the + // tensor's type is a string, otherwise undefined address would be returned. + // @return address of the first string of the tensor. 
+ uchar *GetStringsBuffer() const { return data_ + kOffsetSize * shape_.NumOfElements(); } + // all access to shape_ should be via shape TensorShape shape_; // data type of tensor @@ -402,6 +572,16 @@ class Tensor { // pointer to the end of the physical data unsigned char *data_end_ = nullptr; }; +template <> +inline Tensor::TensorIterator Tensor::begin() { + uchar *buf = GetStringsBuffer(); + return TensorIterator(data_, buf); +} +template <> +inline Tensor::TensorIterator Tensor::end() { + uchar *buf = GetStringsBuffer(); + return TensorIterator(data_, buf, shape_.NumOfElements()); +} } // namespace dataset } // namespace mindspore #endif // DATASET_CORE_TENSOR_H_ diff --git a/mindspore/ccsrc/dataset/core/tensor_shape.cc b/mindspore/ccsrc/dataset/core/tensor_shape.cc index 3a6514034f..e24b2bc12b 100644 --- a/mindspore/ccsrc/dataset/core/tensor_shape.cc +++ b/mindspore/ccsrc/dataset/core/tensor_shape.cc @@ -215,5 +215,17 @@ TensorShape TensorShape::Squeeze() const { } return TensorShape(new_shape); } +std::vector TensorShape::Strides() { + std::vector strides(Rank()); + dsize_t count = NumOfElements(); + for (dsize_t i = 0; i < Rank(); i++) { + if (raw_shape_[i] != 0) + count /= raw_shape_[i]; + else + count = 0; + strides[i] = count; + } + return strides; +} } // namespace dataset } // namespace mindspore diff --git a/mindspore/ccsrc/dataset/core/tensor_shape.h b/mindspore/ccsrc/dataset/core/tensor_shape.h index 230b36cda2..33dd0a81ee 100644 --- a/mindspore/ccsrc/dataset/core/tensor_shape.h +++ b/mindspore/ccsrc/dataset/core/tensor_shape.h @@ -156,6 +156,8 @@ class TensorShape { TensorShape Squeeze() const; + std::vector Strides(); + private: // True if known and valid shape, false otherwise bool known_; diff --git a/mindspore/ccsrc/dataset/engine/datasetops/batch_op.cc b/mindspore/ccsrc/dataset/engine/datasetops/batch_op.cc index 5080a719b4..13f3d4b2ba 100644 --- a/mindspore/ccsrc/dataset/engine/datasetops/batch_op.cc +++ b/mindspore/ccsrc/dataset/engine/datasetops/batch_op.cc @@ -74,6 +74,10 @@ Status BatchOp::operator()() { std::unique_ptr table = std::make_unique(); child_iterator_ = std::make_unique(this, 0, 0); RETURN_IF_NOT_OK(child_iterator_->FetchNextTensorRow(&new_row)); + for (const auto &t : new_row) { + CHECK_FAIL_RETURN_UNEXPECTED(t->type().IsNumeric(), + "[Batch ERROR] Batch does not support Tensor of type string yet."); + } RETURN_IF_NOT_OK(DatasetOp::AssignColMapFromChild()); // must come after the first fetch above int32_t cur_batch_size = 0; RETURN_IF_NOT_OK(GetBatchSize(&cur_batch_size, CBatchInfo(0, 0, 0))); @@ -445,8 +449,8 @@ Status BatchOp::PadHelper(std::shared_ptr src, std::shared_ptr d src_flat_ind += src_s[i] * cur_ind[i]; dst_flat_ind += dst_s[i] * cur_ind[i]; } - unsigned char *src_addr = src->StartAddr() + src_flat_ind * type_size; - unsigned char *dst_addr = dst->StartAddr() + dst_flat_ind * type_size; + unsigned char *src_addr = src->GetMutableBuffer() + src_flat_ind * type_size; + unsigned char *dst_addr = dst->GetMutableBuffer() + dst_flat_ind * type_size; CHECK_FAIL_RETURN_UNEXPECTED(memcpy_s(dst_addr, len, src_addr, len) == 0, "memcpy error"); } else { // not the last dimension, keep doing recursion dsize_t min_ind = std::min(dst->shape()[cur_dim], src->shape()[cur_dim]); diff --git a/mindspore/ccsrc/dataset/engine/datasetops/device_queue_op.cc b/mindspore/ccsrc/dataset/engine/datasetops/device_queue_op.cc index 064cd0cf6d..0815088fa5 100644 --- a/mindspore/ccsrc/dataset/engine/datasetops/device_queue_op.cc +++ 
b/mindspore/ccsrc/dataset/engine/datasetops/device_queue_op.cc @@ -85,6 +85,13 @@ Status DeviceQueueOp::operator()() { Status DeviceQueueOp::CheckExceptions(const std::unique_ptr &buffer) const { // this method checks if the buffer meets the conditions to be sent to TDT + if (buffer->NumRows() != 0) { + TensorRow row; + buffer->GetRow(0, &row); + for (const auto &item : row) { + CHECK_FAIL_RETURN_UNEXPECTED(item->type().IsNumeric(), "Cannot send tensor of string type to device."); + } + } return Status::OK(); } @@ -207,7 +214,7 @@ Status DeviceQueueOp::MallocForGPUData(std::vector *items, return Status(StatusCode::kUnexpectedError, __LINE__, __FILE__, "memory malloc failed."); } (void)memset_s(sub_item.data_ptr_, sub_item.data_len_, 0, sub_item.data_len_); - unsigned char *column_data = curr_row[i]->StartAddr(); + unsigned char *column_data = curr_row[i]->GetMutableBuffer(); if (memcpy_s(sub_item.data_ptr_, sub_item.data_len_, column_data, static_cast(curr_row[i++]->SizeInBytes())) != 0) { MS_LOG(ERROR) << "memcpy_s failed!"; diff --git a/mindspore/ccsrc/dataset/engine/datasetops/source/celeba_op.cc b/mindspore/ccsrc/dataset/engine/datasetops/source/celeba_op.cc index 06c2b06727..8c6d9fea85 100644 --- a/mindspore/ccsrc/dataset/engine/datasetops/source/celeba_op.cc +++ b/mindspore/ccsrc/dataset/engine/datasetops/source/celeba_op.cc @@ -407,7 +407,7 @@ Status CelebAOp::LoadTensorRow(const std::pair RETURN_IF_NOT_OK(Tensor::CreateTensor(&image, data_schema_->column(0).tensorImpl(), TensorShape(std::vector(1, num_elements)), data_schema_->column(0).type())); - (void)handle.read(reinterpret_cast(image->StartAddr()), num_elements); + (void)handle.read(reinterpret_cast(image->GetMutableBuffer()), num_elements); if (decode_ == true) { Status rc = Decode(image, &image); if (rc.IsError()) { diff --git a/mindspore/ccsrc/dataset/engine/datasetops/source/cifar_op.cc b/mindspore/ccsrc/dataset/engine/datasetops/source/cifar_op.cc index 4a0a882103..d0a17b56f9 100644 --- a/mindspore/ccsrc/dataset/engine/datasetops/source/cifar_op.cc +++ b/mindspore/ccsrc/dataset/engine/datasetops/source/cifar_op.cc @@ -197,7 +197,7 @@ Status CifarOp::LoadTensorRow(uint64_t index, TensorRow *trow) { std::shared_ptr fine_label; std::shared_ptr ori_image = cifar_image_label_pairs_[index].first; std::shared_ptr copy_image = - std::make_shared(ori_image->shape(), ori_image->type(), ori_image->StartAddr()); + std::make_shared(ori_image->shape(), ori_image->type(), ori_image->GetMutableBuffer()); RETURN_IF_NOT_OK(Tensor::CreateTensor(&label, data_schema_->column(1).tensorImpl(), data_schema_->column(1).shape(), data_schema_->column(1).type(), reinterpret_cast(&cifar_image_label_pairs_[index].second[0]))); @@ -394,7 +394,7 @@ Status CifarOp::ParseCifarData() { data_schema_->column(0).type())); for (int ch = 0; ch < kCifarImageChannel; ++ch) { for (int pix = 0; pix < kCifarImageHeight * kCifarImageWidth; ++pix) { - (image_tensor->StartAddr())[pix * kCifarImageChannel + ch] = block[cur_block_index++]; + (image_tensor->GetMutableBuffer())[pix * kCifarImageChannel + ch] = block[cur_block_index++]; } } cifar_image_label_pairs_.emplace_back(std::make_pair(image_tensor, labels)); diff --git a/mindspore/ccsrc/dataset/engine/datasetops/source/image_folder_op.cc b/mindspore/ccsrc/dataset/engine/datasetops/source/image_folder_op.cc index 4503855b34..77b94fb6ce 100644 --- a/mindspore/ccsrc/dataset/engine/datasetops/source/image_folder_op.cc +++ b/mindspore/ccsrc/dataset/engine/datasetops/source/image_folder_op.cc @@ -216,7 +216,7 @@ Status 
ImageFolderOp::LoadTensorRow(ImageLabelPair pairPtr, TensorRow *trow) { RETURN_IF_NOT_OK(Tensor::CreateTensor(&image, data_schema_->column(0).tensorImpl(), TensorShape(std::vector(1, num_elements)), data_schema_->column(0).type(), nullptr)); - (void)fs.read(reinterpret_cast(image->StartAddr()), num_elements); + (void)fs.read(reinterpret_cast(image->GetMutableBuffer()), num_elements); fs.close(); if (decode_ == true) { Status rc = Decode(image, &image); diff --git a/mindspore/ccsrc/dataset/engine/datasetops/source/manifest_op.cc b/mindspore/ccsrc/dataset/engine/datasetops/source/manifest_op.cc index 349ca7cf6e..5892b10701 100644 --- a/mindspore/ccsrc/dataset/engine/datasetops/source/manifest_op.cc +++ b/mindspore/ccsrc/dataset/engine/datasetops/source/manifest_op.cc @@ -210,7 +210,7 @@ Status ManifestOp::LoadTensorRow(const std::paircolumn(0).tensorImpl(), TensorShape(std::vector(1, num_elements)), data_schema_->column(0).type(), nullptr)); - (void)fs.read(reinterpret_cast(image->StartAddr()), num_elements); + (void)fs.read(reinterpret_cast(image->GetMutableBuffer()), num_elements); if (fs.fail()) { fs.close(); RETURN_STATUS_UNEXPECTED("Fail to read file: " + data.first); diff --git a/mindspore/ccsrc/dataset/engine/datasetops/source/mnist_op.cc b/mindspore/ccsrc/dataset/engine/datasetops/source/mnist_op.cc index 8ca65fe20f..217ebebbd7 100644 --- a/mindspore/ccsrc/dataset/engine/datasetops/source/mnist_op.cc +++ b/mindspore/ccsrc/dataset/engine/datasetops/source/mnist_op.cc @@ -170,7 +170,7 @@ Status MnistOp::LoadTensorRow(const MnistLabelPair &mnist_pair, TensorRow *trow) int32_t l = mnist_pair.second; // make a copy of cached tensor RETURN_IF_NOT_OK(Tensor::CreateTensor(&image, data_schema_->column(0).tensorImpl(), mnist_pair.first->shape(), - mnist_pair.first->type(), mnist_pair.first->StartAddr())); + mnist_pair.first->type(), mnist_pair.first->GetMutableBuffer())); RETURN_IF_NOT_OK(Tensor::CreateTensor(&label, data_schema_->column(1).tensorImpl(), data_schema_->column(1).shape(), data_schema_->column(1).type(), reinterpret_cast(&l))); (*trow) = {std::move(image), std::move(label)}; diff --git a/mindspore/ccsrc/dataset/engine/datasetops/source/random_data_op.cc b/mindspore/ccsrc/dataset/engine/datasetops/source/random_data_op.cc index 3d0da63544..9e3d1140a7 100644 --- a/mindspore/ccsrc/dataset/engine/datasetops/source/random_data_op.cc +++ b/mindspore/ccsrc/dataset/engine/datasetops/source/random_data_op.cc @@ -127,7 +127,7 @@ Status RandomDataOp::GenerateSchema() { // For each column: // - choose a datatype // - generate a shape that randomly chooses the number of dimensions and the dimension values. 
- DataType::Type newType = static_cast(GenRandomInt(0, kMaxDataType)); + DataType::Type newType = static_cast(GenRandomInt(0, DataType::NUM_OF_TYPES - 2)); int32_t rank = GenRandomInt(1, kMaxRank); std::vector dims; for (int32_t d = 0; d < rank; d++) { diff --git a/mindspore/ccsrc/dataset/engine/datasetops/source/random_data_op.h b/mindspore/ccsrc/dataset/engine/datasetops/source/random_data_op.h index a9566b9c9f..92d05d7318 100644 --- a/mindspore/ccsrc/dataset/engine/datasetops/source/random_data_op.h +++ b/mindspore/ccsrc/dataset/engine/datasetops/source/random_data_op.h @@ -43,7 +43,6 @@ class RandomDataOp : public ParallelOp { static constexpr int32_t kMaxNumColumns = 4; static constexpr int32_t kMaxRank = 4; static constexpr int32_t kMaxDimValue = 2048; - static constexpr int32_t kMaxDataType = (DataType::DE_UNKNOWN - 1); static constexpr int32_t kMaxTotalRows = 1024; // A nested builder class to aid in the construction of a RandomDataOp diff --git a/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/distributed_sampler.cc b/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/distributed_sampler.cc index 5b5a9321df..92dfbf594d 100644 --- a/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/distributed_sampler.cc +++ b/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/distributed_sampler.cc @@ -58,7 +58,7 @@ Status DistributedSampler::GetNextBuffer(std::unique_ptr *out_buffer (*out_buffer) = std::make_unique(cnt_, DataBuffer::kDeBFlagNone); std::shared_ptr sample_ids; RETURN_IF_NOT_OK(CreateSamplerTensor(&sample_ids, samples_per_buffer_)); - int64_t *id_ptr = reinterpret_cast(sample_ids->StartAddr()); + int64_t *id_ptr = reinterpret_cast(sample_ids->GetMutableBuffer()); while (cnt_ < samples_per_buffer_) { int64_t next_id = (num_devices_ * (cnt_++) + device_id_) % num_rows_; *(id_ptr++) = shuffle_ ? shuffle_vec_[static_cast(next_id)] : next_id; diff --git a/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/pk_sampler.cc b/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/pk_sampler.cc index 8198204437..f4c1189b8c 100644 --- a/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/pk_sampler.cc +++ b/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/pk_sampler.cc @@ -58,7 +58,7 @@ Status PKSampler::GetNextBuffer(std::unique_ptr *out_buffer) { int64_t last_id = (samples_per_buffer_ + next_id_ > num_pk_samples_) ? num_pk_samples_ : samples_per_buffer_ + next_id_; RETURN_IF_NOT_OK(CreateSamplerTensor(&sample_ids, last_id - next_id_)); - int64_t *id_ptr = reinterpret_cast(sample_ids->StartAddr()); + int64_t *id_ptr = reinterpret_cast(sample_ids->GetMutableBuffer()); while (next_id_ < last_id) { int64_t cls_id = next_id_++ / samples_per_class_; const std::vector &samples = label_to_ids_[labels_[cls_id]]; diff --git a/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/random_sampler.cc b/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/random_sampler.cc index e0efda6e53..967632a5d9 100644 --- a/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/random_sampler.cc +++ b/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/random_sampler.cc @@ -38,7 +38,7 @@ Status RandomSampler::GetNextBuffer(std::unique_ptr *out_buffer) { std::shared_ptr sampleIds; int64_t last_id = samples_per_buffer_ + next_id_ > num_samples_ ? 
num_samples_ : samples_per_buffer_ + next_id_; RETURN_IF_NOT_OK(CreateSamplerTensor(&sampleIds, last_id - next_id_)); - int64_t *id_ptr = reinterpret_cast(sampleIds->StartAddr()); + int64_t *id_ptr = reinterpret_cast(sampleIds->GetMutableBuffer()); for (int64_t i = 0; i < (last_id - next_id_); i++) { *(id_ptr + i) = replacement_ ? (*dist)(rnd_) : shuffled_ids_[static_cast(i + next_id_)]; } diff --git a/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/sampler.cc b/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/sampler.cc index 9fe752448a..93c8c305bc 100644 --- a/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/sampler.cc +++ b/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/sampler.cc @@ -40,7 +40,7 @@ Status Sampler::CreateSamplerTensor(std::shared_ptr *sample_ids, int64_t } TensorShape shape(std::vector(1, num_elements)); RETURN_IF_NOT_OK(Tensor::CreateTensor(sample_ids, col_desc_->tensorImpl(), shape, col_desc_->type())); - (void)(*sample_ids)->StartAddr(); // allocate memory in case user forgets! + (void)(*sample_ids)->GetMutableBuffer(); // allocate memory in case user forgets! return Status::OK(); } diff --git a/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/sequential_sampler.cc b/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/sequential_sampler.cc index 6ed06b527f..e405479360 100644 --- a/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/sequential_sampler.cc +++ b/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/sequential_sampler.cc @@ -31,7 +31,7 @@ Status SequentialSampler::GetNextBuffer(std::unique_ptr *out_buffer) std::shared_ptr sampleIds; int64_t lastId = (samples_per_buffer_ + next_id_ > num_samples_) ? num_samples_ : samples_per_buffer_ + next_id_; RETURN_IF_NOT_OK(CreateSamplerTensor(&sampleIds, lastId - next_id_)); - int64_t *idPtr = reinterpret_cast(sampleIds->StartAddr()); + int64_t *idPtr = reinterpret_cast(sampleIds->GetMutableBuffer()); while (next_id_ < lastId) { *(idPtr++) = next_id_++; } diff --git a/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/subset_random_sampler.cc b/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/subset_random_sampler.cc index c377fddb49..698edf5e68 100644 --- a/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/subset_random_sampler.cc +++ b/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/subset_random_sampler.cc @@ -78,7 +78,7 @@ Status SubsetRandomSampler::GetNextBuffer(std::unique_ptr *out_buffe RETURN_IF_NOT_OK(CreateSamplerTensor(&outputIds, last_id - sample_id_)); // Initialize tensor - int64_t *id_ptr = reinterpret_cast(outputIds->StartAddr()); + int64_t *id_ptr = reinterpret_cast(outputIds->GetMutableBuffer()); while (sample_id_ < last_id) { if (indices_[sample_id_] >= num_rows_) { std::string err_msg = diff --git a/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/weighted_random_sampler.cc b/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/weighted_random_sampler.cc index 06afc219e6..91fc7f7d81 100644 --- a/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/weighted_random_sampler.cc +++ b/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/weighted_random_sampler.cc @@ -111,7 +111,7 @@ Status WeightedRandomSampler::GetNextBuffer(std::unique_ptr *out_buf RETURN_IF_NOT_OK(CreateSamplerTensor(&outputIds, last_id - sample_id_)); // Initialize tensor. 
- int64_t *id_ptr = reinterpret_cast(outputIds->StartAddr()); + int64_t *id_ptr = reinterpret_cast(outputIds->GetMutableBuffer()); // Assign the data to tensor element. while (sample_id_ < last_id) { int64_t genId; diff --git a/mindspore/ccsrc/dataset/engine/datasetops/source/text_file_op.cc b/mindspore/ccsrc/dataset/engine/datasetops/source/text_file_op.cc index 9336446852..e51eb4e00d 100644 --- a/mindspore/ccsrc/dataset/engine/datasetops/source/text_file_op.cc +++ b/mindspore/ccsrc/dataset/engine/datasetops/source/text_file_op.cc @@ -146,10 +146,7 @@ Status TextFileOp::LoadTensor(const std::string &line, std::unique_ptrpush_back(std::move(tRow)); std::shared_ptr tensor; - RETURN_IF_NOT_OK( - Tensor::CreateTensor(&tensor, data_schema_->column(0).tensorImpl(), - TensorShape(std::vector(1, line.size())), data_schema_->column(0).type(), - const_cast(reinterpret_cast(common::SafeCStr(line))))); + RETURN_IF_NOT_OK(Tensor::CreateTensor(&tensor, {line}, TensorShape::CreateScalar())); (**tensor_table)[row][0] = std::move(tensor); return Status::OK(); } diff --git a/mindspore/ccsrc/dataset/engine/datasetops/source/tf_reader_op.cc b/mindspore/ccsrc/dataset/engine/datasetops/source/tf_reader_op.cc index aac2d08997..556aa53c73 100644 --- a/mindspore/ccsrc/dataset/engine/datasetops/source/tf_reader_op.cc +++ b/mindspore/ccsrc/dataset/engine/datasetops/source/tf_reader_op.cc @@ -759,7 +759,7 @@ Status TFReaderOp::LoadBytesList(const ColDescriptor ¤t_col, const dataeng RETURN_IF_NOT_OK(Tensor::CreateTensor(tensor, current_col.tensorImpl(), current_shape, current_col.type())); // Tensors are lazily allocated, this eagerly allocates memory for the tensor. - unsigned char *current_tensor_addr = (*tensor)->StartAddr(); + unsigned char *current_tensor_addr = (*tensor)->GetMutableBuffer(); int64_t tensor_bytes_remaining = (*num_elements) * pad_size; if (current_tensor_addr == nullptr) { @@ -878,7 +878,7 @@ Status TFReaderOp::LoadIntList(const ColDescriptor ¤t_col, const dataengin RETURN_IF_NOT_OK(Tensor::CreateTensor(tensor, current_col.tensorImpl(), current_shape, current_col.type())); // Tensors are lazily allocated, this eagerly allocates memory for the tensor. 
- (void)(*tensor)->StartAddr(); + (void)(*tensor)->GetMutableBuffer(); int64_t i = 0; auto it = (*tensor)->begin(); diff --git a/mindspore/ccsrc/dataset/engine/datasetops/source/voc_op.cc b/mindspore/ccsrc/dataset/engine/datasetops/source/voc_op.cc index 63deec26a9..2d700c0a94 100644 --- a/mindspore/ccsrc/dataset/engine/datasetops/source/voc_op.cc +++ b/mindspore/ccsrc/dataset/engine/datasetops/source/voc_op.cc @@ -388,7 +388,7 @@ Status VOCOp::ReadImageToTensor(const std::string &path, const ColDescriptor &co (void)fs.seekg(0, std::ios::beg); RETURN_IF_NOT_OK( Tensor::CreateTensor(tensor, col.tensorImpl(), TensorShape(std::vector(1, num_elements)), col.type())); - (void)fs.read(reinterpret_cast((*tensor)->StartAddr()), num_elements); + (void)fs.read(reinterpret_cast((*tensor)->GetMutableBuffer()), num_elements); fs.close(); if (decode_ == true) { Status rc = Decode(*tensor, tensor); diff --git a/mindspore/ccsrc/dataset/engine/tdt/tdt_plugin.cc b/mindspore/ccsrc/dataset/engine/tdt/tdt_plugin.cc index 27c408d681..e457de52ae 100644 --- a/mindspore/ccsrc/dataset/engine/tdt/tdt_plugin.cc +++ b/mindspore/ccsrc/dataset/engine/tdt/tdt_plugin.cc @@ -110,7 +110,7 @@ TdtStatus TdtPlugin::translate(const TensorRow &ts_row, std::vector &i data_item.tensorShape_ = dataShapes; data_item.tensorType_ = datatype; data_item.dataLen_ = ts->SizeInBytes(); - data_item.dataPtr_ = std::shared_ptr(reinterpret_cast(ts->StartAddr()), [](void *elem) {}); + data_item.dataPtr_ = std::shared_ptr(reinterpret_cast(ts->GetMutableBuffer()), [](void *elem) {}); items.emplace_back(data_item); MS_LOG(INFO) << "TDT data type is " << datatype << ", data shape is " << dataShapes << ", data length is " << ts->Size() << "."; diff --git a/mindspore/ccsrc/dataset/kernels/data/data_utils.cc b/mindspore/ccsrc/dataset/kernels/data/data_utils.cc index f1b190578a..03f1b99e2a 100644 --- a/mindspore/ccsrc/dataset/kernels/data/data_utils.cc +++ b/mindspore/ccsrc/dataset/kernels/data/data_utils.cc @@ -162,7 +162,7 @@ void CastFrom(const std::shared_ptr &input, std::shared_ptr *out Status TypeCast(const std::shared_ptr &input, std::shared_ptr *output, const DataType &data_type) { RETURN_IF_NOT_OK(Tensor::CreateTensor(output, TensorImpl::kFlexible, input->shape(), data_type)); - static_cast((*output)->StartAddr()); + static_cast((*output)->GetMutableBuffer()); switch (input->type().value()) { case DataType::DE_BOOL: CastFrom(input, output); @@ -211,7 +211,7 @@ Status ToFloat16(const std::shared_ptr &input, std::shared_ptr * // initiate new tensor for type cast DataType new_type = DataType("float16"); RETURN_IF_NOT_OK(Tensor::CreateTensor(output, TensorImpl::kFlexible, input->shape(), new_type)); - static_cast((*output)->StartAddr()); + static_cast((*output)->GetMutableBuffer()); auto in_itr = input->begin(); auto out_itr = (*output)->begin(); diff --git a/mindspore/ccsrc/dataset/kernels/image/image_utils.cc b/mindspore/ccsrc/dataset/kernels/image/image_utils.cc index 19cf0aecd9..a166f863b5 100644 --- a/mindspore/ccsrc/dataset/kernels/image/image_utils.cc +++ b/mindspore/ccsrc/dataset/kernels/image/image_utils.cc @@ -64,7 +64,7 @@ Status Flip(std::shared_ptr input, std::shared_ptr *output, int std::shared_ptr output_cv = std::make_shared(input_cv->shape(), input_cv->type()); RETURN_UNEXPECTED_IF_NULL(output_cv); - (void)output_cv->StartAddr(); + (void)output_cv->GetMutableBuffer(); if (input_cv->mat().data) { try { cv::flip(input_cv->mat(), output_cv->mat(), flip_code); @@ -125,10 +125,10 @@ bool HasJpegMagic(const unsigned char *data, size_t 
data_size) { } Status Decode(const std::shared_ptr &input, std::shared_ptr *output) { - if (input->StartAddr() == nullptr) { + if (input->GetMutableBuffer() == nullptr) { RETURN_STATUS_UNEXPECTED("Tensor is nullptr"); } - if (HasJpegMagic(input->StartAddr(), input->SizeInBytes())) { + if (HasJpegMagic(input->GetMutableBuffer(), input->SizeInBytes())) { return JpegCropAndDecode(input, output); } else { return DecodeCv(input, output); @@ -282,7 +282,7 @@ Status JpegCropAndDecode(const std::shared_ptr &input, std::shared_ptrStartAddr(), input->SizeInBytes()); + JpegSetSource(&cinfo, input->GetMutableBuffer(), input->SizeInBytes()); (void)jpeg_read_header(&cinfo, TRUE); RETURN_IF_NOT_OK(JpegSetColorSpace(&cinfo)); jpeg_calc_output_dimensions(&cinfo); @@ -311,7 +311,7 @@ Status JpegCropAndDecode(const std::shared_ptr &input, std::shared_ptr(ts, DataType(DataType::DE_UINT8)); const int buffer_size = output_tensor->SizeInBytes(); - JSAMPLE *buffer = static_cast(output_tensor->StartAddr()); + JSAMPLE *buffer = static_cast(output_tensor->GetMutableBuffer()); const int max_scanlines_to_read = skipped_scanlines + crop_h; // stride refers to output tensor, which has 3 components at most const int stride = crop_w * kOutNumComponents; diff --git a/mindspore/ccsrc/dataset/kernels/image/random_crop_decode_resize_op.cc b/mindspore/ccsrc/dataset/kernels/image/random_crop_decode_resize_op.cc index 732aaa9031..c11b5b5968 100644 --- a/mindspore/ccsrc/dataset/kernels/image/random_crop_decode_resize_op.cc +++ b/mindspore/ccsrc/dataset/kernels/image/random_crop_decode_resize_op.cc @@ -31,7 +31,7 @@ Status RandomCropDecodeResizeOp::Compute(const std::shared_ptr &input, s if (input == nullptr) { RETURN_STATUS_UNEXPECTED("input tensor is null"); } - if (!HasJpegMagic(input->StartAddr(), input->SizeInBytes())) { + if (!HasJpegMagic(input->GetMutableBuffer(), input->SizeInBytes())) { DecodeOp op(true); std::shared_ptr decoded; RETURN_IF_NOT_OK(op.Compute(input, &decoded)); @@ -43,7 +43,7 @@ Status RandomCropDecodeResizeOp::Compute(const std::shared_ptr &input, s jerr.pub.error_exit = JpegErrorExitCustom; try { jpeg_create_decompress(&cinfo); - JpegSetSource(&cinfo, input->StartAddr(), input->SizeInBytes()); + JpegSetSource(&cinfo, input->GetMutableBuffer(), input->SizeInBytes()); (void)jpeg_read_header(&cinfo, TRUE); jpeg_calc_output_dimensions(&cinfo); } catch (std::runtime_error &e) { diff --git a/tests/ut/cpp/dataset/CMakeLists.txt b/tests/ut/cpp/dataset/CMakeLists.txt index 3f7c194b19..ca1963a293 100644 --- a/tests/ut/cpp/dataset/CMakeLists.txt +++ b/tests/ut/cpp/dataset/CMakeLists.txt @@ -50,6 +50,7 @@ SET(DE_UT_SRCS storage_op_test.cc task_manager_test.cc tensor_test.cc + tensor_string_test.cc tensorshape_test.cc tfReader_op_test.cc to_float16_op_test.cc diff --git a/tests/ut/cpp/dataset/common/cvop_common.cc b/tests/ut/cpp/dataset/common/cvop_common.cc index 7f99ff27b1..7ee080dd68 100644 --- a/tests/ut/cpp/dataset/common/cvop_common.cc +++ b/tests/ut/cpp/dataset/common/cvop_common.cc @@ -60,7 +60,7 @@ void CVOpCommon::GetInputImage(std::string filename) { TensorShape in_shape({file_size}); raw_input_tensor_ = std::make_shared(in_shape, DataType(DataType::DE_UINT8)); - file.read(reinterpret_cast(raw_input_tensor_->StartAddr()), raw_input_tensor_->SizeInBytes()); + file.read(reinterpret_cast(raw_input_tensor_->GetMutableBuffer()), raw_input_tensor_->SizeInBytes()); raw_cv_image_ = cv::imread(filename, cv::ImreadModes::IMREAD_COLOR); input_tensor_ = std::dynamic_pointer_cast(std::make_shared(raw_cv_image_)); 
SwapRedAndBlue(input_tensor_, &input_tensor_); diff --git a/tests/ut/cpp/dataset/datatype_test.cc b/tests/ut/cpp/dataset/datatype_test.cc index ee49037a19..82843d4285 100644 --- a/tests/ut/cpp/dataset/datatype_test.cc +++ b/tests/ut/cpp/dataset/datatype_test.cc @@ -32,47 +32,47 @@ class MindDataTestDatatype : public UT::Common { TEST_F(MindDataTestDatatype, TestSizes) { - uint8_t x = DataType::DE_BOOL_SIZE; + uint8_t x = DataType::SIZE_IN_BYTES[DataType::DE_BOOL]; DataType d = DataType(DataType::DE_BOOL); ASSERT_EQ(x, 1); ASSERT_EQ(d.SizeInBytes(), x); - x = DataType::DE_INT8_SIZE; + x = DataType::SIZE_IN_BYTES[DataType::DE_INT8]; d = DataType(DataType::DE_INT8); ASSERT_EQ(x, 1); ASSERT_EQ(d.SizeInBytes(), x); - x = DataType::DE_UINT8_SIZE; + x = DataType::SIZE_IN_BYTES[DataType::DE_UINT8]; d = DataType(DataType::DE_UINT8); ASSERT_EQ(x, 1); ASSERT_EQ(d.SizeInBytes(), x); - x = DataType::DE_INT16_SIZE; + x = DataType::SIZE_IN_BYTES[DataType::DE_INT16]; d = DataType(DataType::DE_INT16); ASSERT_EQ(x, 2); ASSERT_EQ(d.SizeInBytes(), x); - x = DataType::DE_UINT16_SIZE; + x = DataType::SIZE_IN_BYTES[DataType::DE_UINT16]; d = DataType(DataType::DE_UINT16); ASSERT_EQ(x, 2); ASSERT_EQ(d.SizeInBytes(), x); - x = DataType::DE_INT32_SIZE; + x = DataType::SIZE_IN_BYTES[DataType::DE_INT32]; d = DataType(DataType::DE_INT32); ASSERT_EQ(x, 4); ASSERT_EQ(d.SizeInBytes(), x); - x = DataType::DE_UINT32_SIZE; + x = DataType::SIZE_IN_BYTES[DataType::DE_UINT32]; d = DataType(DataType::DE_UINT32); ASSERT_EQ(x, 4); ASSERT_EQ(d.SizeInBytes(), x); - x = DataType::DE_INT64_SIZE; + x = DataType::SIZE_IN_BYTES[DataType::DE_INT64]; d = DataType(DataType::DE_INT64); ASSERT_EQ(x, 8); ASSERT_EQ(d.SizeInBytes(), x); - x = DataType::DE_UINT64_SIZE; + x = DataType::SIZE_IN_BYTES[DataType::DE_UINT64]; d = DataType(DataType::DE_UINT64); ASSERT_EQ(x, 8); ASSERT_EQ(d.SizeInBytes(), x); - x = DataType::DE_FLOAT32_SIZE; + x = DataType::SIZE_IN_BYTES[DataType::DE_FLOAT32]; d = DataType(DataType::DE_FLOAT32); ASSERT_EQ(x, 4); ASSERT_EQ(d.SizeInBytes(), x); - x = DataType::DE_FLOAT64_SIZE; + x = DataType::SIZE_IN_BYTES[DataType::DE_FLOAT64]; d = DataType(DataType::DE_FLOAT64); ASSERT_EQ(x, 8); ASSERT_EQ(d.SizeInBytes(), x); diff --git a/tests/ut/cpp/dataset/image_folder_op_test.cc b/tests/ut/cpp/dataset/image_folder_op_test.cc index e149e687c6..dbe43ab355 100644 --- a/tests/ut/cpp/dataset/image_folder_op_test.cc +++ b/tests/ut/cpp/dataset/image_folder_op_test.cc @@ -74,7 +74,7 @@ Status Create1DTensor(std::shared_ptr *sample_ids, int64_t num_elements, RETURN_IF_NOT_OK( Tensor::CreateTensor(sample_ids, TensorImpl::kFlexible, shape, DataType(data_type), data)); if (data == nullptr) { - (*sample_ids)->StartAddr(); // allocate memory in case user forgets! + (*sample_ids)->GetMutableBuffer(); // allocate memory in case user forgets! 
} return Status::OK(); } diff --git a/tests/ut/cpp/dataset/map_op_test.cc b/tests/ut/cpp/dataset/map_op_test.cc index 271cbbe190..7a99007437 100644 --- a/tests/ut/cpp/dataset/map_op_test.cc +++ b/tests/ut/cpp/dataset/map_op_test.cc @@ -190,7 +190,7 @@ TEST_F(MindDataTestMapOp, TestByPosition) { EXPECT_EQ(tensor_list[i]->type(), golden_types[i]); EXPECT_EQ(tensor_list[i]->Rank(), golden_ranks[i]); EXPECT_EQ(tensor_list[i]->shape(), golden_shapes[i]); - EXPECT_NE(tensor_list[i]->StartAddr(), nullptr); + EXPECT_NE(tensor_list[i]->GetMutableBuffer(), nullptr); } } @@ -366,7 +366,7 @@ TEST_F(MindDataTestMapOp, Test1to3) { EXPECT_EQ(tensor_list[i]->type(), golden_types[i]); EXPECT_EQ(tensor_list[i]->Rank(), golden_ranks[i]); EXPECT_EQ(tensor_list[i]->shape(), golden_shapes[i]); - EXPECT_NE(tensor_list[i]->StartAddr(), nullptr); + EXPECT_NE(tensor_list[i]->GetMutableBuffer(), nullptr); } rc = di.FetchNextTensorRow(&tensor_list); EXPECT_TRUE(rc.IsOk()); @@ -700,7 +700,7 @@ TEST_F(MindDataTestMapOp, ImageFolder_Decode_Repeat_Resize) { MS_LOG(DEBUG) << "row:" << i << "\tlabel:" << label << "\n"; EXPECT_TRUE(img_class[(i % 44) / 11] == label); // Dump all the image into string, to be used as a comparison later. - result.append((char *) tensor_map["image"]->StartAddr(), (int64_t) tensor_map["image"]->Size()); + result.append((char *)tensor_map["image"]->GetMutableBuffer(), (int64_t) tensor_map["image"]->Size()); di.GetNextAsMap(&tensor_map); i++; } @@ -745,7 +745,7 @@ TEST_F(MindDataTestMapOp, ImageFolder_Decode_Repeat_Resize) { tensor_map["label"]->GetItemAt(&label, {}); MS_LOG(DEBUG) << "row:" << i << "\tlabel:" << label << "\n"; EXPECT_TRUE(img_class[(i % 44) / 11] == label); - result2.append((char *) tensor_map["image"]->StartAddr(), (int64_t) tensor_map["image"]->Size()); + result2.append((char *)tensor_map["image"]->GetMutableBuffer(), (int64_t) tensor_map["image"]->Size()); di2.GetNextAsMap(&tensor_map); i++; } diff --git a/tests/ut/cpp/dataset/random_crop_decode_resize_op_test.cc b/tests/ut/cpp/dataset/random_crop_decode_resize_op_test.cc index d7e0b16aff..988a83421b 100644 --- a/tests/ut/cpp/dataset/random_crop_decode_resize_op_test.cc +++ b/tests/ut/cpp/dataset/random_crop_decode_resize_op_test.cc @@ -57,8 +57,8 @@ TEST_F(MindDataTestRandomCropDecodeResizeOp, TestOp2) { for (int i = 0; i < 100; i++) { (void)crop_and_decode.Compute(raw_input_tensor_, &crop_and_decode_output); (void)decode_and_crop.Compute(input_tensor_, &decode_and_crop_output); - cv::Mat output1(target_height, target_width, CV_8UC3, crop_and_decode_output->StartAddr()); - cv::Mat output2(target_height, target_width, CV_8UC3, decode_and_crop_output->StartAddr()); + cv::Mat output1(target_height, target_width, CV_8UC3, crop_and_decode_output->GetMutableBuffer()); + cv::Mat output2(target_height, target_width, CV_8UC3, decode_and_crop_output->GetMutableBuffer()); long int mse_sum = 0; long int count = 0; int a, b; @@ -133,8 +133,8 @@ TEST_F(MindDataTestRandomCropDecodeResizeOp, TestOp1) { crop_and_decode_status = Crop(decoded, &decoded_and_cropped, x, y, crop_width, crop_height); decode_and_crop_status = JpegCropAndDecode(raw_input_tensor_, &cropped_and_decoded, x, y, crop_width, crop_height); { - cv::Mat M1(crop_height, crop_width, CV_8UC3, decoded_and_cropped->StartAddr()); - cv::Mat M2(crop_height, crop_width, CV_8UC3, cropped_and_decoded->StartAddr()); + cv::Mat M1(crop_height, crop_width, CV_8UC3, decoded_and_cropped->GetMutableBuffer()); + cv::Mat M2(crop_height, crop_width, CV_8UC3, 
cropped_and_decoded->GetMutableBuffer()); for (int i = 0; i < crop_height; ++i) { for (int j = 0; j < crop_width; ++j) { m1 = M1.at(i, j)[1]; diff --git a/tests/ut/cpp/dataset/stand_alone_samplers_test.cc b/tests/ut/cpp/dataset/stand_alone_samplers_test.cc index ea0ae78aef..6ab7d0498f 100644 --- a/tests/ut/cpp/dataset/stand_alone_samplers_test.cc +++ b/tests/ut/cpp/dataset/stand_alone_samplers_test.cc @@ -34,7 +34,7 @@ Status CreateINT64Tensor(std::shared_ptr *sample_ids, int64_t num_elemen RETURN_IF_NOT_OK(Tensor::CreateTensor(sample_ids, TensorImpl::kFlexible, shape, DataType(DataType::DE_INT64), data)); if (data == nullptr) { - (*sample_ids)->StartAddr(); // allocate memory in case user forgets! + (*sample_ids)->GetMutableBuffer(); // allocate memory in case user forgets! } return Status::OK(); } diff --git a/tests/ut/cpp/dataset/tensor_string_test.cc b/tests/ut/cpp/dataset/tensor_string_test.cc new file mode 100644 index 0000000000..8c58f68982 --- /dev/null +++ b/tests/ut/cpp/dataset/tensor_string_test.cc @@ -0,0 +1,153 @@ +/** + * Copyright 2019 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include +#include "dataset/core/client.h" +#include "common/common.h" +#include "gtest/gtest.h" +#include "securec.h" +#include "dataset/core/tensor.h" +#include "dataset/core/cv_tensor.h" +#include "dataset/core/data_type.h" +#include "dataset/util/de_error.h" + +using namespace mindspore::dataset; + +namespace py = pybind11; + +class MindDataTestStringTensorDE : public UT::Common { + public: + MindDataTestStringTensorDE() = default; + + void SetUp() override { GlobalInit(); } +}; + +TEST_F(MindDataTestStringTensorDE, Basics) { + std::shared_ptr t = std::make_shared("Hi"); + ASSERT_TRUE(t->shape() == TensorShape({})); + std::string_view s = ""; + t->GetItemAt(&s, {}); + ASSERT_TRUE(s == "Hi"); + + std::shared_ptr t2 = std::make_shared(std::vector{"Hi", "Bye"}); + ASSERT_TRUE(t2->shape() == TensorShape({2})); + t2->GetItemAt(&s, {0}); + ASSERT_TRUE(s == "Hi"); + t2->GetItemAt(&s, {1}); + ASSERT_TRUE(s == "Bye"); + + std::vector strings{"abc", "defg", "hi", "klmno", "123", "789"}; + std::shared_ptr t3 = std::make_shared(strings, TensorShape({2, 3})); + ASSERT_TRUE(t3->shape() == TensorShape({2, 3})); + uint32_t index = 0; + for (uint32_t i = 0; i < 2; i++) { + for (uint32_t j = 0; j < 3; j++) { + std::string_view s = ""; + t3->GetItemAt(&s, {i, j}); + ASSERT_TRUE(s == strings[index++]); + } + } +} + +TEST_F(MindDataTestStringTensorDE, Basics2) { + std::shared_ptr t = + std::make_shared(std::vector{"abc", "defg", "hi", "klmno", "123", "789"}, TensorShape({2, 3})); + ASSERT_TRUE(t->SizeInBytes() == 6 * 5 + 20); + std::vector offsets = {3, 8, 11, 17, 21, 25}; + uint32_t ctr = 0; + for (auto i : offsets) { + ASSERT_TRUE(*(reinterpret_cast(t->GetMutableBuffer() + ctr)) == i); + ctr += 4; + } + const char *buf = reinterpret_cast(t->GetMutableBuffer()) + 6 * 4; + std::vector starts = {0, 4, 9, 12, 18, 22}; + + uint32_t index = 0; + for (uint32_t i = 
0; i < 2; i++) { + for (uint32_t j = 0; j < 3; j++) { + std::string_view s = ""; + t->GetItemAt(&s, {i, j}); + ASSERT_TRUE(s.data() == buf + starts[index++]); + } + } +} + +TEST_F(MindDataTestStringTensorDE, Empty) { + std::vector strings{"abc", "defg", "", "", "123", ""}; + std::shared_ptr t = std::make_shared(strings, TensorShape({2, 3})); + // abc_defg___123__ + // 0123456789012345 + ASSERT_TRUE(t->SizeInBytes() == 6 * 5 + 10); + std::vector offsets = {3, 8, 9, 10, 14, 15}; + uint32_t ctr = 0; + for (auto i : offsets) { + ASSERT_TRUE(*(reinterpret_cast(t->GetMutableBuffer() + ctr)) == i); + ctr += 4; + } + const char *buf = reinterpret_cast(t->GetMutableBuffer()) + 6 * 4; + std::vector starts = {0, 4, 9, 10, 11, 15}; + + uint32_t index = 0; + for (uint32_t i = 0; i < 2; i++) { + for (uint32_t j = 0; j < 3; j++) { + std::string_view s = ""; + t->GetItemAt(&s, {i, j}); + ASSERT_TRUE(s.data() == buf + starts[index]); + ASSERT_TRUE(s == strings[index++]); + } + } +} + +TEST_F(MindDataTestStringTensorDE, SetItem) { + std::vector strings{"abc", "defg", "hi", "klmno", "123", "789"}; + std::shared_ptr t3 = std::make_shared(strings, TensorShape({2, 3})); + ASSERT_TRUE(t3->shape() == TensorShape({2, 3})); + + t3->SetItemAt({0, 1}, std::string{"xyzz"}); + strings[1] = "xyzz"; + + t3->SetItemAt({0, 2}, std::string{"07"}); + strings[2] = "07"; + + t3->SetItemAt({1, 2}, std::string{"987"}); + strings[5] = "987"; + + uint32_t index = 0; + for (uint32_t i = 0; i < 2; i++) { + for (uint32_t j = 0; j < 3; j++) { + std::string_view s = ""; + t3->GetItemAt(&s, {i, j}); + ASSERT_TRUE(s == strings[index++]); + } + } +} + +TEST_F(MindDataTestStringTensorDE, Iterator) { + std::vector strings{"abc", "defg", "hi", "klmno", "123", "789"}; + std::shared_ptr t = std::make_shared(strings, TensorShape({2, 3})); + uint32_t index = 0; + auto itr = t->begin(); + for (; itr != t->end(); itr++) { + ASSERT_TRUE(*itr == strings[index++]); + } + + index = 0; + itr = t->begin(); + for (; itr != t->end(); itr += 2) { + ASSERT_TRUE(*itr == strings[index]); + index += 2; + } +} \ No newline at end of file diff --git a/tests/ut/cpp/dataset/tensor_test.cc b/tests/ut/cpp/dataset/tensor_test.cc index 494d4b2329..615427ab92 100644 --- a/tests/ut/cpp/dataset/tensor_test.cc +++ b/tests/ut/cpp/dataset/tensor_test.cc @@ -111,17 +111,17 @@ TEST_F(MindDataTestTensorDE, CopyTensor) { int16_t o; t->GetItemAt(&o, {}); ASSERT_EQ(o, -66); - unsigned char *addr = t->StartAddr(); + unsigned char *addr = t->GetMutableBuffer(); auto t2 = std::make_shared(std::move(*t)); ASSERT_EQ(t2->shape(), TensorShape({})); ASSERT_EQ(t2->type(), DataType::DE_INT16); t2->GetItemAt(&o, {}); ASSERT_EQ(o, -66); - unsigned char *new_addr = t2->StartAddr(); + unsigned char *new_addr = t2->GetMutableBuffer(); ASSERT_EQ(addr, new_addr); ASSERT_EQ(t->shape(), TensorShape::CreateUnknownRankShape()); ASSERT_EQ(t->type(), DataType::DE_UNKNOWN); - ASSERT_EQ(t->StartAddr(), nullptr); + ASSERT_EQ(t->GetMutableBuffer(), nullptr); Status rc = t->GetItemAt(&o, {}); ASSERT_TRUE(rc.IsError()); } @@ -237,7 +237,7 @@ TEST_F(MindDataTestTensorDE, Strides) { void checkCvMat(TensorShape shape, DataType type) { std::shared_ptr t = std::make_shared(shape, type); cv::Mat m = t->mat(); - ASSERT_EQ(m.data, t->StartAddr()); + ASSERT_EQ(m.data, t->GetMutableBuffer()); ASSERT_EQ(static_cast(m.type()) & static_cast(CV_MAT_DEPTH_MASK), type.AsCVType()); if (shape.Rank() < 4) { if (shape.Rank() > 1) { @@ -311,15 +311,15 @@ TEST_F(MindDataTestTensorDE, CVTensorFromMat) { 
TEST_F(MindDataTestTensorDE, CVTensorAs) { std::shared_ptr t = std::make_shared(TensorShape({3, 2}), DataType(DataType::DE_FLOAT64)); t->Fill(2.2); - unsigned char *addr = t->StartAddr(); + unsigned char *addr = t->GetMutableBuffer(); std::shared_ptr t2 = std::make_shared(TensorShape({3, 2}), DataType(DataType::DE_FLOAT64)); t2->Fill(4.4); std::shared_ptr ctv = CVTensor::AsCVTensor(t); - ASSERT_EQ(t->StartAddr(), nullptr); - ASSERT_EQ(ctv->StartAddr(), addr); + ASSERT_EQ(t->GetMutableBuffer(), nullptr); + ASSERT_EQ(ctv->GetMutableBuffer(), addr); cv::Mat m = ctv->mat(); m = 2 * m; - ASSERT_EQ(ctv->StartAddr(), addr); + ASSERT_EQ(ctv->GetMutableBuffer(), addr); ASSERT_TRUE(*t2 == *ctv); MS_LOG(DEBUG) << *t2 << std::endl << *ctv; } diff --git a/tests/ut/python/dataset/test_datasets_textfileop.py b/tests/ut/python/dataset/test_datasets_textfileop.py index fdf4907404..b13a86d80b 100644 --- a/tests/ut/python/dataset/test_datasets_textfileop.py +++ b/tests/ut/python/dataset/test_datasets_textfileop.py @@ -41,8 +41,8 @@ def test_textline_dataset_totext(): count = 0 line = ["This is a text file.", "Another file.", "Be happy every day.", "End of file.", "Good luck to everyone."] for i in data.create_dict_iterator(): - str = nlp.as_text(i["text"]) - assert(str == line[count]) + str = i["text"].item().decode("utf8") + assert(str == line[count]) count += 1 assert(count == 5) @@ -68,8 +68,8 @@ def test_textline_dataset_repeat(): "This is a text file.", "Be happy every day.", "Good luck to everyone.", "This is a text file.", "Be happy every day.", "Good luck to everyone."] for i in data.create_dict_iterator(): - str = nlp.as_text(i["text"]) - assert(str == line[count]) + str = i["text"].item().decode("utf8") + assert(str == line[count]) count += 1 assert(count == 9) diff --git a/tests/ut/python/dataset/test_flat_map.py b/tests/ut/python/dataset/test_flat_map.py index 5790ab7e8c..0e2e01d3ea 100644 --- a/tests/ut/python/dataset/test_flat_map.py +++ b/tests/ut/python/dataset/test_flat_map.py @@ -26,7 +26,7 @@ def test_flat_map_1(): import mindspore.dataset.transforms.nlp.utils as nlp def flat_map_func(x): - data_dir = nlp.as_text(x[0]) + data_dir = x[0].item().decode('utf8') d = ds.ImageFolderDatasetV2(data_dir) return d @@ -47,12 +47,12 @@ def test_flat_map_2(): import mindspore.dataset.transforms.nlp.utils as nlp def flat_map_func_1(x): - data_dir = nlp.as_text(x[0]) + data_dir = x[0].item().decode('utf8') d = ds.ImageFolderDatasetV2(data_dir) return d def flat_map_func_2(x): - text_file = nlp.as_text(x[0]) + text_file = x[0].item().decode('utf8') d = ds.TextFileDataset(text_file) d = d.flat_map(flat_map_func_1) return d diff --git a/tests/ut/python/dataset/test_tensor_string.py b/tests/ut/python/dataset/test_tensor_string.py new file mode 100644 index 0000000000..7fd6300865 --- /dev/null +++ b/tests/ut/python/dataset/test_tensor_string.py @@ -0,0 +1,65 @@ +# Copyright 2019 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +import mindspore._c_dataengine as cde +import mindspore.dataset as ds +import pytest + +import numpy as np + + +def test_basic(): + x = np.array([["ab", "cde", "121"], ["x", "km", "789"]], dtype='S') + n = cde.Tensor(x) + arr = n.as_array() + # Round-tripping a 2-D byte-string array through the de Tensor should preserve shape and values. + np.testing.assert_array_equal(arr, x) + y = np.array(["ab", "cde"], dtype='S') + np.testing.assert_array_equal(cde.Tensor(y).as_array(), y) + + +def compare(strings): + arr = np.array(strings, dtype='S') + + def gen(): + yield arr, + + data = ds.GeneratorDataset(gen, column_names=["col"]) + + for d in data: + np.testing.assert_array_equal(d[0], arr) + + +def test_generator(): + compare(["ab"]) + compare(["ab", "cde", "121"]) + compare([["ab", "cde", "121"], ["x", "km", "789"]]) + + +def test_batching_strings(): + def gen(): + yield np.array(["ab", "cde", "121"], dtype='S'), + + data = ds.GeneratorDataset(gen, column_names=["col"]).batch(10) + + with pytest.raises(RuntimeError) as info: + for _ in data: + pass + assert "[Batch ERROR] Batch does not support" in str(info) + + +if __name__ == '__main__': + test_generator() + test_basic() + test_batching_strings() \ No newline at end of file
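
For reference, a minimal usage sketch of the string-tensor flow these tests exercise. It uses only the Python APIs demonstrated in this patch (GeneratorDataset, create_dict_iterator, numpy 'S' arrays); the generator body and the "text" column name are illustrative, not part of the change set:

    import numpy as np
    import mindspore.dataset as ds

    def gen():
        # Strings enter the pipeline as numpy byte-string ('S') arrays
        # and come back out of the iterator the same way.
        yield (np.array(["hello", "world"], dtype='S'),)

    data = ds.GeneratorDataset(gen, column_names=["text"])
    for row in data.create_dict_iterator():
        # Each element is numpy bytes; decode to obtain a Python str.
        print([s.decode("utf8") for s in row["text"]])

Note that, as test_batching_strings above asserts, string columns cannot be passed through batch() in this change; batching a string column raises a RuntimeError.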