/** * Copyright 2019 Huawei Technologies Co., Ltd * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_CORE_TENSOR_H_ #define MINDSPORE_CCSRC_MINDDATA_DATASET_CORE_TENSOR_H_ #include #include #include #include #include "./securec.h" #include "utils/log_adapter.h" #if defined(_WIN32) || defined(_WIN64) #undef HAVE_STDDEF_H #undef HAVE_STDLIB_H #endif #ifdef ENABLE_PYTHON #include "pybind11/numpy.h" #include "pybind11/pybind11.h" #include "pybind11/stl.h" #endif #include "utils/ms_utils.h" #include "minddata/dataset/core/constants.h" #include "minddata/dataset/core/data_type.h" #include "minddata/dataset/core/tensor_shape.h" #include "minddata/dataset/util/status.h" #ifndef ENABLE_ANDROID #include "proto/example.pb.h" #else #include "minddata/dataset/include/de_tensor.h" #endif #ifdef ENABLE_PYTHON namespace py = pybind11; #endif namespace mindspore { #ifdef ENABLE_ANDROID namespace tensor { class DETensor; } // namespace tensor #endif namespace dataset { class Tensor; template class Allocator; using CharAllocPtr = std::unique_ptr>; using TensorAllocPtr = std::shared_ptr>; // An allocator shared_ptr for Tensors using offset_t = uint32_t; // type of offset values to store strings locations using TensorPtr = std::shared_ptr; class Tensor { public: Tensor() = delete; Tensor(const Tensor &other) = delete; Tensor &operator=(const Tensor &other) = delete; /// Create a tensor using shape and type. 
/// This constructor should not be used directly, use CreateFromTensor instead
  /// \note The shape and type information should be known and valid
  /// \note The constructor does not allocate data
  /// \param shape TensorShape
  /// \param type DataType
  Tensor(const TensorShape &shape, const DataType &type);

  /// Move constructor
  /// \param other Tensor to be moved
  Tensor(Tensor &&other) noexcept;

  /// Move assignment operator
  /// \param other Tensor to be moved
  Tensor &operator=(Tensor &&other) noexcept;

  /// Create a numeric tensor with type and shape. Items of the tensor would be uninitialized.
  /// \param[in] shape shape of the output tensor
  /// \param[in] type type of the output tensor
  /// \param[out] out Generated tensor
  /// \return Status code
  static Status CreateEmpty(const TensorShape &shape, const DataType &type, TensorPtr *out);

  /// Create a numeric tensor from a pointer in memory. Length of the source data is determined from the shape and
  /// type. Data will be copied into the new created tensor.
  /// \param[in] shape shape of the output tensor
  /// \param[in] type type of the output tensor
  /// \param[in] src pointer to the source data
  /// \param[out] out Generated tensor
  /// \return Status code
  static Status CreateFromMemory(const TensorShape &shape, const DataType &type, const uchar *src, TensorPtr *out);

  /// Create a tensor from a pointer in memory and length. Data will be copied into the new created tensor.
/// \param[in] shape shape of the output tensor /// \param[in] type type of the output tensor /// \param[in] src pointer to the source data /// \param[in] length length of the src data /// \param[out] out Generated tensor /// \return Status code static Status CreateFromMemory(const TensorShape &shape, const DataType &type, const uchar *src, const dsize_t &length, TensorPtr *out); /// Create a copy of the input tensor /// \param[in] in original tensor to be copied /// \param[out] out output tensor to be generated /// \return Status static Status CreateFromTensor(const TensorPtr &in, TensorPtr *out) { return CreateFromMemory(in->shape(), in->type(), in->GetBuffer(), in->SizeInBytes(), out); } #ifdef ENABLE_PYTHON /// Create a Tensor from a given py::array /// \param[in] arr py::array /// \param[out] out Created tensor /// \return Status Code static Status CreateFromNpArray(const py::array &arr, TensorPtr *out); #endif #ifndef ENABLE_ANDROID /// Create a tensor of type DE_STRING from a BytesList. /// \param[in] bytes_list protobuf's Bytelist /// \param[in] shape shape of the outout tensor /// \param[out] out created Tensor /// \return Status Code static Status CreateFromByteList(const dataengine::BytesList &bytes_list, const TensorShape &shape, TensorPtr *out); /// Create a tensor of type UINT8 or INT8 from a BytesList. /// The tensor will be padded with ' ' to reach the required pad_size. /// \param[in] bytes_list protobuf's Bytelist /// \param[in] shape shape of the output tensor /// \param[in] type type of created tensor. Should be DE_UINT8 or INT8 /// \param[in] pad_size The size of the tensor after padding /// \param[out] out created Tensor /// \return Status Code static Status CreateFromByteList(const dataengine::BytesList &bytes_list, const TensorShape &shape, const DataType &type, dsize_t pad_size, TensorPtr *out); #endif /// Create a Tensor from a given list of values. /// \tparam type of the values to be inserted. 
/// \param[in] items elements of the tensor /// \param[in] shape shape of the output tensor /// \param[out] out output argument to hold the created Tensor /// \return Status Code template static Status CreateFromVector(const std::vector &items, const TensorShape &shape, TensorPtr *out) { CHECK_FAIL_RETURN_UNEXPECTED( items.size() == shape.NumOfElements(), "Number of elements in the vector does not match the number of elements of the shape required"); DataType type = DataType::FromCType(); // if items is empty, items_ptr would be nullptr. CreateFromMemory will handle this case. auto items_ptr = reinterpret_cast(&items[0]); return CreateFromMemory(shape, type, items_ptr, out); } /// Create a 1D Tensor from a given list of values. /// \tparam type of the values to be inserted. /// \param[in] items elements of the tensor /// \param[out] out output argument to hold the created Tensor /// \return Status Code template static Status CreateFromVector(const std::vector &items, TensorPtr *out) { return CreateFromVector(items, TensorShape({static_cast(items.size())}), out); } /// Create a numeric scalar Tensor from the given value. /// \tparam T type of value /// \param[in] item value /// \param[out] out Created tensor /// \return Status code template static Status CreateScalar(const T &item, TensorPtr *out) { DataType type = DataType::FromCType(); auto item_ptr = reinterpret_cast(&item); return CreateFromMemory(TensorShape::CreateScalar(), type, item_ptr, out); } /// Create a tensor from a binary file on disk. /// \param[in] path file to be read /// \param[out] out Created Tensor /// \return Status code static Status CreateFromFile(const std::string &path, TensorPtr *out); /// Destruct the tensor and release the memory using the allocator virtual ~Tensor(); /// Equality operator. 
compares tensor shape, type and data /// \param[in] rhs Tensor to be compared with /// \return bool bool operator==(const Tensor &rhs) const; bool operator!=(const Tensor &rhs) const { return !((*this) == rhs); } /// Get item located at `index`, caller needs to provide the type. /// \tparam T /// \param[in] index vector /// \return return the item specified at index template Status GetItemAt(T *o, const std::vector &index) const; /// Get string located at `index`. /// \param[in] index vector /// \return return std::string_view specified at index Status GetItemAt(std::string_view *o, const std::vector &index) const; template Status GetUnsignedIntAt(T *o, const std::vector &index) const; template Status GetSignedIntAt(T *o, const std::vector &index) const; template Status GetFloatAt(T *o, const std::vector &index) const; /// set item at location specified by index /// \tparam `T` /// \param[in] index /// \param[in] value of type `T` template Status SetItemAt(const std::vector &index, const T &value) { T *ptr = nullptr; RETURN_IF_NOT_OK(GetItemPtr(&ptr, index)); *ptr = value; return Status::OK(); } /// set string item at location specified by index /// \param[in] index /// \param[in] value of type std::string Status SetItemAt(const std::vector &index, const std::string &value) { RETURN_UNEXPECTED_IF_NULL(data_); uchar *ptr = nullptr; offset_t length = 0; RETURN_IF_NOT_OK(GetItemPtr(&ptr, index, &length)); if (value.length() != length) { RETURN_STATUS_UNEXPECTED("Length of the new string does not match the item."); } memcpy_s(reinterpret_cast(ptr), length, value.c_str(), length); return Status::OK(); } /// fill tensor with Zeros. Does not support strings. 
Status Zero() { CHECK_FAIL_RETURN_UNEXPECTED(type_ != DataType::DE_STRING, "Cannot use Zero on tensor of strings.."); dsize_t size = SizeInBytes(); CHECK_FAIL_RETURN_UNEXPECTED(memset_sp(GetMutableBuffer(), size, 0, size) == 0, "Failed to fill tensor with zeroes."); return Status::OK(); } /// Fill all elements in the Tensor with the given value of type `T`. Does not support strings. /// \tparam T /// \param value[in] template Status Fill(const T &value) { CHECK_FAIL_RETURN_UNEXPECTED(type_ != DataType::DE_STRING, "Cannot use fill on tensor of strings."); int64_t cellSize = type_.SizeInBytes(); if ((data_ != nullptr) && type_.IsCompatible()) { for (dsize_t i = 0; i < Size(); i++) { CHECK_FAIL_RETURN_UNEXPECTED(memcpy_s((data_ + i * cellSize), cellSize, &value, cellSize) == 0, "memcpy err"); } return Status::OK(); } else { std::string err; err += (data_ == nullptr) ? "data_ is nullptr \t" : ""; err += type_.IsCompatible() ? "data type not compatible\t" : ""; return Status(StatusCode::kUnexpectedError, err); } } /// Getter function for shape /// \return const TensorShape &shape() const { return shape_; } /// Check if tensor has data /// \return bool - true if tensor is empty bool HasData() const { return data_ != nullptr; } /// Reshape the tensor. The given shape should have the same number of elements in the Tensor /// \param shape virtual Status Reshape(const TensorShape &shape); /// \return number of elements in this tensor dsize_t Size() const { return shape().NumOfElements(); } /// \return the number of bytes this tensor is needs dsize_t SizeInBytes() const { if (data_end_ == nullptr) return type_.SizeInBytes() * shape_.NumOfElements(); return data_end_ - data_; } /// \return the rank of the tensor dsize_t Rank() const { return shape().Rank(); } /// Get the starting memory address as a constant for the data of the tensor. This potentially /// drives an allocation if the data area. 
/// \return const unsigned char* const unsigned char *GetBuffer() const { return data_; } /// Getter of the type /// \return DataType type() const { return type_; } /// Provide stream operator for displaying it /// \param output stream /// \param so the Tensor object to be printed /// \return output stream friend std::ostream &operator<<(std::ostream &out, const Tensor &so) { so.Print(out); return out; } /// Invalidate this Tensor by setting the type and shape to unknown and MData to null. /// Calling this method will make the Tensor and its data inaccessible, use it with caution. void Invalidate(); /// Copy input tensor into self at the location index. /// Index is a vector of axises which can be incomplete: /// Ex: shape <2,3>, inserting into index {0} will replace the first row. index {1,2} will replace the last cell. /// \param index /// \param input /// \param partial_insert: boolean to determine if insertion along the full axis is enforced /// \return Status code Status InsertTensor(const std::vector &index, const std::shared_ptr &input, const bool partial_insert = false); /// Find the address of the given index. Used in InsertTensor. /// Example: /// Tensor t= [[1,2],[3,4]] , StartAddrOfIndex({0}) -> &1 /// \param index incomplete index /// \param output: startAddrofIndex /// \param output: remaining /// \return Status code Status StartAddrOfIndex(std::vector ind, uchar **start_addr_of_index, TensorShape *remaining); /// Expand the shape of the Tensor with one extra dimension. /// For example, if the shape is <512,512,3>: /// *- ExpandDim(0) gives: <1,512,512,3> /// *- ExpandDim(1) gives: <512,1,512,3> /// *- ExpandDim(3) gives: <512,512,3,1> /// \param axis location of the dim virtual Status ExpandDim(const dsize_t &axis); virtual void Squeeze(); /// Calculates the strides of the Tensor /// Ex: Tensor of shape <4,2,2> and type DE_UINT8 (1 byte) /// The strides will be {6,2,1}. 
/// Ex: Tensor of shape <4,2,2> and type DE_UINT32 (4 byte) /// The strides will be {24,8,4}. /// \return vector of integers std::vector Strides() const; std::string ToString() { std::stringstream ss; this->Print(ss); return ss.str(); } /// Handle negative indices. static inline dsize_t HandleNeg(dsize_t index, dsize_t length) { return (index < 0) ? (index + length) : index; } /// Slice tensor bases on the given indicies. Copy the sliced data into out tensor. Only rank1 tensors are supported. /// Based on the type of tensor, SliceNumeric or SliceString will be called /// \param[out] out Tensor /// \param[in] indices vector of indices /// \return Status error code Status Slice(TensorPtr *out, const std::vector &indices); /// Slice numeric tensors. Status SliceNumeric(TensorPtr *out, const std::vector &indices); /// Slice string tensors Status SliceString(TensorPtr *out, const std::vector &indices); #ifdef ENABLE_PYTHON /// Constructs numpy array from input tensor /// \param[in] data this data is the location of python data /// \return Status code Status GetDataAsNumpy(py::array *data); Status GetDataAsNumpyStrings(py::array *data); static Status GetBufferInfo(Tensor *t, py::buffer_info *out); #endif /// TensorIterator is a linear iterator that can be used to iterate over the elements of the Tensor /// The order elements is as the memory layout (i.e., row-major) [[1,2,3],[4,5,6] --> 1,2,3,4,5,6 /// \tparam T type of values in the Tensor Iterator template class TensorIterator { public: using iterator_category = std::random_access_iterator_tag; using value_type = T; using difference_type = ptrdiff_t; using pointer = T *; using reference = T &; explicit TensorIterator(uchar *ptr = nullptr) { ptr_ = reinterpret_cast(ptr); } TensorIterator(const TensorIterator &raw_iterator) { ptr_ = raw_iterator.ptr_; } ~TensorIterator() = default; TensorIterator &operator=(const TensorIterator &rhs) { ptr_ = rhs.ptr_; return *this; } TensorIterator &operator=(T *rhs) { ptr_ = rhs; 
return *this; } bool operator==(const TensorIterator &rhs) { return ptr_ == rhs.ptr_; } bool operator!=(const TensorIterator &rhs) { return !(*this == rhs); } operator bool() const { return ptr_ != nullptr; } T &operator*() { return *ptr_; } const T &operator*() const { return *ptr_; } T *operator->() { return ptr_; } TensorIterator &operator+=(const ptrdiff_t &inc) { ptr_ += inc; return *this; } TensorIterator &operator-=(const ptrdiff_t &inc) { ptr_ -= inc; return *this; } TensorIterator &operator++() { ++ptr_; return *this; } TensorIterator &operator--() { --ptr_; return *this; } TensorIterator operator++(int) { auto temp(*this); ++ptr_; return temp; } TensorIterator operator--(int) { auto temp(*this); --ptr_; return temp; } TensorIterator operator+(const ptrdiff_t &inc) { auto oldPtr = ptr_; ptr_ += inc; auto temp(*this); ptr_ = oldPtr; return temp; } TensorIterator operator-(const ptrdiff_t &inc) { auto oldPtr = ptr_; ptr_ -= inc; auto temp(*this); ptr_ = oldPtr; return temp; } protected: T *ptr_; }; // Specialization of TensorIterator for strings. It returns std::string_view for every item. 
// \tparam DUMMY, used to mbe able to specialize the inner class template class TensorIterator { public: using iterator_category = std::random_access_iterator_tag; using value_type = std::string_view; using difference_type = ptrdiff_t; using pointer = std::string_view *; using reference = std::string_view &; explicit TensorIterator(uchar *data = nullptr, dsize_t index = 0) { data_ = reinterpret_cast(data); index_ = index; } TensorIterator(const TensorIterator &raw_iterator) { data_ = raw_iterator.data_; index_ = raw_iterator.index_; } ~TensorIterator() = default; bool operator==(const TensorIterator &rhs) { return data_ == rhs.data_ && index_ == rhs.index_; } bool operator!=(const TensorIterator &rhs) { return !(*this == rhs); } operator bool() const { return data_ != nullptr; } std::string_view operator*() const { auto offset_ = reinterpret_cast(data_); offset_t start = offset_[index_]; return std::string_view{data_ + start}; } TensorIterator &operator+=(const dsize_t &inc) { index_ += inc; return *this; } TensorIterator &operator-=(const dsize_t &inc) { index_ -= inc; return *this; } TensorIterator &operator++() { ++index_; return *this; } TensorIterator &operator--() { --index_; return *this; } TensorIterator operator++(int) { auto temp(*this); ++index_; return temp; } TensorIterator operator--(int) { auto temp(*this); --index_; return temp; } TensorIterator operator+(const dsize_t &inc) { auto oldPtr = index_; index_ += inc; auto temp(*this); index_ = oldPtr; return temp; } TensorIterator operator-(const dsize_t &inc) { auto oldPtr = index_; index_ -= inc; auto temp(*this); index_ = oldPtr; return temp; } protected: dsize_t index_; const char *data_; }; /// Return a TensorIterator that points to the start of the Tensor. 
/// It's the user responsibility to use the correct type that matches the Tensor type /// \tparam T The type of values in the Tensor /// \return TensorIterator template TensorIterator begin() { return TensorIterator(data_); } /// Return a linear iterator that points to the place after the last element of the Tensor. /// \tparam T The type of values in the Tensor /// \return TensorIterator template TensorIterator end() { return TensorIterator(data_end_); } /// Copies the last dimension at `index` from Tensor `src` to this Tensor. /// \param[in] src Tensor /// \param[in] index vector to the start of the dimension. The last dim should be 0 /// \return Status Status CopyLastDimAt(const std::shared_ptr &src, const std::vector &index); protected: /// Allocate memory for the tensor using the data_allocator /// \param[in] length number of bytes to be allocated /// \return Error Status Status AllocateBuffer(const dsize_t &length); /// Get the starting memory address for the data of the tensor. This potentially /// drives an allocation if the data is null. /// \return unsigned char* unsigned char *GetMutableBuffer() { return data_; } /// A function that prints Tensor recursively, first called by print /// \param[in] out /// \param[in] cur_dim /// \param[in] cur_index void PrintRecursive(std::ostream &out, int32_t cur_dim, const std::vector &cur_index) const; /// A function that prints info about the tensor /// \param[out] out output stream void Print(std::ostream &out) const; /// A function that print the value as specified by its index /// \param[in] index vector representing the index /// \param[out] out void PrintItemAt(const std::vector &index, std::ostream &out) const; /// Get pointer to item located at `index`, caller needs to provide the type. 
/// \tparam T /// \param[in] index vector /// \return return a pointer to the item specified at index of type `T` template Status GetItemPtr(T **, const std::vector &index) const; /// Get pointer to string located at `index` and the length of string /// \param[in] index vector /// \return return a pointer to the string specified at index and the length of the string Status GetItemPtr(uchar **, const std::vector &index, offset_t *length = nullptr) const; /// Given a flat index of an item string, return the start and length of the item /// \param[in] index flat index of the item /// \param[out] start address of the ths string /// \param[out] length of the string Status GetStringAt(dsize_t index, uchar **string_start, offset_t *length) const; /// Skip the offsets and returns the start of the buffer where the real strings is stored. Caller needs to check if /// the tensor's type is a string, otherwise undefined address would be returned. \return address of the first string /// of the tensor. 
uchar *GetStringsBuffer() const { return data_ + kOffsetSize * shape_.NumOfElements() + kOffsetSize; } /// all access to shape_ should be via shape TensorShape shape_; /// data type of tensor DataType type_; /// pointer to the start of the physical data unsigned char *data_; /// An allocator for data_ CharAllocPtr data_allocator_; /// pointer to the end of the physical data unsigned char *data_end_ = nullptr; private: #ifdef ENABLE_ANDROID friend class tensor::DETensor; #endif /// Copy raw data of a array based on shape and strides to the destination pointer /// \param dst [out] Pointer to the destination array where the content is to be copied /// \param[in] src Pointer to the source of strided array to be copied /// \param[in] shape shape of the source array /// \param[in] strides strides of the source array /// \param[in] type_size number of bytes needed to store one array element's type /// \return Status Code static Status CopyStridedArray(unsigned char *dst, unsigned char *src, std::vector shape, std::vector strides, uint8_t type_size); /// const of the size of the offset variable static constexpr uint8_t kOffsetSize = sizeof(offset_t); #ifdef ENABLE_PYTHON /// Helper function to create a tensor from Numpy array of strings /// \param[in] arr Numpy array /// \param[out] out Created Tensor /// \return Status static Status CreateFromNpString(py::array arr, TensorPtr *out); #endif }; template <> inline Tensor::TensorIterator Tensor::end() { return TensorIterator(data_, shape_.NumOfElements()); } /// Create a Tensor from a given list of strings. /// @note: The memory layout of a Tensor of strings consists of the Offset_array followed by the strings. /// The offset array will store one extra value to find the length of the last string. 
/// OFFSET_1, OFFSET_2, ..., OFFSET_n+1, STRING_1, STRING_2, ..., STRING_n /// The value of each offset is the start index of the corresponding string /// Offsets is of type offset_t /// strings will ne null-terminated /// example: Tensor(['abc', 'de'], shape={2}, type=DE_STRING) /// |----------------------------------------------------------------| /// | OFFSET ARRAY | STRINGS | /// | bytes 0-3 | bytes 3-6 | bytes 7-10 | bytes 11-14 | bytes 15-17 | /// | 11 | 15 | 18 | abc\0 | de\0 | /// |----------------------------------------------------------------| /// \param[in] items elements of the tensor /// \param[in] shape shape of the output tensor /// \param[out] out output argument to hold the created Tensor /// \return Status Code template <> inline Status Tensor::CreateFromVector(const std::vector &items, const TensorShape &shape, TensorPtr *out) { CHECK_FAIL_RETURN_UNEXPECTED( items.size() == shape.NumOfElements(), "Number of elements in the vector does not match the number of elements of the shape required"); const TensorAlloc *alloc = GlobalContext::Instance()->tensor_allocator(); *out = std::allocate_shared(*alloc, TensorShape({static_cast(items.size())}), DataType(DataType::DE_STRING)); if (items.size() == 0) { if (shape.known()) { return (*out)->Reshape(shape); } } auto length_sum = [](dsize_t sum, const std::string &s) { return s.length() + sum; }; dsize_t total_length = std::accumulate(items.begin(), items.end(), 0, length_sum); // total bytes needed = offset array + strings // offset array needs to store one offset var per element + 1 extra to get the length of the last string. 
// strings will be null-terminated --> need 1 extra byte per element dsize_t num_bytes = (kOffsetSize + 1) * (*out)->shape_.NumOfElements() + kOffsetSize + total_length; (*out)->AllocateBuffer(num_bytes); auto offset_arr = reinterpret_cast((*out)->data_); uchar *buf = (*out)->GetStringsBuffer(); offset_t offset = buf - (*out)->data_; // the first string will start here uint32_t i = 0; for (const auto &str : items) { // insert the start index of the string. offset_arr[i++] = offset; // total bytes are reduced by kOffsetSize num_bytes -= kOffsetSize; // insert actual string int ret_code = memcpy_s((*out)->data_ + offset, num_bytes, common::SafeCStr(str), str.length() + 1); if (ret_code != 0) MS_LOG(ERROR) << "Cannot copy string into Tensor"; // next string will be stored right after the current one. offset = offset + str.length() + 1; // total bytes are reduced by the length of the string num_bytes -= str.length() + 1; } // store one more offset value so we can get the length of the last string // length[last_element] = offset_arr[last_element + 1] - offset_arr[last_element] offset_arr[i] = offset; (*out)->data_end_ = (*out)->data_ + offset_arr[i]; MS_ASSERT(num_bytes == 0); if (shape.known()) { RETURN_IF_NOT_OK((*out)->Reshape(shape)); } return Status::OK(); } /// Create a string scalar Tensor from the given value. /// \param[in] item value /// \param[out] out Created tensor /// \return Status code template <> inline Status Tensor::CreateScalar(const std::string &item, TensorPtr *out) { return CreateFromVector({item}, TensorShape::CreateScalar(), out); } } // namespace dataset } // namespace mindspore #endif // MINDSPORE_CCSRC_MINDDATA_DATASET_CORE_TENSOR_H_