You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

tensor.h 30 kB

5 years ago
5 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803
  1. /**
  2. * Copyright 2019-2021 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_CORE_TENSOR_H_
  17. #define MINDSPORE_CCSRC_MINDDATA_DATASET_CORE_TENSOR_H_
#include <deque>
#include <memory>
#include <numeric>
#include <sstream>
#include <string>
#include <string_view>
#include <vector>
  22. #include "./securec.h"
  23. #ifndef ENABLE_ANDROID
  24. #include "utils/log_adapter.h"
  25. #else
  26. #include "mindspore/lite/src/common/log_adapter.h"
  27. #endif
  28. #if defined(_WIN32) || defined(_WIN64)
  29. #undef HAVE_STDDEF_H
  30. #undef HAVE_STDLIB_H
  31. #endif
  32. #ifdef ENABLE_PYTHON
  33. #include "pybind11/numpy.h"
  34. #include "pybind11/pybind11.h"
  35. #include "pybind11/stl.h"
  36. #endif
  37. #include "utils/ms_utils.h"
  38. #include "include/api/status.h"
  39. #include "minddata/dataset/core/constants.h"
  40. #include "minddata/dataset/core/data_type.h"
  41. #include "minddata/dataset/core/tensor_helpers.h"
  42. #include "minddata/dataset/core/tensor_shape.h"
  43. #include "minddata/dataset/core/de_tensor.h"
  44. #ifndef ENABLE_ANDROID
  45. #include "proto/example.pb.h"
  46. #endif
  47. #ifdef ENABLE_PYTHON
  48. namespace py = pybind11;
  49. #endif
  50. namespace mindspore {
  51. namespace dataset {
// Forward declarations so the aliases below can be stated without including allocator.h here.
class Tensor;
template <typename T>
class Allocator;
// Allocator handle used for the raw byte buffer owned by a Tensor.
using CharAllocPtr = std::unique_ptr<Allocator<unsigned char>>;
using TensorAllocPtr = std::shared_ptr<Allocator<Tensor>>;  // An allocator shared_ptr for Tensors
using offset_t = uint32_t;                                  // type of offset values to store strings locations
using TensorPtr = std::shared_ptr<Tensor>;
/// \brief In-memory data container for the dataset pipeline: a typed, shaped, contiguous buffer
/// (see shape_/type_/data_ members). Copy is disabled; ownership moves via move operations or
/// is shared through TensorPtr.
class Tensor {
 public:
  Tensor() = delete;
  // Copying is disabled: use CreateFromTensor to make a deep copy, or TensorPtr to share.
  Tensor(const Tensor &other) = delete;
  Tensor &operator=(const Tensor &other) = delete;

  /// Create a tensor using shape and type. This constructor should not be used directly, use CreateFromTensor instead
  /// \note The shape and type information should be known and valid
  /// \note The constructor does not allocate data
  /// \param shape TensorShape
  /// \param type DataType
  Tensor(const TensorShape &shape, const DataType &type);

  /// Move constructor
  /// \param other Tensor to be moved
  Tensor(Tensor &&other) noexcept;

  /// Move assignment operator
  /// \param other Tensor to be moved
  Tensor &operator=(Tensor &&other) noexcept;

  /// Create a numeric tensor with type and shape. Items of the tensor would be uninitialized.
  /// \param[in] shape shape of the output tensor
  /// \param[in] type type of the output tensor
  /// \param[out] out Generated tensor
  /// \return Status code
  static Status CreateEmpty(const TensorShape &shape, const DataType &type, TensorPtr *out);

  /// Create a numeric tensor from a pointer in memory. Length of the source data is determined from the shape and type.
  /// Data will be copied into the new created tensor.
  /// \param[in] shape shape of the output tensor
  /// \param[in] type type of the output tensor
  /// \param[in] src pointer to the source data
  /// \param[out] out Generated tensor
  /// \return Status code
  static Status CreateFromMemory(const TensorShape &shape, const DataType &type, const uchar *src, TensorPtr *out);

  /// Create a tensor from a pointer in memory and length. Data will be copied into the new created tensor.
  /// \param[in] shape shape of the output tensor
  /// \param[in] type type of the output tensor
  /// \param[in] src pointer to the source data
  /// \param[in] length length of the src data
  /// \param[out] out Generated tensor
  /// \return Status code
  static Status CreateFromMemory(const TensorShape &shape, const DataType &type, const uchar *src,
                                 const dsize_t &length, TensorPtr *out);

  /// Create a copy of the input tensor
  /// \param[in] in original tensor to be copied
  /// \param[out] out output tensor to be generated
  /// \return Status
  static Status CreateFromTensor(const TensorPtr &in, TensorPtr *out) {
    // Deep copy: SizeInBytes() covers both numeric and string layouts.
    return CreateFromMemory(in->shape(), in->type(), in->GetBuffer(), in->SizeInBytes(), out);
  }

#ifdef ENABLE_PYTHON
  /// Create a Tensor from a given py::array
  /// \param[in] arr py::array
  /// \param[out] out Created tensor
  /// \return Status Code
  static Status CreateFromNpArray(const py::array &arr, TensorPtr *out);
#endif

#ifndef ENABLE_ANDROID
  /// Create a tensor of type DE_STRING from a BytesList.
  /// \param[in] bytes_list protobuf's Bytelist
  /// \param[in] shape shape of the output tensor
  /// \param[out] out created Tensor
  /// \return Status Code
  static Status CreateFromByteList(const dataengine::BytesList &bytes_list, const TensorShape &shape, TensorPtr *out);

  /// Create a tensor of type UINT8 or INT8 from a BytesList.
  /// The tensor will be padded with ' ' to reach the required pad_size.
  /// \param[in] bytes_list protobuf's Bytelist
  /// \param[in] shape shape of the output tensor
  /// \param[in] type type of created tensor. Should be DE_UINT8 or INT8
  /// \param[in] pad_size The size of the tensor after padding
  /// \param[out] out created Tensor
  /// \return Status Code
  static Status CreateFromByteList(const dataengine::BytesList &bytes_list, const TensorShape &shape,
                                   const DataType &type, dsize_t pad_size, TensorPtr *out);
#endif
  131. /// Create a Tensor from a given list of values.
  132. /// \tparam type of the values to be inserted.
  133. /// \param[in] items elements of the tensor
  134. /// \param[in] shape shape of the output tensor
  135. /// \param[out] out output argument to hold the created Tensor
  136. /// \return Status Code
  137. template <typename T>
  138. static Status CreateFromVector(const std::vector<T> &items, const TensorShape &shape, TensorPtr *out) {
  139. CHECK_FAIL_RETURN_UNEXPECTED(
  140. items.size() == shape.NumOfElements(),
  141. "Number of elements in the vector does not match the number of elements of the shape required");
  142. DataType type = DataType::FromCType<T>();
  143. // if items is empty, items_ptr would be nullptr. CreateFromMemory will handle this case.
  144. auto items_ptr = reinterpret_cast<const uchar *>(&items[0]);
  145. return CreateFromMemory(shape, type, items_ptr, out);
  146. }
  147. /// Create a 1D Tensor from a given list of values.
  148. /// \tparam type of the values to be inserted.
  149. /// \param[in] items elements of the tensor
  150. /// \param[out] out output argument to hold the created Tensor
  151. /// \return Status Code
  152. template <typename T>
  153. static Status CreateFromVector(const std::vector<T> &items, TensorPtr *out) {
  154. return CreateFromVector(items, TensorShape({static_cast<dsize_t>(items.size())}), out);
  155. }
  156. /// Create a 1D boolean Tensor from a given list of boolean values.
  157. /// \param[in] items elements of the tensor
  158. /// \param[in] shape shape of the output tensor
  159. /// \param[out] out output argument to hold the created Tensor
  160. /// \return Status Code
  161. static Status CreateFromVector(const std::vector<bool> &items, const TensorShape &shape, TensorPtr *out) {
  162. std::vector<uint8_t> temp(items.begin(), items.end());
  163. RETURN_IF_NOT_OK(CreateFromVector(temp, shape, out));
  164. (*out)->type_ = DataType(DataType::DE_BOOL);
  165. return Status::OK();
  166. }
  167. /// Create a numeric scalar Tensor from the given value.
  168. /// \tparam T type of value
  169. /// \param[in] item value
  170. /// \param[out] out Created tensor
  171. /// \return Status code
  172. template <typename T>
  173. static Status CreateScalar(const T &item, TensorPtr *out) {
  174. DataType type = DataType::FromCType<T>();
  175. auto item_ptr = reinterpret_cast<const uchar *>(&item);
  176. return CreateFromMemory(TensorShape::CreateScalar(), type, item_ptr, out);
  177. }
  /// Create a tensor from a binary file on disk.
  /// \param[in] path file to be read
  /// \param[out] out Created Tensor
  /// \return Status code
  static Status CreateFromFile(const std::string &path, TensorPtr *out);

  /// Destruct the tensor and release the memory using the allocator
  virtual ~Tensor();

  /// Equality operator. compares tensor shape, type and data
  /// \param[in] rhs Tensor to be compared with
  /// \return bool
  bool operator==(const Tensor &rhs) const;

  bool operator!=(const Tensor &rhs) const { return !((*this) == rhs); }

  /// Get item located at `index`, caller needs to provide the type.
  /// \tparam T
  /// \param[in] index vector<dsize_t>
  /// \return return the item specified at index
  template <typename T>
  Status GetItemAt(T *o, const std::vector<dsize_t> &index) const;

  /// Get string located at `index`.
  /// \param[in] index vector<dsize_t>
  /// \return return std::string_view specified at index
  Status GetItemAt(std::string_view *o, const std::vector<dsize_t> &index) const;

  // NOTE(review): the three accessors below presumably read an element of the matching kind
  // (unsigned/signed/float) and convert it to T — confirm against the tensor.cc implementation.
  template <typename T>
  Status GetUnsignedIntAt(T *o, const std::vector<dsize_t> &index) const;

  template <typename T>
  Status GetSignedIntAt(T *o, const std::vector<dsize_t> &index) const;

  template <typename T>
  Status GetFloatAt(T *o, const std::vector<dsize_t> &index) const;
  206. /// set item at location specified by index
  207. /// \tparam `T`
  208. /// \param[in] index
  209. /// \param[in] value of type `T`
  210. template <typename T>
  211. Status SetItemAt(const std::vector<dsize_t> &index, const T &value) {
  212. T *ptr = nullptr;
  213. RETURN_IF_NOT_OK(GetItemPtr<T>(&ptr, index));
  214. *ptr = value;
  215. return Status::OK();
  216. }
  217. /// set string item at location specified by index
  218. /// \param[in] index
  219. /// \param[in] value of type std::string
  220. Status SetItemAt(const std::vector<dsize_t> &index, const std::string &value) {
  221. RETURN_UNEXPECTED_IF_NULL(data_);
  222. uchar *ptr = nullptr;
  223. offset_t length = 0;
  224. RETURN_IF_NOT_OK(GetItemPtr(&ptr, index, &length));
  225. if (value.length() != length) {
  226. RETURN_STATUS_UNEXPECTED("Length of the new string does not match the item.");
  227. }
  228. memcpy_s(reinterpret_cast<char *>(ptr), length, value.c_str(), length);
  229. return Status::OK();
  230. }
  /// Fill the tensor's buffer with zeros. Does not support tensors of strings.
  /// \return Status code
  Status Zero() {
    CHECK_FAIL_RETURN_UNEXPECTED(type_ != DataType::DE_STRING, "Cannot use Zero on tensor of strings..");
    dsize_t size = SizeInBytes();
    // NOTE(review): memset_sp appears to be a bounds-checked (securec-style) memset where a
    // non-zero return indicates failure — confirm against its declaration.
    CHECK_FAIL_RETURN_UNEXPECTED(memset_sp(GetMutableBuffer(), size, 0, size) == 0,
                                 "Failed to fill tensor with zeroes.");
    return Status::OK();
  }
  239. /// Fill all elements in the Tensor with the given value of type `T`. Does not support strings.
  240. /// \tparam T
  241. /// \param value[in]
  242. template <typename T>
  243. Status Fill(const T &value) {
  244. CHECK_FAIL_RETURN_UNEXPECTED(type_ != DataType::DE_STRING, "Cannot use fill on tensor of strings.");
  245. int64_t cellSize = type_.SizeInBytes();
  246. if ((data_ != nullptr) && type_.IsCompatible<T>()) {
  247. for (dsize_t i = 0; i < Size(); i++) {
  248. CHECK_FAIL_RETURN_UNEXPECTED(memcpy_s((data_ + i * cellSize), cellSize, &value, cellSize) == 0, "memcpy err");
  249. }
  250. return Status::OK();
  251. } else {
  252. std::string err;
  253. err += (data_ == nullptr) ? "data_ is nullptr \t" : "";
  254. err += type_.IsCompatible<T>() ? "data type not compatible\t" : "";
  255. return Status(StatusCode::kMDUnexpectedError, err);
  256. }
  257. }
  /// Getter function for shape
  /// \return const reference to the tensor's shape
  const TensorShape &shape() const { return shape_; }

  /// Check if tensor has data
  /// \return bool - true if tensor is not empty
  bool HasData() const { return data_ != nullptr; }

  /// Reshape the tensor. The given shape should have the same number of elements in the Tensor
  /// \param shape
  virtual Status Reshape(const TensorShape &shape);

  /// \return number of elements in this tensor
  dsize_t Size() const { return shape().NumOfElements(); }

  /// \return the number of bytes this tensor needs
  dsize_t SizeInBytes() const {
    // Before the buffer is allocated, data_end_ is null; fall back to the size implied
    // by the element type and shape.
    if (data_end_ == nullptr) return type_.SizeInBytes() * shape_.NumOfElements();
    return data_end_ - data_;
  }

  /// \return the rank of the tensor
  dsize_t Rank() const { return shape().Rank(); }

  /// Get the starting memory address (read-only) of the tensor's data.
  /// Does not allocate; returns nullptr when no buffer has been allocated yet.
  /// \return const unsigned char*
  const unsigned char *GetBuffer() const { return data_; }

  /// Getter of the type
  /// \return DataType of the tensor elements
  DataType type() const { return type_; }

  /// Provide stream operator for displaying it
  /// \param out output stream
  /// \param so the Tensor object to be printed
  /// \return output stream
  friend std::ostream &operator<<(std::ostream &out, const Tensor &so) {
    so.Print(out);
    return out;
  }

  /// Invalidate this Tensor by setting the type and shape to unknown and MData to null.
  /// Calling this method will make the Tensor and its data inaccessible, use it with caution.
  void Invalidate();

  /// Copy input tensor into self at the location index.
  /// Index is a vector of axes which can be incomplete:
  /// Ex: shape <2,3>, inserting into index {0} will replace the first row. index {1,2} will replace the last cell.
  /// \param index
  /// \param input
  /// \param partial_insert: boolean to determine if insertion along the full axis is enforced
  /// \return Status code
  Status InsertTensor(const std::vector<dsize_t> &index, const std::shared_ptr<Tensor> &input,
                      const bool partial_insert = false);

  /// Find the address of the given index. Used in InsertTensor.
  /// Example:
  /// Tensor t= [[1,2],[3,4]] , StartAddrOfIndex({0}) -> &1
  /// \param index incomplete index
  /// \param output: startAddrofIndex
  /// \param output: remaining
  /// \return Status code
  Status StartAddrOfIndex(std::vector<dsize_t> ind, uchar **start_addr_of_index, TensorShape *remaining);

  /// Expand the shape of the Tensor with one extra dimension.
  /// For example, if the shape is <512,512,3>:
  /// *- ExpandDim(0) gives: <1,512,512,3>
  /// *- ExpandDim(1) gives: <512,1,512,3>
  /// *- ExpandDim(3) gives: <512,512,3,1>
  /// \param axis location of the dim
  virtual Status ExpandDim(const dsize_t &axis);

  // NOTE(review): presumably removes size-1 dimensions from the shape — confirm in tensor.cc.
  virtual void Squeeze();

  /// Calculates the strides of the Tensor
  /// Ex: Tensor of shape <4,2,2> and type DE_UINT8 (1 byte)
  /// The strides will be {6,2,1}.
  /// Ex: Tensor of shape <4,2,2> and type DE_UINT32 (4 byte)
  /// The strides will be {24,8,4}.
  /// NOTE(review): the example values look inconsistent with shape <4,2,2> (row-major element
  /// strides would be {4,2,1}); verify against the Strides() implementation before relying on them.
  /// \return vector of integers
  std::vector<dsize_t> Strides() const;
  326. std::string ToString() {
  327. std::stringstream ss;
  328. this->Print(ss);
  329. return ss.str();
  330. }
  331. /// Handle negative indices.
  332. /// \param[out] out modified index
  333. /// \param[in] index
  334. /// \param[in] length axis length used to modify index
  335. /// \return dsize_t modified index
  336. static inline dsize_t HandleNeg(dsize_t index, dsize_t length) { return (index < 0) ? (index + length) : index; }
  337. /// Handle negative indices for a vector of indices.
  338. /// \param[out] out modified vector of indices
  339. /// \param[in] index_vector vector of indices
  340. /// \return std::vector<dsize_t> modified vector of indices
  341. static inline std::vector<dsize_t> HandleNegIndices(std::vector<dsize_t> index_vector, std::vector<dsize_t> length) {
  342. std::vector<dsize_t> indices(index_vector.size(), 0);
  343. for (int i = 0; i < index_vector.size(); i++) {
  344. indices[i] = HandleNeg(index_vector[i], length[i]);
  345. }
  346. return indices;
  347. }
  /// Slice tensor bases on the given indices. Copy the sliced data into out tensor.
  /// Based on the type of tensor, SliceNumeric or SliceString will be called
  /// \param[out] out Tensor
  /// \param[in] slice_options vector of SliceOption objects
  /// \return Status error code
  Status Slice(TensorPtr *out, const std::vector<mindspore::dataset::SliceOption> slice_options);

  /// Get slice_option according to shape and index.
  /// \param[in] slice_option input SliceOption object
  /// \param[in] slice_index index of SliceOption object
  /// \param[out] slice_option_ptr slice_option with shape info
  /// \return Status error code
  Status GetSliceOption(const SliceOption &slice_option, const int32_t &slice_index, SliceOption *slice_option_ptr);

#ifdef ENABLE_PYTHON
  /// Constructs numpy array from input tensor
  /// \param[in] data this data is the location of python data
  /// \return Status code
  Status GetDataAsNumpy(py::array *data);

  /// Constructs a numpy array of strings from this tensor.
  /// \param[out] data destination numpy array
  /// \return Status code
  Status GetDataAsNumpyStrings(py::array *data);

  /// Fill a pybind buffer_info describing the tensor's buffer (Python buffer protocol).
  /// \param[in] t tensor to describe
  /// \param[out] out buffer description
  /// \return Status code
  static Status GetBufferInfo(Tensor *t, py::buffer_info *out);
#endif
  368. /// TensorIterator is a linear iterator that can be used to iterate over the elements of the Tensor
  369. /// The order elements is as the memory layout (i.e., row-major) [[1,2,3],[4,5,6] --> 1,2,3,4,5,6
  370. /// \tparam T type of values in the Tensor Iterator
  371. template <typename T, bool = true>
  372. class TensorIterator {
  373. public:
  374. using iterator_category = std::random_access_iterator_tag;
  375. using value_type = T;
  376. using difference_type = ptrdiff_t;
  377. using pointer = T *;
  378. using reference = T &;
  379. explicit TensorIterator(uchar *ptr = nullptr) { ptr_ = reinterpret_cast<T *>(ptr); }
  380. TensorIterator(const TensorIterator<T> &raw_iterator) { ptr_ = raw_iterator.ptr_; }
  381. ~TensorIterator() = default;
  382. TensorIterator<T> &operator=(const TensorIterator<T> &rhs) {
  383. ptr_ = rhs.ptr_;
  384. return *this;
  385. }
  386. TensorIterator<T> &operator=(T *rhs) {
  387. ptr_ = rhs;
  388. return *this;
  389. }
  390. bool operator==(const TensorIterator<T> &rhs) { return ptr_ == rhs.ptr_; }
  391. bool operator!=(const TensorIterator<T> &rhs) { return !(*this == rhs); }
  392. operator bool() const { return ptr_ != nullptr; }
  393. T &operator*() { return *ptr_; }
  394. const T &operator*() const { return *ptr_; }
  395. T *operator->() { return ptr_; }
  396. TensorIterator<T> &operator+=(const ptrdiff_t &inc) {
  397. ptr_ += inc;
  398. return *this;
  399. }
  400. TensorIterator<T> &operator-=(const ptrdiff_t &inc) {
  401. ptr_ -= inc;
  402. return *this;
  403. }
  404. TensorIterator<T> &operator++() {
  405. ++ptr_;
  406. return *this;
  407. }
  408. TensorIterator<T> &operator--() {
  409. --ptr_;
  410. return *this;
  411. }
  412. TensorIterator<T> operator++(int) {
  413. auto temp(*this);
  414. ++ptr_;
  415. return temp;
  416. }
  417. TensorIterator<T> operator--(int) {
  418. auto temp(*this);
  419. --ptr_;
  420. return temp;
  421. }
  422. TensorIterator<T> operator+(const ptrdiff_t &inc) {
  423. auto oldPtr = ptr_;
  424. ptr_ += inc;
  425. auto temp(*this);
  426. ptr_ = oldPtr;
  427. return temp;
  428. }
  429. TensorIterator<T> operator-(const ptrdiff_t &inc) {
  430. auto oldPtr = ptr_;
  431. ptr_ -= inc;
  432. auto temp(*this);
  433. ptr_ = oldPtr;
  434. return temp;
  435. }
  436. protected:
  437. T *ptr_;
  438. };
  // Specialization of TensorIterator for strings. It returns std::string_view for every item.
  // Iteration state is (data_, index_): data_ points at the offset array at the start of the
  // string-tensor buffer, index_ is the flat element index.
  // \tparam DUMMY, used to be able to specialize the inner class
  template <bool DUMMY>
  class TensorIterator<std::string_view, DUMMY> {
   public:
    using iterator_category = std::random_access_iterator_tag;
    using value_type = std::string_view;
    using difference_type = ptrdiff_t;
    using pointer = std::string_view *;
    using reference = std::string_view &;

    /// \param[in] data start of the tensor buffer (the offset array)
    /// \param[in] index flat element index the iterator starts at
    explicit TensorIterator(uchar *data = nullptr, dsize_t index = 0) {
      data_ = reinterpret_cast<const char *>(data);
      index_ = index;
    }

    TensorIterator(const TensorIterator<std::string_view, DUMMY> &raw_iterator) {
      data_ = raw_iterator.data_;
      index_ = raw_iterator.index_;
    }

    ~TensorIterator() = default;

    bool operator==(const TensorIterator<std::string_view> &rhs) { return data_ == rhs.data_ && index_ == rhs.index_; }

    bool operator!=(const TensorIterator<std::string_view> &rhs) { return !(*this == rhs); }

    operator bool() const { return data_ != nullptr; }

    // Dereference: read the element's start offset from the offset array and build a view.
    // NOTE(review): the string_view length comes from the null terminator, so this relies on
    // every stored string being null-terminated — confirm with CreateFromVector<std::string>.
    std::string_view operator*() const {
      auto offset_ = reinterpret_cast<const offset_t *>(data_);
      offset_t start = offset_[index_];
      return std::string_view{data_ + start};
    }

    TensorIterator<std::string_view> &operator+=(const dsize_t &inc) {
      index_ += inc;
      return *this;
    }

    TensorIterator<std::string_view> &operator-=(const dsize_t &inc) {
      index_ -= inc;
      return *this;
    }

    TensorIterator<std::string_view> &operator++() {
      ++index_;
      return *this;
    }

    TensorIterator<std::string_view> &operator--() {
      --index_;
      return *this;
    }

    TensorIterator<std::string_view> operator++(int) {
      auto temp(*this);
      ++index_;
      return temp;
    }

    TensorIterator<std::string_view> operator--(int) {
      auto temp(*this);
      --index_;
      return temp;
    }

    TensorIterator<std::string_view> operator+(const dsize_t &inc) {
      auto oldPtr = index_;
      index_ += inc;
      auto temp(*this);
      index_ = oldPtr;
      return temp;
    }

    TensorIterator<std::string_view> operator-(const dsize_t &inc) {
      auto oldPtr = index_;
      index_ -= inc;
      auto temp(*this);
      index_ = oldPtr;
      return temp;
    }

   protected:
    dsize_t index_;      // flat element index
    const char *data_;   // non-owning pointer to the start of the tensor buffer (offset array)
  };
  /// Return a TensorIterator that points to the start of the Tensor.
  /// It's the user responsibility to use the correct type that matches the Tensor type
  /// \tparam T The type of values in the Tensor
  /// \return TensorIterator
  template <typename T>
  TensorIterator<T> begin() {
    return TensorIterator<T>(data_);
  }

  /// Return a linear iterator that points to the place after the last element of the Tensor.
  /// \tparam T The type of values in the Tensor
  /// \return TensorIterator
  template <typename T>
  TensorIterator<T> end() {
    return TensorIterator<T>(data_end_);
  }

  /// Copies the last dimension at `index` from Tensor `src` to this Tensor.
  /// \param[in] src Tensor
  /// \param[in] index vector to the start of the dimension. The last dim should be 0
  /// \return Status
  Status CopyLastDimAt(const std::shared_ptr<Tensor> &src, const std::vector<dsize_t> &index);

 protected:
  /// Allocate memory for the tensor using the data_allocator
  /// \param[in] length number of bytes to be allocated
  /// \return Error Status
  Status AllocateBuffer(const dsize_t &length);

  /// Get the mutable starting address of the data buffer.
  /// Does not allocate; returns nullptr when no buffer has been allocated yet.
  /// \return unsigned char*
  unsigned char *GetMutableBuffer() { return data_; }

  /// A function that prints Tensor recursively, first called by Print
  /// \param[in] out output stream
  /// \param[in] cur_dim dimension currently being printed
  /// \param[in] cur_index index within the current dimension
  void PrintRecursive(std::ostream &out, int32_t cur_dim, const std::vector<dsize_t> &cur_index) const;

  /// A function that prints info about the tensor
  /// \param[out] out output stream
  void Print(std::ostream &out) const;

  /// A function that prints the value as specified by its index
  /// \param[in] index vector representing the index
  /// \param[out] out output stream
  void PrintItemAt(const std::vector<dsize_t> &index, std::ostream &out) const;

  /// Get pointer to item located at `index`, caller needs to provide the type.
  /// \tparam T
  /// \param[in] index vector<dsize_t>
  /// \return return a pointer to the item specified at index of type `T`
  template <typename T>
  Status GetItemPtr(T **, const std::vector<dsize_t> &index) const;

  /// Get pointer to string located at `index` and the length of string
  /// \param[in] index vector<dsize_t>
  /// \return return a pointer to the string specified at index and the length of the string
  Status GetItemPtr(uchar **, const std::vector<dsize_t> &index, offset_t *length = nullptr) const;

  /// Given a flat index of an item string, return the start and length of the item
  /// \param[in] index flat index of the item
  /// \param[out] string_start address of the string
  /// \param[out] length length of the string
  Status GetStringAt(dsize_t index, uchar **string_start, offset_t *length) const;

  /// Skip the offsets and returns the start of the buffer where the real strings is stored. Caller needs to check if
  /// the tensor's type is a string, otherwise undefined address would be returned.
  /// \return address of the first string of the tensor.
  uchar *GetStringsBuffer() const { return data_ + kOffsetSize * shape_.NumOfElements() + kOffsetSize; }

  /// all access to shape_ should be via shape()
  TensorShape shape_;
  /// data type of tensor
  DataType type_;
  /// pointer to the start of the physical data
  unsigned char *data_;
  /// An allocator for data_
  CharAllocPtr data_allocator_;
  /// pointer to the end of the physical data
  unsigned char *data_end_ = nullptr;

 private:
  friend class DETensor;

  /// Slice numeric tensors.
  Status SliceNumeric(TensorPtr *out, const std::vector<std::vector<dsize_t>> &indices, const TensorShape &shape);

  /// Slice string tensors
  Status SliceString(TensorPtr *out, const std::vector<std::vector<dsize_t>> &indices, const TensorShape &shape);

  /// Copy raw data of a array based on shape and strides to the destination pointer
  /// \param dst [out] Pointer to the destination array where the content is to be copied
  /// \param[in] src Pointer to the source of strided array to be copied
  /// \param[in] shape shape of the source array
  /// \param[in] strides strides of the source array
  /// \param[in] type_size number of bytes needed to store one array element's type
  /// \return Status Code
  static Status CopyStridedArray(unsigned char *dst, unsigned char *src, std::vector<dsize_t> shape,
                                 std::vector<dsize_t> strides, uint8_t type_size);

  /// const of the size of the offset variable
  static constexpr uint8_t kOffsetSize = sizeof(offset_t);

#ifdef ENABLE_PYTHON
  /// Helper function to create a tensor from Numpy array of strings
  /// \param[in] arr Numpy array
  /// \param[out] out Created Tensor
  /// \return Status
  static Status CreateFromNpString(py::array arr, TensorPtr *out);
#endif
};
/// Specialization of end() for string tensors: the string iterator is (buffer base, element index)
/// rather than a raw pointer, so the past-the-end iterator is (data_, element count).
template <>
inline Tensor::TensorIterator<std::string_view> Tensor::end<std::string_view>() {
  return TensorIterator<std::string_view>(data_, shape_.NumOfElements());
}
  609. /// Create a Tensor from a given list of strings.
  610. /// @note: The memory layout of a Tensor of strings consists of the Offset_array followed by the strings.
  611. /// The offset array will store one extra value to find the length of the last string.
  612. /// OFFSET_1, OFFSET_2, ..., OFFSET_n+1, STRING_1, STRING_2, ..., STRING_n
  613. /// The value of each offset is the start index of the corresponding string
  614. /// Offsets is of type offset_t
  615. /// strings will ne null-terminated
  616. /// example: Tensor(['abc', 'de'], shape={2}, type=DE_STRING)
  617. /// |----------------------------------------------------------------|
  618. /// | OFFSET ARRAY | STRINGS |
  619. /// | bytes 0-3 | bytes 3-6 | bytes 7-10 | bytes 11-14 | bytes 15-17 |
  620. /// | 11 | 15 | 18 | abc\0 | de\0 |
  621. /// |----------------------------------------------------------------|
  622. /// \param[in] items elements of the tensor
  623. /// \param[in] shape shape of the output tensor
  624. /// \param[out] out output argument to hold the created Tensor
  625. /// \return Status Code
  626. template <>
  627. inline Status Tensor::CreateFromVector<std::string>(const std::vector<std::string> &items, const TensorShape &shape,
  628. TensorPtr *out) {
  629. CHECK_FAIL_RETURN_UNEXPECTED(
  630. items.size() == shape.NumOfElements(),
  631. "Number of elements in the vector does not match the number of elements of the shape required");
  632. const TensorAlloc *alloc = GlobalContext::Instance()->tensor_allocator();
  633. *out = std::allocate_shared<Tensor>(*alloc, TensorShape({static_cast<dsize_t>(items.size())}),
  634. DataType(DataType::DE_STRING));
  635. if (items.size() == 0) {
  636. if (shape.known()) {
  637. return (*out)->Reshape(shape);
  638. }
  639. }
  640. auto length_sum = [](dsize_t sum, const std::string &s) { return s.length() + sum; };
  641. dsize_t total_length = std::accumulate(items.begin(), items.end(), 0, length_sum);
  642. // total bytes needed = offset array + strings
  643. // offset array needs to store one offset var per element + 1 extra to get the length of the last string.
  644. // strings will be null-terminated --> need 1 extra byte per element
  645. dsize_t num_bytes = (kOffsetSize + 1) * (*out)->shape_.NumOfElements() + kOffsetSize + total_length;
  646. (*out)->AllocateBuffer(num_bytes);
  647. auto offset_arr = reinterpret_cast<offset_t *>((*out)->data_);
  648. uchar *buf = (*out)->GetStringsBuffer();
  649. offset_t offset = buf - (*out)->data_; // the first string will start here
  650. uint32_t i = 0;
  651. for (const auto &str : items) {
  652. // insert the start index of the string.
  653. offset_arr[i++] = offset;
  654. // total bytes are reduced by kOffsetSize
  655. num_bytes -= kOffsetSize;
  656. // insert actual string
  657. int ret_code = memcpy_s((*out)->data_ + offset, num_bytes, common::SafeCStr(str), str.length() + 1);
  658. if (ret_code != 0) MS_LOG(ERROR) << "Cannot copy string into Tensor";
  659. // next string will be stored right after the current one.
  660. offset = offset + str.length() + 1;
  661. // total bytes are reduced by the length of the string
  662. num_bytes -= str.length() + 1;
  663. }
  664. // store one more offset value so we can get the length of the last string
  665. offset_arr[i] = offset;
  666. (*out)->data_end_ = (*out)->data_ + offset_arr[i];
  667. MS_ASSERT(num_bytes == 0);
  668. if (shape.known()) {
  669. RETURN_IF_NOT_OK((*out)->Reshape(shape));
  670. }
  671. return Status::OK();
  672. }
  673. /// Create a string scalar Tensor from the given value.
  674. /// \param[in] item value
  675. /// \param[out] out Created tensor
  676. /// \return Status code
  677. template <>
  678. inline Status Tensor::CreateScalar<std::string>(const std::string &item, TensorPtr *out) {
  679. return CreateFromVector<std::string>({item}, TensorShape::CreateScalar(), out);
  680. }
  681. } // namespace dataset
  682. } // namespace mindspore
  683. #endif // MINDSPORE_CCSRC_MINDDATA_DATASET_CORE_TENSOR_H_