You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; can include dashes ('-'); and can be up to 35 characters long.

tensor.h 29 kB

5 years ago
5 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775
  1. /**
  2. * Copyright 2019 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_CORE_TENSOR_H_
  17. #define MINDSPORE_CCSRC_MINDDATA_DATASET_CORE_TENSOR_H_
  18. #include <deque>
  19. #include <memory>
  20. #include <string>
  21. #include <vector>
  22. #include "./securec.h"
  23. #ifndef ENABLE_ANDROID
  24. #include "utils/log_adapter.h"
  25. #else
  26. #include "mindspore/lite/src/common/log_adapter.h"
  27. #endif
  28. #if defined(_WIN32) || defined(_WIN64)
  29. #undef HAVE_STDDEF_H
  30. #undef HAVE_STDLIB_H
  31. #endif
  32. #ifdef ENABLE_PYTHON
  33. #include "pybind11/numpy.h"
  34. #include "pybind11/pybind11.h"
  35. #include "pybind11/stl.h"
  36. #endif
  37. #include "utils/ms_utils.h"
  38. #include "minddata/dataset/core/constants.h"
  39. #include "minddata/dataset/core/data_type.h"
  40. #include "minddata/dataset/core/tensor_shape.h"
  41. #include "minddata/dataset/util/status.h"
  42. #ifndef ENABLE_ANDROID
  43. #include "proto/example.pb.h"
  44. #else
  45. #include "minddata/dataset/include/de_tensor.h"
  46. #endif
  47. #ifdef ENABLE_PYTHON
  48. namespace py = pybind11;
  49. #endif
namespace mindspore {
#ifdef ENABLE_ANDROID
// Forward declaration: Tensor befriends the lite DETensor wrapper (see the friend
// declaration inside the class below).
namespace tensor {
class DETensor;
}  // namespace tensor
#endif
namespace dataset {
// Forward declarations; Tensor is defined below, Allocator elsewhere in the project.
class Tensor;
template <typename T>
class Allocator;
using CharAllocPtr = std::unique_ptr<Allocator<unsigned char>>;  // allocator handle for the raw byte buffer
using TensorAllocPtr = std::shared_ptr<Allocator<Tensor>>;       // An allocator shared_ptr for Tensors
using offset_t = uint32_t;                                       // type of offset values to store strings locations
using TensorPtr = std::shared_ptr<Tensor>;
/// Tensor is the main in-memory data container of the dataset pipeline.
/// It owns a flat byte buffer (data_) described by a TensorShape and a DataType.
/// String tensors use a packed layout: an offset array followed by the
/// null-terminated strings (see the CreateFromVector<std::string> specialization
/// below the class for the exact layout).
class Tensor {
 public:
  Tensor() = delete;
  Tensor(const Tensor &other) = delete;
  Tensor &operator=(const Tensor &other) = delete;

  /// Create a tensor using shape and type. This constructor should not be used directly, use CreateFromTensor instead
  /// \note The shape and type information should be known and valid
  /// \note The constructor does not allocate data
  /// \param shape TensorShape
  /// \param type DataType
  Tensor(const TensorShape &shape, const DataType &type);

  /// Move constructor
  /// \param other Tensor to be moved
  Tensor(Tensor &&other) noexcept;

  /// Move assignment operator
  /// \param other Tensor to be moved
  Tensor &operator=(Tensor &&other) noexcept;

  /// Create a numeric tensor with type and shape. Items of the tensor would be uninitialized.
  /// \param[in] shape shape of the output tensor
  /// \param[in] type type of the output tensor
  /// \param[out] out Generated tensor
  /// \return Status code
  static Status CreateEmpty(const TensorShape &shape, const DataType &type, TensorPtr *out);

  /// Create a numeric tensor from a pointer in memory. Length of the source data is determined from the shape and type.
  /// Data will be copied into the new created tensor.
  /// \param[in] shape shape of the output tensor
  /// \param[in] type type of the output tensor
  /// \param[in] src pointer to the source data
  /// \param[out] out Generated tensor
  /// \return Status code
  static Status CreateFromMemory(const TensorShape &shape, const DataType &type, const uchar *src, TensorPtr *out);

  /// Create a tensor from a pointer in memory and length. Data will be copied into the new created tensor.
  /// \param[in] shape shape of the output tensor
  /// \param[in] type type of the output tensor
  /// \param[in] src pointer to the source data
  /// \param[in] length length of the src data
  /// \param[out] out Generated tensor
  /// \return Status code
  static Status CreateFromMemory(const TensorShape &shape, const DataType &type, const uchar *src,
                                 const dsize_t &length, TensorPtr *out);

  /// Create a copy of the input tensor
  /// \param[in] in original tensor to be copied
  /// \param[out] out output tensor to be generated
  /// \return Status
  static Status CreateFromTensor(const TensorPtr &in, TensorPtr *out) {
    return CreateFromMemory(in->shape(), in->type(), in->GetBuffer(), in->SizeInBytes(), out);
  }

#ifdef ENABLE_PYTHON
  /// Create a Tensor from a given py::array
  /// \param[in] arr py::array
  /// \param[out] out Created tensor
  /// \return Status Code
  static Status CreateFromNpArray(const py::array &arr, TensorPtr *out);
#endif

#ifndef ENABLE_ANDROID
  /// Create a tensor of type DE_STRING from a BytesList.
  /// \param[in] bytes_list protobuf's Bytelist
  /// \param[in] shape shape of the output tensor
  /// \param[out] out created Tensor
  /// \return Status Code
  static Status CreateFromByteList(const dataengine::BytesList &bytes_list, const TensorShape &shape, TensorPtr *out);

  /// Create a tensor of type UINT8 or INT8 from a BytesList.
  /// The tensor will be padded with ' ' to reach the required pad_size.
  /// \param[in] bytes_list protobuf's Bytelist
  /// \param[in] shape shape of the output tensor
  /// \param[in] type type of created tensor. Should be DE_UINT8 or INT8
  /// \param[in] pad_size The size of the tensor after padding
  /// \param[out] out created Tensor
  /// \return Status Code
  static Status CreateFromByteList(const dataengine::BytesList &bytes_list, const TensorShape &shape,
                                   const DataType &type, dsize_t pad_size, TensorPtr *out);
#endif

  /// Create a Tensor from a given list of values.
  /// \tparam type of the values to be inserted.
  /// \param[in] items elements of the tensor
  /// \param[in] shape shape of the output tensor
  /// \param[out] out output argument to hold the created Tensor
  /// \return Status Code
  template <typename T>
  static Status CreateFromVector(const std::vector<T> &items, const TensorShape &shape, TensorPtr *out) {
    CHECK_FAIL_RETURN_UNEXPECTED(
      items.size() == shape.NumOfElements(),
      "Number of elements in the vector does not match the number of elements of the shape required");
    DataType type = DataType::FromCType<T>();
    // if items is empty, items_ptr would be nullptr. CreateFromMemory will handle this case.
    // NOTE(review): &items[0] on an empty vector is technically undefined behavior;
    // items.data() would be the safe spelling — verify before relying on this.
    auto items_ptr = reinterpret_cast<const uchar *>(&items[0]);
    return CreateFromMemory(shape, type, items_ptr, out);
  }

  /// Create a 1D Tensor from a given list of values.
  /// \tparam type of the values to be inserted.
  /// \param[in] items elements of the tensor
  /// \param[out] out output argument to hold the created Tensor
  /// \return Status Code
  template <typename T>
  static Status CreateFromVector(const std::vector<T> &items, TensorPtr *out) {
    return CreateFromVector(items, TensorShape({static_cast<dsize_t>(items.size())}), out);
  }

  /// Create a numeric scalar Tensor from the given value.
  /// \tparam T type of value
  /// \param[in] item value
  /// \param[out] out Created tensor
  /// \return Status code
  template <typename T>
  static Status CreateScalar(const T &item, TensorPtr *out) {
    DataType type = DataType::FromCType<T>();
    auto item_ptr = reinterpret_cast<const uchar *>(&item);
    return CreateFromMemory(TensorShape::CreateScalar(), type, item_ptr, out);
  }

  /// Create a tensor from a binary file on disk.
  /// \param[in] path file to be read
  /// \param[out] out Created Tensor
  /// \return Status code
  static Status CreateFromFile(const std::string &path, TensorPtr *out);

  /// Destruct the tensor and release the memory using the allocator
  virtual ~Tensor();

  /// Equality operator. compares tensor shape, type and data
  /// \param[in] rhs Tensor to be compared with
  /// \return bool
  bool operator==(const Tensor &rhs) const;

  bool operator!=(const Tensor &rhs) const { return !((*this) == rhs); }

  /// Get item located at `index`, caller needs to provide the type.
  /// \tparam T
  /// \param[in] index vector<dsize_t>
  /// \return return the item specified at index
  template <typename T>
  Status GetItemAt(T *o, const std::vector<dsize_t> &index) const;

  /// Get string located at `index`.
  /// \param[in] index vector<dsize_t>
  /// \return return std::string_view specified at index
  Status GetItemAt(std::string_view *o, const std::vector<dsize_t> &index) const;

  // Typed variants of GetItemAt; implementations live in the source file.
  template <typename T>
  Status GetUnsignedIntAt(T *o, const std::vector<dsize_t> &index) const;

  template <typename T>
  Status GetSignedIntAt(T *o, const std::vector<dsize_t> &index) const;

  template <typename T>
  Status GetFloatAt(T *o, const std::vector<dsize_t> &index) const;

  /// set item at location specified by index
  /// \tparam `T`
  /// \param[in] index
  /// \param[in] value of type `T`
  template <typename T>
  Status SetItemAt(const std::vector<dsize_t> &index, const T &value) {
    T *ptr = nullptr;
    RETURN_IF_NOT_OK(GetItemPtr<T>(&ptr, index));
    *ptr = value;
    return Status::OK();
  }

  /// set string item at location specified by index
  /// \note The replacement string must have exactly the same length as the existing item,
  /// since the string layout is packed and offsets are not recomputed.
  /// \param[in] index
  /// \param[in] value of type std::string
  Status SetItemAt(const std::vector<dsize_t> &index, const std::string &value) {
    RETURN_UNEXPECTED_IF_NULL(data_);
    uchar *ptr = nullptr;
    offset_t length = 0;
    RETURN_IF_NOT_OK(GetItemPtr(&ptr, index, &length));
    if (value.length() != length) {
      RETURN_STATUS_UNEXPECTED("Length of the new string does not match the item.");
    }
    // NOTE(review): the memcpy_s return value is not checked here — verify intent.
    memcpy_s(reinterpret_cast<char *>(ptr), length, value.c_str(), length);
    return Status::OK();
  }

  /// fill tensor with Zeros. Does not support strings.
  Status Zero() {
    CHECK_FAIL_RETURN_UNEXPECTED(type_ != DataType::DE_STRING, "Cannot use Zero on tensor of strings..");
    dsize_t size = SizeInBytes();
    CHECK_FAIL_RETURN_UNEXPECTED(memset_sp(GetMutableBuffer(), size, 0, size) == 0,
                                 "Failed to fill tensor with zeroes.");
    return Status::OK();
  }

  /// Fill all elements in the Tensor with the given value of type `T`. Does not support strings.
  /// \tparam T
  /// \param value[in]
  template <typename T>
  Status Fill(const T &value) {
    CHECK_FAIL_RETURN_UNEXPECTED(type_ != DataType::DE_STRING, "Cannot use fill on tensor of strings.");
    int64_t cellSize = type_.SizeInBytes();
    if ((data_ != nullptr) && type_.IsCompatible<T>()) {
      for (dsize_t i = 0; i < Size(); i++) {
        CHECK_FAIL_RETURN_UNEXPECTED(memcpy_s((data_ + i * cellSize), cellSize, &value, cellSize) == 0, "memcpy err");
      }
      return Status::OK();
    } else {
      std::string err;
      err += (data_ == nullptr) ? "data_ is nullptr \t" : "";
      // NOTE(review): this condition looks inverted — the "not compatible" message is
      // appended when IsCompatible<T>() is TRUE. Reaching this branch with data_ != nullptr
      // implies the type is NOT compatible, so the message would be dropped. Verify.
      err += type_.IsCompatible<T>() ? "data type not compatible\t" : "";
      return Status(StatusCode::kUnexpectedError, err);
    }
  }

  /// Getter function for shape
  /// \return
  const TensorShape &shape() const { return shape_; }

  /// Check if tensor has data
  /// \return bool - true if the tensor holds data (data_ is non-null)
  bool HasData() const { return data_ != nullptr; }

  /// Reshape the tensor. The given shape should have the same number of elements in the Tensor
  /// \param shape
  virtual Status Reshape(const TensorShape &shape);

  /// \return number of elements in this tensor
  dsize_t Size() const { return shape().NumOfElements(); }

  /// \return the number of bytes this tensor needs
  dsize_t SizeInBytes() const {
    // data_end_ is only set once data exists; before that, derive the size from shape and type.
    if (data_end_ == nullptr) return type_.SizeInBytes() * shape_.NumOfElements();
    return data_end_ - data_;
  }

  /// \return the rank of the tensor
  dsize_t Rank() const { return shape().Rank(); }

  /// Get the starting memory address (read-only) of the tensor's data. Does not allocate.
  /// \return const unsigned char*
  const unsigned char *GetBuffer() const { return data_; }

  /// Getter of the type
  /// \return
  DataType type() const { return type_; }

  /// Provide stream operator for displaying it
  /// \param output stream
  /// \param so the Tensor object to be printed
  /// \return output stream
  friend std::ostream &operator<<(std::ostream &out, const Tensor &so) {
    so.Print(out);
    return out;
  }

  /// Invalidate this Tensor by setting the type and shape to unknown and MData to null.
  /// Calling this method will make the Tensor and its data inaccessible, use it with caution.
  void Invalidate();

  /// Copy input tensor into self at the location index.
  /// Index is a vector of axes which can be incomplete:
  /// Ex: shape <2,3>, inserting into index {0} will replace the first row. index {1,2} will replace the last cell.
  /// \param index
  /// \param input
  /// \param partial_insert: boolean to determine if insertion along the full axis is enforced
  /// \return Status code
  Status InsertTensor(const std::vector<dsize_t> &index, const std::shared_ptr<Tensor> &input,
                      const bool partial_insert = false);

  /// Find the address of the given index. Used in InsertTensor.
  /// Example:
  /// Tensor t= [[1,2],[3,4]] , StartAddrOfIndex({0}) -> &1
  /// \param index incomplete index
  /// \param output: startAddrofIndex
  /// \param output: remaining
  /// \return Status code
  Status StartAddrOfIndex(std::vector<dsize_t> ind, uchar **start_addr_of_index, TensorShape *remaining);

  /// Expand the shape of the Tensor with one extra dimension.
  /// For example, if the shape is <512,512,3>:
  /// *- ExpandDim(0) gives: <1,512,512,3>
  /// *- ExpandDim(1) gives: <512,1,512,3>
  /// *- ExpandDim(3) gives: <512,512,3,1>
  /// \param axis location of the dim
  virtual Status ExpandDim(const dsize_t &axis);

  /// Squeeze the tensor's shape (implementation in the source file).
  virtual void Squeeze();

  /// Calculates the strides of the Tensor
  /// Ex: Tensor of shape <4,2,2> and type DE_UINT8 (1 byte)
  /// The strides will be {6,2,1}.
  /// Ex: Tensor of shape <4,2,2> and type DE_UINT32 (4 byte)
  /// The strides will be {24,8,4}.
  /// NOTE(review): these example values look inconsistent with shape <4,2,2>
  /// ({4,2,1} and {16,8,4} would be expected) — verify against the implementation.
  /// \return vector of integers
  std::vector<dsize_t> Strides() const;

  /// Render the tensor as a string via Print().
  std::string ToString() {
    std::stringstream ss;
    this->Print(ss);
    return ss.str();
  }

  /// Handle negative indices: maps index -k to (length - k); non-negative indices pass through.
  static inline dsize_t HandleNeg(dsize_t index, dsize_t length) { return (index < 0) ? (index + length) : index; }

  /// Slice tensor based on the given indices. Copy the sliced data into out tensor. Only rank-1 tensors are supported.
  /// Based on the type of tensor, SliceNumeric or SliceString will be called
  /// \param[out] out Tensor
  /// \param[in] indices vector of indices
  /// \return Status error code
  Status Slice(TensorPtr *out, const std::vector<dsize_t> &indices);

  /// Slice numeric tensors.
  Status SliceNumeric(TensorPtr *out, const std::vector<dsize_t> &indices);

  /// Slice string tensors
  Status SliceString(TensorPtr *out, const std::vector<dsize_t> &indices);

#ifdef ENABLE_PYTHON
  /// Constructs numpy array from input tensor
  /// \param[in] data this data is the location of python data
  /// \return Status code
  Status GetDataAsNumpy(py::array *data);

  Status GetDataAsNumpyStrings(py::array *data);

  static Status GetBufferInfo(Tensor *t, py::buffer_info *out);
#endif

  /// TensorIterator is a linear iterator that can be used to iterate over the elements of the Tensor
  /// The order of elements is as the memory layout (i.e., row-major) [[1,2,3],[4,5,6]] --> 1,2,3,4,5,6
  /// \tparam T type of values in the Tensor Iterator
  template <typename T, bool = true>
  class TensorIterator {
   public:
    using iterator_category = std::random_access_iterator_tag;
    using value_type = T;
    using difference_type = ptrdiff_t;
    using pointer = T *;
    using reference = T &;

    explicit TensorIterator(uchar *ptr = nullptr) { ptr_ = reinterpret_cast<T *>(ptr); }

    TensorIterator(const TensorIterator<T> &raw_iterator) { ptr_ = raw_iterator.ptr_; }

    ~TensorIterator() = default;

    TensorIterator<T> &operator=(const TensorIterator<T> &rhs) {
      ptr_ = rhs.ptr_;
      return *this;
    }

    TensorIterator<T> &operator=(T *rhs) {
      ptr_ = rhs;
      return *this;
    }

    bool operator==(const TensorIterator<T> &rhs) { return ptr_ == rhs.ptr_; }

    bool operator!=(const TensorIterator<T> &rhs) { return !(*this == rhs); }

    operator bool() const { return ptr_ != nullptr; }

    T &operator*() { return *ptr_; }

    const T &operator*() const { return *ptr_; }

    T *operator->() { return ptr_; }

    TensorIterator<T> &operator+=(const ptrdiff_t &inc) {
      ptr_ += inc;
      return *this;
    }

    TensorIterator<T> &operator-=(const ptrdiff_t &inc) {
      ptr_ -= inc;
      return *this;
    }

    TensorIterator<T> &operator++() {
      ++ptr_;
      return *this;
    }

    TensorIterator<T> &operator--() {
      --ptr_;
      return *this;
    }

    TensorIterator<T> operator++(int) {
      auto temp(*this);
      ++ptr_;
      return temp;
    }

    TensorIterator<T> operator--(int) {
      auto temp(*this);
      --ptr_;
      return temp;
    }

    // operator+/- temporarily mutate ptr_, copy, then restore — the net effect is a
    // new iterator at the offset position with *this unchanged.
    TensorIterator<T> operator+(const ptrdiff_t &inc) {
      auto oldPtr = ptr_;
      ptr_ += inc;
      auto temp(*this);
      ptr_ = oldPtr;
      return temp;
    }

    TensorIterator<T> operator-(const ptrdiff_t &inc) {
      auto oldPtr = ptr_;
      ptr_ -= inc;
      auto temp(*this);
      ptr_ = oldPtr;
      return temp;
    }

   protected:
    T *ptr_;
  };

  // Specialization of TensorIterator for strings. It returns std::string_view for every item.
  // \tparam DUMMY, used to be able to specialize the inner class
  template <bool DUMMY>
  class TensorIterator<std::string_view, DUMMY> {
   public:
    using iterator_category = std::random_access_iterator_tag;
    using value_type = std::string_view;
    using difference_type = ptrdiff_t;
    using pointer = std::string_view *;
    using reference = std::string_view &;

    explicit TensorIterator(uchar *data = nullptr, dsize_t index = 0) {
      data_ = reinterpret_cast<const char *>(data);
      index_ = index;
    }

    TensorIterator(const TensorIterator<std::string_view, DUMMY> &raw_iterator) {
      data_ = raw_iterator.data_;
      index_ = raw_iterator.index_;
    }

    ~TensorIterator() = default;

    bool operator==(const TensorIterator<std::string_view> &rhs) { return data_ == rhs.data_ && index_ == rhs.index_; }

    bool operator!=(const TensorIterator<std::string_view> &rhs) { return !(*this == rhs); }

    operator bool() const { return data_ != nullptr; }

    std::string_view operator*() const {
      // data_ points at the offset array; offset_[index_] is the start of the index_-th
      // string relative to the beginning of the buffer. Strings are null-terminated,
      // which is how string_view finds the end here.
      auto offset_ = reinterpret_cast<const offset_t *>(data_);
      offset_t start = offset_[index_];
      return std::string_view{data_ + start};
    }

    TensorIterator<std::string_view> &operator+=(const dsize_t &inc) {
      index_ += inc;
      return *this;
    }

    TensorIterator<std::string_view> &operator-=(const dsize_t &inc) {
      index_ -= inc;
      return *this;
    }

    TensorIterator<std::string_view> &operator++() {
      ++index_;
      return *this;
    }

    TensorIterator<std::string_view> &operator--() {
      --index_;
      return *this;
    }

    TensorIterator<std::string_view> operator++(int) {
      auto temp(*this);
      ++index_;
      return temp;
    }

    TensorIterator<std::string_view> operator--(int) {
      auto temp(*this);
      --index_;
      return temp;
    }

    TensorIterator<std::string_view> operator+(const dsize_t &inc) {
      auto oldPtr = index_;
      index_ += inc;
      auto temp(*this);
      index_ = oldPtr;
      return temp;
    }

    TensorIterator<std::string_view> operator-(const dsize_t &inc) {
      auto oldPtr = index_;
      index_ -= inc;
      auto temp(*this);
      index_ = oldPtr;
      return temp;
    }

   protected:
    dsize_t index_;
    const char *data_;
  };

  /// Return a TensorIterator that points to the start of the Tensor.
  /// It's the user responsibility to use the correct type that matches the Tensor type
  /// \tparam T The type of values in the Tensor
  /// \return TensorIterator
  template <typename T>
  TensorIterator<T> begin() {
    return TensorIterator<T>(data_);
  }

  /// Return a linear iterator that points to the place after the last element of the Tensor.
  /// (Specialized for std::string_view below the class.)
  /// \tparam T The type of values in the Tensor
  /// \return TensorIterator
  template <typename T>
  TensorIterator<T> end() {
    return TensorIterator<T>(data_end_);
  }

  /// Copies the last dimension at `index` from Tensor `src` to this Tensor.
  /// \param[in] src Tensor
  /// \param[in] index vector to the start of the dimension. The last dim should be 0
  /// \return Status
  Status CopyLastDimAt(const std::shared_ptr<Tensor> &src, const std::vector<dsize_t> &index);

 protected:
  /// Allocate memory for the tensor using the data_allocator
  /// \param[in] length number of bytes to be allocated
  /// \return Error Status
  Status AllocateBuffer(const dsize_t &length);

  /// Get the starting (mutable) memory address for the data of the tensor. Does not allocate.
  /// \return unsigned char*
  unsigned char *GetMutableBuffer() { return data_; }

  /// A function that prints Tensor recursively, first called by print
  /// \param[in] out
  /// \param[in] cur_dim
  /// \param[in] cur_index
  void PrintRecursive(std::ostream &out, int32_t cur_dim, const std::vector<dsize_t> &cur_index) const;

  /// A function that prints info about the tensor
  /// \param[out] out output stream
  void Print(std::ostream &out) const;

  /// A function that prints the value as specified by its index
  /// \param[in] index vector representing the index
  /// \param[out] out
  void PrintItemAt(const std::vector<dsize_t> &index, std::ostream &out) const;

  /// Get pointer to item located at `index`, caller needs to provide the type.
  /// \tparam T
  /// \param[in] index vector<dsize_t>
  /// \return return a pointer to the item specified at index of type `T`
  template <typename T>
  Status GetItemPtr(T **, const std::vector<dsize_t> &index) const;

  /// Get pointer to string located at `index` and the length of string
  /// \param[in] index vector<dsize_t>
  /// \return return a pointer to the string specified at index and the length of the string
  Status GetItemPtr(uchar **, const std::vector<dsize_t> &index, offset_t *length = nullptr) const;

  /// Given a flat index of an item string, return the start and length of the item
  /// \param[in] index flat index of the item
  /// \param[out] start address of the string
  /// \param[out] length of the string
  Status GetStringAt(dsize_t index, uchar **string_start, offset_t *length) const;

  /// Skip the offsets and returns the start of the buffer where the real strings is stored. Caller needs to check if
  /// the tensor's type is a string, otherwise undefined address would be returned. \return address of the first string
  /// of the tensor.
  uchar *GetStringsBuffer() const { return data_ + kOffsetSize * shape_.NumOfElements() + kOffsetSize; }

  /// all access to shape_ should be via shape()
  TensorShape shape_;
  /// data type of tensor
  DataType type_;
  /// pointer to the start of the physical data
  unsigned char *data_;
  /// An allocator for data_
  CharAllocPtr data_allocator_;
  /// pointer to the end of the physical data
  unsigned char *data_end_ = nullptr;

 private:
#ifdef ENABLE_ANDROID
  friend class tensor::DETensor;
#endif

  /// Copy raw data of a array based on shape and strides to the destination pointer
  /// \param dst [out] Pointer to the destination array where the content is to be copied
  /// \param[in] src Pointer to the source of strided array to be copied
  /// \param[in] shape shape of the source array
  /// \param[in] strides strides of the source array
  /// \param[in] type_size number of bytes needed to store one array element's type
  /// \return Status Code
  static Status CopyStridedArray(unsigned char *dst, unsigned char *src, std::vector<dsize_t> shape,
                                 std::vector<dsize_t> strides, uint8_t type_size);

  /// const of the size of the offset variable
  static constexpr uint8_t kOffsetSize = sizeof(offset_t);

#ifdef ENABLE_PYTHON
  /// Helper function to create a tensor from Numpy array of strings
  /// \param[in] arr Numpy array
  /// \param[out] out Created Tensor
  /// \return Status
  static Status CreateFromNpString(py::array arr, TensorPtr *out);
#endif
};
  580. template <>
  581. inline Tensor::TensorIterator<std::string_view> Tensor::end<std::string_view>() {
  582. return TensorIterator<std::string_view>(data_, shape_.NumOfElements());
  583. }
  584. /// Create a Tensor from a given list of strings.
  585. /// @note: The memory layout of a Tensor of strings consists of the Offset_array followed by the strings.
  586. /// The offset array will store one extra value to find the length of the last string.
  587. /// OFFSET_1, OFFSET_2, ..., OFFSET_n+1, STRING_1, STRING_2, ..., STRING_n
  588. /// The value of each offset is the start index of the corresponding string
  589. /// Offsets is of type offset_t
  590. /// strings will ne null-terminated
  591. /// example: Tensor(['abc', 'de'], shape={2}, type=DE_STRING)
  592. /// |----------------------------------------------------------------|
  593. /// | OFFSET ARRAY | STRINGS |
  594. /// | bytes 0-3 | bytes 3-6 | bytes 7-10 | bytes 11-14 | bytes 15-17 |
  595. /// | 11 | 15 | 18 | abc\0 | de\0 |
  596. /// |----------------------------------------------------------------|
  597. /// \param[in] items elements of the tensor
  598. /// \param[in] shape shape of the output tensor
  599. /// \param[out] out output argument to hold the created Tensor
  600. /// \return Status Code
  601. template <>
  602. inline Status Tensor::CreateFromVector<std::string>(const std::vector<std::string> &items, const TensorShape &shape,
  603. TensorPtr *out) {
  604. CHECK_FAIL_RETURN_UNEXPECTED(
  605. items.size() == shape.NumOfElements(),
  606. "Number of elements in the vector does not match the number of elements of the shape required");
  607. const TensorAlloc *alloc = GlobalContext::Instance()->tensor_allocator();
  608. *out = std::allocate_shared<Tensor>(*alloc, TensorShape({static_cast<dsize_t>(items.size())}),
  609. DataType(DataType::DE_STRING));
  610. if (items.size() == 0) {
  611. if (shape.known()) {
  612. return (*out)->Reshape(shape);
  613. }
  614. }
  615. auto length_sum = [](dsize_t sum, const std::string &s) { return s.length() + sum; };
  616. dsize_t total_length = std::accumulate(items.begin(), items.end(), 0, length_sum);
  617. // total bytes needed = offset array + strings
  618. // offset array needs to store one offset var per element + 1 extra to get the length of the last string.
  619. // strings will be null-terminated --> need 1 extra byte per element
  620. dsize_t num_bytes = (kOffsetSize + 1) * (*out)->shape_.NumOfElements() + kOffsetSize + total_length;
  621. (*out)->AllocateBuffer(num_bytes);
  622. auto offset_arr = reinterpret_cast<offset_t *>((*out)->data_);
  623. uchar *buf = (*out)->GetStringsBuffer();
  624. offset_t offset = buf - (*out)->data_; // the first string will start here
  625. uint32_t i = 0;
  626. for (const auto &str : items) {
  627. // insert the start index of the string.
  628. offset_arr[i++] = offset;
  629. // total bytes are reduced by kOffsetSize
  630. num_bytes -= kOffsetSize;
  631. // insert actual string
  632. int ret_code = memcpy_s((*out)->data_ + offset, num_bytes, common::SafeCStr(str), str.length() + 1);
  633. if (ret_code != 0) MS_LOG(ERROR) << "Cannot copy string into Tensor";
  634. // next string will be stored right after the current one.
  635. offset = offset + str.length() + 1;
  636. // total bytes are reduced by the length of the string
  637. num_bytes -= str.length() + 1;
  638. }
  639. // store one more offset value so we can get the length of the last string
  640. // length[last_element] = offset_arr[last_element + 1] - offset_arr[last_element]
  641. offset_arr[i] = offset;
  642. (*out)->data_end_ = (*out)->data_ + offset_arr[i];
  643. MS_ASSERT(num_bytes == 0);
  644. if (shape.known()) {
  645. RETURN_IF_NOT_OK((*out)->Reshape(shape));
  646. }
  647. return Status::OK();
  648. }
  649. /// Create a string scalar Tensor from the given value.
  650. /// \param[in] item value
  651. /// \param[out] out Created tensor
  652. /// \return Status code
  653. template <>
  654. inline Status Tensor::CreateScalar<std::string>(const std::string &item, TensorPtr *out) {
  655. return CreateFromVector<std::string>({item}, TensorShape::CreateScalar(), out);
  656. }
  657. } // namespace dataset
  658. } // namespace mindspore
  659. #endif // MINDSPORE_CCSRC_MINDDATA_DATASET_CORE_TENSOR_H_