You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

tensor.h 28 kB

5 years ago
5 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771
  1. /**
  2. * Copyright 2019 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_CORE_TENSOR_H_
  17. #define MINDSPORE_CCSRC_MINDDATA_DATASET_CORE_TENSOR_H_
#include <cstddef>
#include <deque>
#include <iterator>
#include <memory>
#include <numeric>
#include <sstream>
#include <string>
#include <string_view>
#include <vector>
  22. #include "./securec.h"
  23. #include "utils/log_adapter.h"
  24. #if defined(_WIN32) || defined(_WIN64)
  25. #undef HAVE_STDDEF_H
  26. #undef HAVE_STDLIB_H
  27. #endif
  28. #ifdef ENABLE_PYTHON
  29. #include "pybind11/numpy.h"
  30. #include "pybind11/pybind11.h"
  31. #include "pybind11/stl.h"
  32. #endif
  33. #include "utils/ms_utils.h"
  34. #include "minddata/dataset/core/constants.h"
  35. #include "minddata/dataset/core/data_type.h"
  36. #include "minddata/dataset/core/tensor_shape.h"
  37. #include "minddata/dataset/util/status.h"
  38. #ifndef ENABLE_ANDROID
  39. #include "proto/example.pb.h"
  40. #else
  41. #include "minddata/dataset/include/de_tensor.h"
  42. #endif
  43. #ifdef ENABLE_PYTHON
  44. namespace py = pybind11;
  45. #endif
  46. namespace mindspore {
  47. #ifdef ENABLE_ANDROID
  48. namespace tensor {
  49. class DETensor;
  50. } // namespace tensor
  51. #endif
  52. namespace dataset {
  53. class Tensor;
  54. template <typename T>
  55. class Allocator;
  56. using CharAllocPtr = std::unique_ptr<Allocator<unsigned char>>;
  57. using TensorAllocPtr = std::shared_ptr<Allocator<Tensor>>; // An allocator shared_ptr for Tensors
  58. using offset_t = uint32_t; // type of offset values to store strings locations
  59. using TensorPtr = std::shared_ptr<Tensor>;
  60. class Tensor {
  61. public:
  62. Tensor() = delete;
  63. Tensor(const Tensor &other) = delete;
  64. Tensor &operator=(const Tensor &other) = delete;
  65. /// Create a tensor using shape and type. This constructor should not be used directly, use CreateFromTensor instead
  66. /// \note The shape and type information should be known and valid
  67. /// \note The constructor does not allocate data
  68. /// \param shape TensorShape
  69. /// \param type DataType
  70. Tensor(const TensorShape &shape, const DataType &type);
  71. /// Move constructor
  72. /// \param other Tensor to be moved
  73. Tensor(Tensor &&other) noexcept;
  74. /// Move assigment operator
  75. /// \param other Tensor to be moved
  76. Tensor &operator=(Tensor &&other) noexcept;
  77. /// Create a numeric tensor with type and shape. Items of the tensor would be uninitialized.
  78. /// \param[in] shape shape of the output tensor
  79. /// \param[in] type type of the output tensor
  80. /// \param[out] out Generated tensor
  81. /// \return Status code
  82. static Status CreateEmpty(const TensorShape &shape, const DataType &type, TensorPtr *out);
  83. /// Create a numeric tensor from a pointer in memory. Length of the source data is determined from the shape and type.
  84. /// Data will be copied into the new created tensor.
  85. /// \param[in] shape shape of the output tensor
  86. /// \param[in] type type of the output tensor
  87. /// \param[in] src pointer to the source data
  88. /// \param[out] out Generated tensor
  89. /// \return Status code
  90. static Status CreateFromMemory(const TensorShape &shape, const DataType &type, const uchar *src, TensorPtr *out);
  91. /// Create a tensor from a pointer in memory and length. Data will be copied into the new created tensor.
  92. /// \param[in] shape shape of the output tensor
  93. /// \param[in] type type of the output tensor
  94. /// \param[in] src pointer to the source data
  95. /// \param[in] length length of the src data
  96. /// \param[out] out Generated tensor
  97. /// \return Status code
  98. static Status CreateFromMemory(const TensorShape &shape, const DataType &type, const uchar *src,
  99. const dsize_t &length, TensorPtr *out);
  100. /// Create a copy of the input tensor
  101. /// \param[in] in original tensor to be copied
  102. /// \param[out] out output tensor to be generated
  103. /// \return Status
  104. static Status CreateFromTensor(const TensorPtr &in, TensorPtr *out) {
  105. return CreateFromMemory(in->shape(), in->type(), in->GetBuffer(), in->SizeInBytes(), out);
  106. }
  107. #ifdef ENABLE_PYTHON
  108. /// Create a Tensor from a given py::array
  109. /// \param[in] arr py::array
  110. /// \param[out] out Created tensor
  111. /// \return Status Code
  112. static Status CreateFromNpArray(const py::array &arr, TensorPtr *out);
  113. #endif
  114. #ifndef ENABLE_ANDROID
  115. /// Create a tensor of type DE_STRING from a BytesList.
  116. /// \param[in] bytes_list protobuf's Bytelist
  117. /// \param[in] shape shape of the outout tensor
  118. /// \param[out] out created Tensor
  119. /// \return Status Code
  120. static Status CreateFromByteList(const dataengine::BytesList &bytes_list, const TensorShape &shape, TensorPtr *out);
  121. /// Create a tensor of type UINT8 or INT8 from a BytesList.
  122. /// The tensor will be padded with ' ' to reach the required pad_size.
  123. /// \param[in] bytes_list protobuf's Bytelist
  124. /// \param[in] shape shape of the output tensor
  125. /// \param[in] type type of created tensor. Should be DE_UINT8 or INT8
  126. /// \param[in] pad_size The size of the tensor after padding
  127. /// \param[out] out created Tensor
  128. /// \return Status Code
  129. static Status CreateFromByteList(const dataengine::BytesList &bytes_list, const TensorShape &shape,
  130. const DataType &type, dsize_t pad_size, TensorPtr *out);
  131. #endif
  132. /// Create a Tensor from a given list of values.
  133. /// \tparam type of the values to be inserted.
  134. /// \param[in] items elements of the tensor
  135. /// \param[in] shape shape of the output tensor
  136. /// \param[out] out output argument to hold the created Tensor
  137. /// \return Status Code
  138. template <typename T>
  139. static Status CreateFromVector(const std::vector<T> &items, const TensorShape &shape, TensorPtr *out) {
  140. CHECK_FAIL_RETURN_UNEXPECTED(
  141. items.size() == shape.NumOfElements(),
  142. "Number of elements in the vector does not match the number of elements of the shape required");
  143. DataType type = DataType::FromCType<T>();
  144. // if items is empty, items_ptr would be nullptr. CreateFromMemory will handle this case.
  145. auto items_ptr = reinterpret_cast<const uchar *>(&items[0]);
  146. return CreateFromMemory(shape, type, items_ptr, out);
  147. }
  148. /// Create a 1D Tensor from a given list of values.
  149. /// \tparam type of the values to be inserted.
  150. /// \param[in] items elements of the tensor
  151. /// \param[out] out output argument to hold the created Tensor
  152. /// \return Status Code
  153. template <typename T>
  154. static Status CreateFromVector(const std::vector<T> &items, TensorPtr *out) {
  155. return CreateFromVector(items, TensorShape({static_cast<dsize_t>(items.size())}), out);
  156. }
  157. /// Create a numeric scalar Tensor from the given value.
  158. /// \tparam T type of value
  159. /// \param[in] item value
  160. /// \param[out] out Created tensor
  161. /// \return Status code
  162. template <typename T>
  163. static Status CreateScalar(const T &item, TensorPtr *out) {
  164. DataType type = DataType::FromCType<T>();
  165. auto item_ptr = reinterpret_cast<const uchar *>(&item);
  166. return CreateFromMemory(TensorShape::CreateScalar(), type, item_ptr, out);
  167. }
  168. /// Create a tensor from a binary file on disk.
  169. /// \param[in] path file to be read
  170. /// \param[out] out Created Tensor
  171. /// \return Status code
  172. static Status CreateFromFile(const std::string &path, TensorPtr *out);
  173. /// Destruct the tensor and release the memory using the allocator
  174. virtual ~Tensor();
  175. /// Equality operator. compares tensor shape, type and data
  176. /// \param[in] rhs Tensor to be compared with
  177. /// \return bool
  178. bool operator==(const Tensor &rhs) const;
  179. bool operator!=(const Tensor &rhs) const { return !((*this) == rhs); }
  180. /// Get item located at `index`, caller needs to provide the type.
  181. /// \tparam T
  182. /// \param[in] index vector<dsize_t>
  183. /// \return return the item specified at index
  184. template <typename T>
  185. Status GetItemAt(T *o, const std::vector<dsize_t> &index) const;
  186. /// Get string located at `index`.
  187. /// \param[in] index vector<dsize_t>
  188. /// \return return std::string_view specified at index
  189. Status GetItemAt(std::string_view *o, const std::vector<dsize_t> &index) const;
  190. template <typename T>
  191. Status GetUnsignedIntAt(T *o, const std::vector<dsize_t> &index) const;
  192. template <typename T>
  193. Status GetSignedIntAt(T *o, const std::vector<dsize_t> &index) const;
  194. template <typename T>
  195. Status GetFloatAt(T *o, const std::vector<dsize_t> &index) const;
  196. /// set item at location specified by index
  197. /// \tparam `T`
  198. /// \param[in] index
  199. /// \param[in] value of type `T`
  200. template <typename T>
  201. Status SetItemAt(const std::vector<dsize_t> &index, const T &value) {
  202. T *ptr = nullptr;
  203. RETURN_IF_NOT_OK(GetItemPtr<T>(&ptr, index));
  204. *ptr = value;
  205. return Status::OK();
  206. }
  207. /// set string item at location specified by index
  208. /// \param[in] index
  209. /// \param[in] value of type std::string
  210. Status SetItemAt(const std::vector<dsize_t> &index, const std::string &value) {
  211. RETURN_UNEXPECTED_IF_NULL(data_);
  212. uchar *ptr = nullptr;
  213. offset_t length = 0;
  214. RETURN_IF_NOT_OK(GetItemPtr(&ptr, index, &length));
  215. if (value.length() != length) {
  216. RETURN_STATUS_UNEXPECTED("Length of the new string does not match the item.");
  217. }
  218. memcpy_s(reinterpret_cast<char *>(ptr), length, value.c_str(), length);
  219. return Status::OK();
  220. }
  221. /// fill tensor with Zeros. Does not support strings.
  222. Status Zero() {
  223. CHECK_FAIL_RETURN_UNEXPECTED(type_ != DataType::DE_STRING, "Cannot use Zero on tensor of strings..");
  224. dsize_t size = SizeInBytes();
  225. CHECK_FAIL_RETURN_UNEXPECTED(memset_sp(GetMutableBuffer(), size, 0, size) == 0,
  226. "Failed to fill tensor with zeroes.");
  227. return Status::OK();
  228. }
  229. /// Fill all elements in the Tensor with the given value of type `T`. Does not support strings.
  230. /// \tparam T
  231. /// \param value[in]
  232. template <typename T>
  233. Status Fill(const T &value) {
  234. CHECK_FAIL_RETURN_UNEXPECTED(type_ != DataType::DE_STRING, "Cannot use fill on tensor of strings.");
  235. int64_t cellSize = type_.SizeInBytes();
  236. if ((data_ != nullptr) && type_.IsCompatible<T>()) {
  237. for (dsize_t i = 0; i < Size(); i++) {
  238. CHECK_FAIL_RETURN_UNEXPECTED(memcpy_s((data_ + i * cellSize), cellSize, &value, cellSize) == 0, "memcpy err");
  239. }
  240. return Status::OK();
  241. } else {
  242. std::string err;
  243. err += (data_ == nullptr) ? "data_ is nullptr \t" : "";
  244. err += type_.IsCompatible<T>() ? "data type not compatible\t" : "";
  245. return Status(StatusCode::kUnexpectedError, err);
  246. }
  247. }
  248. /// Getter function for shape
  249. /// \return
  250. const TensorShape &shape() const { return shape_; }
  251. /// Check if tensor has data
  252. /// \return bool - true if tensor is empty
  253. bool HasData() const { return data_ != nullptr; }
  254. /// Reshape the tensor. The given shape should have the same number of elements in the Tensor
  255. /// \param shape
  256. virtual Status Reshape(const TensorShape &shape);
  257. /// \return number of elements in this tensor
  258. dsize_t Size() const { return shape().NumOfElements(); }
  259. /// \return the number of bytes this tensor is needs
  260. dsize_t SizeInBytes() const {
  261. if (data_end_ == nullptr) return type_.SizeInBytes() * shape_.NumOfElements();
  262. return data_end_ - data_;
  263. }
  264. /// \return the rank of the tensor
  265. dsize_t Rank() const { return shape().Rank(); }
  266. /// Get the starting memory address as a constant for the data of the tensor. This potentially
  267. /// drives an allocation if the data area.
  268. /// \return const unsigned char*
  269. const unsigned char *GetBuffer() const { return data_; }
  270. /// Getter of the type
  271. /// \return
  272. DataType type() const { return type_; }
  273. /// Provide stream operator for displaying it
  274. /// \param output stream
  275. /// \param so the Tensor object to be printed
  276. /// \return output stream
  277. friend std::ostream &operator<<(std::ostream &out, const Tensor &so) {
  278. so.Print(out);
  279. return out;
  280. }
  281. /// Invalidate this Tensor by setting the type and shape to unknown and MData to null.
  282. /// Calling this method will make the Tensor and its data inaccessible, use it with caution.
  283. void Invalidate();
  284. /// Copy input tensor into self at the location index.
  285. /// Index is a vector of axises which can be incomplete:
  286. /// Ex: shape <2,3>, inserting into index {0} will replace the first row. index {1,2} will replace the last cell.
  287. /// \param index
  288. /// \param input
  289. /// \param partial_insert: boolean to determine if insertion along the full axis is enforced
  290. /// \return Status code
  291. Status InsertTensor(const std::vector<dsize_t> &index, const std::shared_ptr<Tensor> &input,
  292. const bool partial_insert = false);
  293. /// Find the address of the given index. Used in InsertTensor.
  294. /// Example:
  295. /// Tensor t= [[1,2],[3,4]] , StartAddrOfIndex({0}) -> &1
  296. /// \param index incomplete index
  297. /// \param output: startAddrofIndex
  298. /// \param output: remaining
  299. /// \return Status code
  300. Status StartAddrOfIndex(std::vector<dsize_t> ind, uchar **start_addr_of_index, TensorShape *remaining);
  301. /// Expand the shape of the Tensor with one extra dimension.
  302. /// For example, if the shape is <512,512,3>:
  303. /// *- ExpandDim(0) gives: <1,512,512,3>
  304. /// *- ExpandDim(1) gives: <512,1,512,3>
  305. /// *- ExpandDim(3) gives: <512,512,3,1>
  306. /// \param axis location of the dim
  307. virtual Status ExpandDim(const dsize_t &axis);
  308. virtual void Squeeze();
  309. /// Calculates the strides of the Tensor
  310. /// Ex: Tensor of shape <4,2,2> and type DE_UINT8 (1 byte)
  311. /// The strides will be {6,2,1}.
  312. /// Ex: Tensor of shape <4,2,2> and type DE_UINT32 (4 byte)
  313. /// The strides will be {24,8,4}.
  314. /// \return vector of integers
  315. std::vector<dsize_t> Strides() const;
  316. std::string ToString() {
  317. std::stringstream ss;
  318. this->Print(ss);
  319. return ss.str();
  320. }
  321. /// Handle negative indices.
  322. static inline dsize_t HandleNeg(dsize_t index, dsize_t length) { return (index < 0) ? (index + length) : index; }
  323. /// Slice tensor bases on the given indicies. Copy the sliced data into out tensor. Only rank1 tensors are supported.
  324. /// Based on the type of tensor, SliceNumeric or SliceString will be called
  325. /// \param[out] out Tensor
  326. /// \param[in] indices vector of indices
  327. /// \return Status error code
  328. Status Slice(TensorPtr *out, const std::vector<dsize_t> &indices);
  329. /// Slice numeric tensors.
  330. Status SliceNumeric(TensorPtr *out, const std::vector<dsize_t> &indices);
  331. /// Slice string tensors
  332. Status SliceString(TensorPtr *out, const std::vector<dsize_t> &indices);
  333. #ifdef ENABLE_PYTHON
  334. /// Constructs numpy array from input tensor
  335. /// \param[in] data this data is the location of python data
  336. /// \return Status code
  337. Status GetDataAsNumpy(py::array *data);
  338. Status GetDataAsNumpyStrings(py::array *data);
  339. static Status GetBufferInfo(Tensor *t, py::buffer_info *out);
  340. #endif
  341. /// TensorIterator is a linear iterator that can be used to iterate over the elements of the Tensor
  342. /// The order elements is as the memory layout (i.e., row-major) [[1,2,3],[4,5,6] --> 1,2,3,4,5,6
  343. /// \tparam T type of values in the Tensor Iterator
  344. template <typename T, bool = true>
  345. class TensorIterator {
  346. public:
  347. using iterator_category = std::random_access_iterator_tag;
  348. using value_type = T;
  349. using difference_type = ptrdiff_t;
  350. using pointer = T *;
  351. using reference = T &;
  352. explicit TensorIterator(uchar *ptr = nullptr) { ptr_ = reinterpret_cast<T *>(ptr); }
  353. TensorIterator(const TensorIterator<T> &raw_iterator) { ptr_ = raw_iterator.ptr_; }
  354. ~TensorIterator() = default;
  355. TensorIterator<T> &operator=(const TensorIterator<T> &rhs) {
  356. ptr_ = rhs.ptr_;
  357. return *this;
  358. }
  359. TensorIterator<T> &operator=(T *rhs) {
  360. ptr_ = rhs;
  361. return *this;
  362. }
  363. bool operator==(const TensorIterator<T> &rhs) { return ptr_ == rhs.ptr_; }
  364. bool operator!=(const TensorIterator<T> &rhs) { return !(*this == rhs); }
  365. operator bool() const { return ptr_ != nullptr; }
  366. T &operator*() { return *ptr_; }
  367. const T &operator*() const { return *ptr_; }
  368. T *operator->() { return ptr_; }
  369. TensorIterator<T> &operator+=(const ptrdiff_t &inc) {
  370. ptr_ += inc;
  371. return *this;
  372. }
  373. TensorIterator<T> &operator-=(const ptrdiff_t &inc) {
  374. ptr_ -= inc;
  375. return *this;
  376. }
  377. TensorIterator<T> &operator++() {
  378. ++ptr_;
  379. return *this;
  380. }
  381. TensorIterator<T> &operator--() {
  382. --ptr_;
  383. return *this;
  384. }
  385. TensorIterator<T> operator++(int) {
  386. auto temp(*this);
  387. ++ptr_;
  388. return temp;
  389. }
  390. TensorIterator<T> operator--(int) {
  391. auto temp(*this);
  392. --ptr_;
  393. return temp;
  394. }
  395. TensorIterator<T> operator+(const ptrdiff_t &inc) {
  396. auto oldPtr = ptr_;
  397. ptr_ += inc;
  398. auto temp(*this);
  399. ptr_ = oldPtr;
  400. return temp;
  401. }
  402. TensorIterator<T> operator-(const ptrdiff_t &inc) {
  403. auto oldPtr = ptr_;
  404. ptr_ -= inc;
  405. auto temp(*this);
  406. ptr_ = oldPtr;
  407. return temp;
  408. }
  409. protected:
  410. T *ptr_;
  411. };
  412. // Specialization of TensorIterator for strings. It returns std::string_view for every item.
  413. // \tparam DUMMY, used to mbe able to specialize the inner class
  414. template <bool DUMMY>
  415. class TensorIterator<std::string_view, DUMMY> {
  416. public:
  417. using iterator_category = std::random_access_iterator_tag;
  418. using value_type = std::string_view;
  419. using difference_type = ptrdiff_t;
  420. using pointer = std::string_view *;
  421. using reference = std::string_view &;
  422. explicit TensorIterator(uchar *data = nullptr, dsize_t index = 0) {
  423. data_ = reinterpret_cast<const char *>(data);
  424. index_ = index;
  425. }
  426. TensorIterator(const TensorIterator<std::string_view, DUMMY> &raw_iterator) {
  427. data_ = raw_iterator.data_;
  428. index_ = raw_iterator.index_;
  429. }
  430. ~TensorIterator() = default;
  431. bool operator==(const TensorIterator<std::string_view> &rhs) { return data_ == rhs.data_ && index_ == rhs.index_; }
  432. bool operator!=(const TensorIterator<std::string_view> &rhs) { return !(*this == rhs); }
  433. operator bool() const { return data_ != nullptr; }
  434. std::string_view operator*() const {
  435. auto offset_ = reinterpret_cast<const offset_t *>(data_);
  436. offset_t start = offset_[index_];
  437. return std::string_view{data_ + start};
  438. }
  439. TensorIterator<std::string_view> &operator+=(const dsize_t &inc) {
  440. index_ += inc;
  441. return *this;
  442. }
  443. TensorIterator<std::string_view> &operator-=(const dsize_t &inc) {
  444. index_ -= inc;
  445. return *this;
  446. }
  447. TensorIterator<std::string_view> &operator++() {
  448. ++index_;
  449. return *this;
  450. }
  451. TensorIterator<std::string_view> &operator--() {
  452. --index_;
  453. return *this;
  454. }
  455. TensorIterator<std::string_view> operator++(int) {
  456. auto temp(*this);
  457. ++index_;
  458. return temp;
  459. }
  460. TensorIterator<std::string_view> operator--(int) {
  461. auto temp(*this);
  462. --index_;
  463. return temp;
  464. }
  465. TensorIterator<std::string_view> operator+(const dsize_t &inc) {
  466. auto oldPtr = index_;
  467. index_ += inc;
  468. auto temp(*this);
  469. index_ = oldPtr;
  470. return temp;
  471. }
  472. TensorIterator<std::string_view> operator-(const dsize_t &inc) {
  473. auto oldPtr = index_;
  474. index_ -= inc;
  475. auto temp(*this);
  476. index_ = oldPtr;
  477. return temp;
  478. }
  479. protected:
  480. dsize_t index_;
  481. const char *data_;
  482. };
  483. /// Return a TensorIterator that points to the start of the Tensor.
  484. /// It's the user responsibility to use the correct type that matches the Tensor type
  485. /// \tparam T The type of values in the Tensor
  486. /// \return TensorIterator
  487. template <typename T>
  488. TensorIterator<T> begin() {
  489. return TensorIterator<T>(data_);
  490. }
  491. /// Return a linear iterator that points to the place after the last element of the Tensor.
  492. /// \tparam T The type of values in the Tensor
  493. /// \return TensorIterator
  494. template <typename T>
  495. TensorIterator<T> end() {
  496. return TensorIterator<T>(data_end_);
  497. }
  498. /// Copies the last dimension at `index` from Tensor `src` to this Tensor.
  499. /// \param[in] src Tensor
  500. /// \param[in] index vector to the start of the dimension. The last dim should be 0
  501. /// \return Status
  502. Status CopyLastDimAt(const std::shared_ptr<Tensor> &src, const std::vector<dsize_t> &index);
  503. protected:
  504. /// Allocate memory for the tensor using the data_allocator
  505. /// \param[in] length number of bytes to be allocated
  506. /// \return Error Status
  507. Status AllocateBuffer(const dsize_t &length);
  508. /// Get the starting memory address for the data of the tensor. This potentially
  509. /// drives an allocation if the data is null.
  510. /// \return unsigned char*
  511. unsigned char *GetMutableBuffer() { return data_; }
  512. /// A function that prints Tensor recursively, first called by print
  513. /// \param[in] out
  514. /// \param[in] cur_dim
  515. /// \param[in] cur_index
  516. void PrintRecursive(std::ostream &out, int32_t cur_dim, const std::vector<dsize_t> &cur_index) const;
  517. /// A function that prints info about the tensor
  518. /// \param[out] out output stream
  519. void Print(std::ostream &out) const;
  520. /// A function that print the value as specified by its index
  521. /// \param[in] index vector representing the index
  522. /// \param[out] out
  523. void PrintItemAt(const std::vector<dsize_t> &index, std::ostream &out) const;
  524. /// Get pointer to item located at `index`, caller needs to provide the type.
  525. /// \tparam T
  526. /// \param[in] index vector<dsize_t>
  527. /// \return return a pointer to the item specified at index of type `T`
  528. template <typename T>
  529. Status GetItemPtr(T **, const std::vector<dsize_t> &index) const;
  530. /// Get pointer to string located at `index` and the length of string
  531. /// \param[in] index vector<dsize_t>
  532. /// \return return a pointer to the string specified at index and the length of the string
  533. Status GetItemPtr(uchar **, const std::vector<dsize_t> &index, offset_t *length = nullptr) const;
  534. /// Given a flat index of an item string, return the start and length of the item
  535. /// \param[in] index flat index of the item
  536. /// \param[out] start address of the ths string
  537. /// \param[out] length of the string
  538. Status GetStringAt(dsize_t index, uchar **string_start, offset_t *length) const;
  539. /// Skip the offsets and returns the start of the buffer where the real strings is stored. Caller needs to check if
  540. /// the tensor's type is a string, otherwise undefined address would be returned. \return address of the first string
  541. /// of the tensor.
  542. uchar *GetStringsBuffer() const { return data_ + kOffsetSize * shape_.NumOfElements() + kOffsetSize; }
  543. /// all access to shape_ should be via shape
  544. TensorShape shape_;
  545. /// data type of tensor
  546. DataType type_;
  547. /// pointer to the start of the physical data
  548. unsigned char *data_;
  549. /// An allocator for data_
  550. CharAllocPtr data_allocator_;
  551. /// pointer to the end of the physical data
  552. unsigned char *data_end_ = nullptr;
  553. private:
  554. #ifdef ENABLE_ANDROID
  555. friend class tensor::DETensor;
  556. #endif
  557. /// Copy raw data of a array based on shape and strides to the destination pointer
  558. /// \param dst [out] Pointer to the destination array where the content is to be copied
  559. /// \param[in] src Pointer to the source of strided array to be copied
  560. /// \param[in] shape shape of the source array
  561. /// \param[in] strides strides of the source array
  562. /// \param[in] type_size number of bytes needed to store one array element's type
  563. /// \return Status Code
  564. static Status CopyStridedArray(unsigned char *dst, unsigned char *src, std::vector<dsize_t> shape,
  565. std::vector<dsize_t> strides, uint8_t type_size);
  566. /// const of the size of the offset variable
  567. static constexpr uint8_t kOffsetSize = sizeof(offset_t);
  568. #ifdef ENABLE_PYTHON
  569. /// Helper function to create a tensor from Numpy array of strings
  570. /// \param[in] arr Numpy array
  571. /// \param[out] out Created Tensor
  572. /// \return Status
  573. static Status CreateFromNpString(py::array arr, TensorPtr *out);
  574. #endif
  575. };
/// Specialization of end() for string tensors. The string iterator walks the offset
/// array by element index (not by raw pointer), so the past-the-end iterator is the
/// one whose index equals the number of elements in the tensor.
template <>
inline Tensor::TensorIterator<std::string_view> Tensor::end<std::string_view>() {
  return TensorIterator<std::string_view>(data_, shape_.NumOfElements());
}
  580. /// Create a Tensor from a given list of strings.
  581. /// @note: The memory layout of a Tensor of strings consists of the Offset_array followed by the strings.
  582. /// The offset array will store one extra value to find the length of the last string.
  583. /// OFFSET_1, OFFSET_2, ..., OFFSET_n+1, STRING_1, STRING_2, ..., STRING_n
  584. /// The value of each offset is the start index of the corresponding string
  585. /// Offsets is of type offset_t
  586. /// strings will ne null-terminated
  587. /// example: Tensor(['abc', 'de'], shape={2}, type=DE_STRING)
  588. /// |----------------------------------------------------------------|
  589. /// | OFFSET ARRAY | STRINGS |
  590. /// | bytes 0-3 | bytes 3-6 | bytes 7-10 | bytes 11-14 | bytes 15-17 |
  591. /// | 11 | 15 | 18 | abc\0 | de\0 |
  592. /// |----------------------------------------------------------------|
  593. /// \param[in] items elements of the tensor
  594. /// \param[in] shape shape of the output tensor
  595. /// \param[out] out output argument to hold the created Tensor
  596. /// \return Status Code
  597. template <>
  598. inline Status Tensor::CreateFromVector<std::string>(const std::vector<std::string> &items, const TensorShape &shape,
  599. TensorPtr *out) {
  600. CHECK_FAIL_RETURN_UNEXPECTED(
  601. items.size() == shape.NumOfElements(),
  602. "Number of elements in the vector does not match the number of elements of the shape required");
  603. const TensorAlloc *alloc = GlobalContext::Instance()->tensor_allocator();
  604. *out = std::allocate_shared<Tensor>(*alloc, TensorShape({static_cast<dsize_t>(items.size())}),
  605. DataType(DataType::DE_STRING));
  606. if (items.size() == 0) {
  607. if (shape.known()) {
  608. return (*out)->Reshape(shape);
  609. }
  610. }
  611. auto length_sum = [](dsize_t sum, const std::string &s) { return s.length() + sum; };
  612. dsize_t total_length = std::accumulate(items.begin(), items.end(), 0, length_sum);
  613. // total bytes needed = offset array + strings
  614. // offset array needs to store one offset var per element + 1 extra to get the length of the last string.
  615. // strings will be null-terminated --> need 1 extra byte per element
  616. dsize_t num_bytes = (kOffsetSize + 1) * (*out)->shape_.NumOfElements() + kOffsetSize + total_length;
  617. (*out)->AllocateBuffer(num_bytes);
  618. auto offset_arr = reinterpret_cast<offset_t *>((*out)->data_);
  619. uchar *buf = (*out)->GetStringsBuffer();
  620. offset_t offset = buf - (*out)->data_; // the first string will start here
  621. uint32_t i = 0;
  622. for (const auto &str : items) {
  623. // insert the start index of the string.
  624. offset_arr[i++] = offset;
  625. // total bytes are reduced by kOffsetSize
  626. num_bytes -= kOffsetSize;
  627. // insert actual string
  628. int ret_code = memcpy_s((*out)->data_ + offset, num_bytes, common::SafeCStr(str), str.length() + 1);
  629. if (ret_code != 0) MS_LOG(ERROR) << "Cannot copy string into Tensor";
  630. // next string will be stored right after the current one.
  631. offset = offset + str.length() + 1;
  632. // total bytes are reduced by the length of the string
  633. num_bytes -= str.length() + 1;
  634. }
  635. // store one more offset value so we can get the length of the last string
  636. // length[last_element] = offset_arr[last_element + 1] - offset_arr[last_element]
  637. offset_arr[i] = offset;
  638. (*out)->data_end_ = (*out)->data_ + offset_arr[i];
  639. MS_ASSERT(num_bytes == 0);
  640. if (shape.known()) {
  641. RETURN_IF_NOT_OK((*out)->Reshape(shape));
  642. }
  643. return Status::OK();
  644. }
  645. /// Create a string scalar Tensor from the given value.
  646. /// \param[in] item value
  647. /// \param[out] out Created tensor
  648. /// \return Status code
  649. template <>
  650. inline Status Tensor::CreateScalar<std::string>(const std::string &item, TensorPtr *out) {
  651. return CreateFromVector<std::string>({item}, TensorShape::CreateScalar(), out);
  652. }
  653. } // namespace dataset
  654. } // namespace mindspore
  655. #endif // MINDSPORE_CCSRC_MINDDATA_DATASET_CORE_TENSOR_H_