You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; can include dashes ('-'); and can be up to 35 characters long.

data_utils.cc 14 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346
  1. /**
  2. * Copyright 2019 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "dataset/kernels/data/data_utils.h"
  17. #include <algorithm>
  18. #include <string>
  19. #include <vector>
  20. #include "dataset/core/constants.h"
  21. #include "dataset/core/tensor.h"
  22. #include "dataset/core/tensor_shape.h"
  23. #include "dataset/core/data_type.h"
  24. #include "dataset/core/pybind_support.h"
  25. namespace mindspore {
  26. namespace dataset {
  27. Status OneHotEncodingUnsigned(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output,
  28. dsize_t num_classes, int64_t index) {
  29. uint64_t class_idx;
  30. if (input->Rank() == 0) {
  31. RETURN_IF_NOT_OK(input->GetItemAt<uint64_t>(&class_idx, {}));
  32. } else {
  33. RETURN_IF_NOT_OK(input->GetItemAt<uint64_t>(&class_idx, {index}));
  34. }
  35. if (class_idx >= static_cast<uint64_t>(num_classes)) {
  36. RETURN_STATUS_UNEXPECTED("One_hot index values are not in range");
  37. }
  38. if (input->type() == DataType::DE_UINT64) {
  39. RETURN_IF_NOT_OK((*output)->SetItemAt<uint64_t>({index, static_cast<dsize_t>(class_idx)}, 1));
  40. } else if (input->type() == DataType::DE_UINT32) {
  41. RETURN_IF_NOT_OK((*output)->SetItemAt<uint32_t>({index, static_cast<dsize_t>(class_idx)}, 1));
  42. } else if (input->type() == DataType::DE_UINT16) {
  43. RETURN_IF_NOT_OK((*output)->SetItemAt<uint16_t>({index, static_cast<dsize_t>(class_idx)}, 1));
  44. } else if (input->type() == DataType::DE_UINT8) {
  45. RETURN_IF_NOT_OK((*output)->SetItemAt<uint8_t>({index, static_cast<dsize_t>(class_idx)}, 1));
  46. } else {
  47. RETURN_STATUS_UNEXPECTED("One hot unsigned only supports unsigned int as input.");
  48. }
  49. return Status::OK();
  50. }
  51. Status OneHotEncodingSigned(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, dsize_t num_classes,
  52. int64_t index) {
  53. int64_t class_idx;
  54. if (input->Rank() == 0) {
  55. RETURN_IF_NOT_OK(input->GetItemAt<int64_t>(&class_idx, {}));
  56. } else {
  57. RETURN_IF_NOT_OK(input->GetItemAt<int64_t>(&class_idx, {index}));
  58. }
  59. if (class_idx >= static_cast<int64_t>(num_classes)) {
  60. RETURN_STATUS_UNEXPECTED("One_hot index values are not in range");
  61. }
  62. if (input->type() == DataType::DE_INT64) {
  63. RETURN_IF_NOT_OK((*output)->SetItemAt<int64_t>({index, static_cast<dsize_t>(class_idx)}, 1));
  64. } else if (input->type() == DataType::DE_INT32) {
  65. RETURN_IF_NOT_OK((*output)->SetItemAt<int32_t>({index, static_cast<dsize_t>(class_idx)}, 1));
  66. } else if (input->type() == DataType::DE_INT16) {
  67. RETURN_IF_NOT_OK((*output)->SetItemAt<int16_t>({index, static_cast<dsize_t>(class_idx)}, 1));
  68. } else if (input->type() == DataType::DE_INT8) {
  69. RETURN_IF_NOT_OK((*output)->SetItemAt<int8_t>({index, static_cast<dsize_t>(class_idx)}, 1));
  70. } else {
  71. RETURN_STATUS_UNEXPECTED("One hot signed only supports signed int as input.");
  72. }
  73. return Status::OK();
  74. }
// One-hot encodes a scalar or 1D integer tensor into a (num_elements, num_classes)
// tensor of the same integer type, with a 1 at each row's label column and 0 elsewhere.
// Note: Squeeze() mutates the input tensor's shape in place before validation.
Status OneHotEncoding(std::shared_ptr<Tensor> input, std::shared_ptr<Tensor> *output, dsize_t num_classes) {
  input->Squeeze();
  if (input->Rank() > 1) {  // We expect the input to be in the first dimension
    RETURN_STATUS_UNEXPECTED("One hot only supports scalars or 1D shape Tensors.");
  }
  if (!input->type().IsInt()) {
    RETURN_STATUS_UNEXPECTED("One hot does not support input of this type.");
  }
  try {
    // Rank-0 input is treated as a single element; 1D input contributes one row per element.
    dsize_t num_elements = 1;
    if (input->Rank() == 1) num_elements = input->shape()[0];
    TensorShape out_shape({num_elements, num_classes});
    std::shared_ptr<Tensor> out;
    RETURN_IF_NOT_OK(Tensor::CreateTensor(&out, TensorImpl::kFlexible, out_shape, input->type()));
    // Start from all zeros; the per-row helpers only set the single 1 per row.
    RETURN_IF_NOT_OK(out->Zero());
    for (dsize_t i = 0; i < num_elements; ++i) {
      if (input->type().IsUnsignedInt()) {
        RETURN_IF_NOT_OK(OneHotEncodingUnsigned(input, &out, num_classes, i));
      } else {
        RETURN_IF_NOT_OK(OneHotEncodingSigned(input, &out, num_classes, i));
      }
    }
    // Drop the leading dimension again for scalar input (shape (1, num_classes) -> (num_classes)).
    out->Squeeze();
    *output = out;
    return Status::OK();
  } catch (const std::exception &e) {
    RETURN_STATUS_UNEXPECTED("Unexpected error in OneHotOp");
  }
}
  104. template <typename FROM, typename TO>
  105. void Cast(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
  106. auto in_itr = input->begin<FROM>();
  107. auto out_itr = (*output)->begin<TO>();
  108. auto out_end = (*output)->end<TO>();
  109. for (; out_itr != out_end; static_cast<void>(in_itr++), static_cast<void>(out_itr++))
  110. *out_itr = static_cast<TO>(*in_itr);
  111. }
// Dispatches on the *output* tensor's dtype and runs Cast<T, TO> for the matching
// destination type TO. T is the (already known) source element type.
template <typename T>
void CastFrom(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
  switch ((*output)->type().value()) {
    case DataType::DE_BOOL:
      Cast<T, bool>(input, output);
      break;
    case DataType::DE_INT8:
      Cast<T, int8_t>(input, output);
      break;
    case DataType::DE_UINT8:
      Cast<T, uint8_t>(input, output);
      break;
    case DataType::DE_INT16:
      Cast<T, int16_t>(input, output);
      break;
    case DataType::DE_UINT16:
      Cast<T, uint16_t>(input, output);
      break;
    case DataType::DE_INT32:
      Cast<T, int32_t>(input, output);
      break;
    case DataType::DE_UINT32:
      Cast<T, uint32_t>(input, output);
      break;
    case DataType::DE_INT64:
      Cast<T, int64_t>(input, output);
      break;
    case DataType::DE_UINT64:
      Cast<T, uint64_t>(input, output);
      break;
    case DataType::DE_FLOAT16:
      Cast<T, float16>(input, output);
      break;
    case DataType::DE_FLOAT32:
      Cast<T, float>(input, output);
      break;
    case DataType::DE_FLOAT64:
      Cast<T, double>(input, output);
      break;
    case DataType::DE_UNKNOWN:
      // This function returns void, so an unknown dtype can only be logged here;
      // TypeCast() below rejects DE_UNKNOWN inputs with a Status before reaching this.
      MS_LOG(ERROR) << "Unknown data type.";
      break;
  }
}
// Type cast operator: creates *output with the input's shape and the requested
// dtype, then converts every element via the CastFrom/Cast helpers above.
// Returns a non-OK Status if the input dtype is DE_UNKNOWN.
Status TypeCast(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, const DataType &data_type) {
  RETURN_IF_NOT_OK(Tensor::CreateTensor(output, TensorImpl::kFlexible, input->shape(), data_type));
  // Ensure the destination buffer exists before Cast iterates over it.
  RETURN_IF_NOT_OK((*output)->AllocateBuffer((*output)->SizeInBytes()));
  // Dispatch on the *source* dtype; CastFrom then dispatches on the destination dtype.
  switch (input->type().value()) {
    case DataType::DE_BOOL:
      CastFrom<bool>(input, output);
      break;
    case DataType::DE_INT8:
      CastFrom<int8_t>(input, output);
      break;
    case DataType::DE_UINT8:
      CastFrom<uint8_t>(input, output);
      break;
    case DataType::DE_INT16:
      CastFrom<int16_t>(input, output);
      break;
    case DataType::DE_UINT16:
      CastFrom<uint16_t>(input, output);
      break;
    case DataType::DE_INT32:
      CastFrom<int32_t>(input, output);
      break;
    case DataType::DE_UINT32:
      CastFrom<uint32_t>(input, output);
      break;
    case DataType::DE_INT64:
      CastFrom<int64_t>(input, output);
      break;
    case DataType::DE_UINT64:
      CastFrom<uint64_t>(input, output);
      break;
    case DataType::DE_FLOAT16:
      CastFrom<float16>(input, output);
      break;
    case DataType::DE_FLOAT32:
      CastFrom<float>(input, output);
      break;
    case DataType::DE_FLOAT64:
      CastFrom<double>(input, output);
      break;
    case DataType::DE_UNKNOWN:
      // sanity check, unreachable code.
      RETURN_STATUS_UNEXPECTED("TypeCast does not support input of this type.");
  }
  return Status::OK();
}
  203. Status ToFloat16(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
  204. // initiate new tensor for type cast
  205. DataType new_type = DataType("float16");
  206. RETURN_IF_NOT_OK(Tensor::CreateTensor(output, TensorImpl::kFlexible, input->shape(), new_type));
  207. RETURN_IF_NOT_OK((*output)->AllocateBuffer((*output)->SizeInBytes()));
  208. auto in_itr = input->begin<float>();
  209. auto out_itr = (*output)->begin<float16>();
  210. auto out_end = (*output)->end<float16>();
  211. for (; out_itr != out_end; in_itr++, out_itr++) *out_itr = Eigen::half(*in_itr);
  212. return Status::OK();
  213. }
  214. Status PadEnd(const std::shared_ptr<Tensor> &src, std::shared_ptr<Tensor> *dst, const std::vector<dsize_t> &pad_shape,
  215. const std::shared_ptr<Tensor> &pad_val) {
  216. if (pad_val == nullptr) {
  217. if (src->type().IsNumeric()) {
  218. return PadEndNumeric(src, dst, pad_shape, 0);
  219. } else {
  220. return PadEndString(src, dst, pad_shape, "");
  221. }
  222. }
  223. if (pad_val->type().IsNumeric()) {
  224. float val = 0;
  225. RETURN_IF_NOT_OK(pad_val->GetItemAt<float>(&val, {}));
  226. return PadEndNumeric(src, dst, pad_shape, val);
  227. }
  228. std::string_view val;
  229. RETURN_IF_NOT_OK(pad_val->GetItemAt(&val, {}));
  230. return PadEndString(src, dst, pad_shape, std::string(val));
  231. }
// Pads a numeric tensor to `pad_shape`, filling new cells with `pad_val`
// (converted to the tensor's own dtype), then copying the original data in.
// If no padding is needed, *dst simply aliases src.
Status PadEndNumeric(const std::shared_ptr<Tensor> &src, std::shared_ptr<Tensor> *dst,
                     const std::vector<dsize_t> &pad_shape, float pad_val) {
  CHECK_FAIL_RETURN_UNEXPECTED(src != nullptr && dst != nullptr, "tensor can't be nullptr");
  if (src->Rank() == 0 || src->shape().AsVector() == pad_shape) {
    (*dst) = src;  // if no padding, copy the pointer
  } else {
    CHECK_FAIL_RETURN_UNEXPECTED(src->Rank() == pad_shape.size(), "Pad to diff rank not allowed");
    RETURN_IF_NOT_OK(Tensor::CreateTensor(dst, TensorImpl::kFlexible, TensorShape(pad_shape), src->type()));
    auto tensor_type = src->type().value();
    // Deliberate fast path: a zero fill is dtype-independent (all-zero bytes),
    // so Zero() avoids the per-type Fill dispatch below.
    if (pad_val == 0) {  // if pad with zero, don't care what type it is
      RETURN_IF_NOT_OK((*dst)->Zero());
    } else if (tensor_type == DataType::DE_INT8) {
      RETURN_IF_NOT_OK((*dst)->Fill<int8_t>(pad_val));
    } else if (tensor_type == DataType::DE_BOOL) {
      RETURN_IF_NOT_OK((*dst)->Fill<bool>(pad_val));
    } else if (tensor_type == DataType::DE_UINT8) {
      RETURN_IF_NOT_OK((*dst)->Fill<uint8_t>(pad_val));
    } else if (tensor_type == DataType::DE_INT16) {
      RETURN_IF_NOT_OK((*dst)->Fill<int16_t>(pad_val));
    } else if (tensor_type == DataType::DE_FLOAT16) {
      RETURN_IF_NOT_OK((*dst)->Fill<float16>(static_cast<float16>(pad_val)));
    } else if (tensor_type == DataType::DE_UINT16) {
      RETURN_IF_NOT_OK((*dst)->Fill<uint16_t>(pad_val));
    } else if (tensor_type == DataType::DE_INT32) {
      RETURN_IF_NOT_OK((*dst)->Fill<int32_t>(pad_val));
    } else if (tensor_type == DataType::DE_UINT32) {
      RETURN_IF_NOT_OK((*dst)->Fill<uint32_t>(pad_val));
    } else if (tensor_type == DataType::DE_INT64) {
      RETURN_IF_NOT_OK((*dst)->Fill<int64_t>(pad_val));
    } else if (tensor_type == DataType::DE_UINT64) {
      RETURN_IF_NOT_OK((*dst)->Fill<uint64_t>(pad_val));
    } else if (tensor_type == DataType::DE_FLOAT32) {
      RETURN_IF_NOT_OK((*dst)->Fill<float>(pad_val));
    } else if (tensor_type == DataType::DE_FLOAT64) {
      RETURN_IF_NOT_OK((*dst)->Fill<double>(pad_val));
    } else {
      RETURN_STATUS_UNEXPECTED("Incorrect/Unknown tensor type");
    }
    // Copy the original data on top of the pre-filled destination, row by row.
    std::vector<dsize_t> cur_ind(src->Rank(), 0);
    RETURN_IF_NOT_OK(PadEndNumericHelper(src, *dst, cur_ind, 0));
  }
  return Status::OK();
}
// Recursive helper for PadEndNumeric: walks all dimensions except the last,
// then copies each source row (last dimension) into the same index of dst.
// `cur_ind` is passed by value so each recursion level owns its own index vector.
Status PadEndNumericHelper(const std::shared_ptr<Tensor> &src, std::shared_ptr<Tensor> dst,
                           std::vector<dsize_t> cur_ind, size_t cur_dim) {
  if (cur_dim == src->Rank() - 1) {  // if this is the last dimension, copy the data
    // NOTE(review): the result of CopyLastDimAt is not checked here — confirm it
    // returns void / cannot fail, otherwise wrap it in RETURN_IF_NOT_OK.
    dst->CopyLastDimAt(src, cur_ind);
  } else {  // not the last dimension, keep doing recursion
    // Only iterate indices that exist in both tensors; padded positions were pre-filled.
    dsize_t min_ind = std::min(dst->shape()[cur_dim], src->shape()[cur_dim]);
    for (dsize_t i = 0; i < min_ind; i++) {
      cur_ind[cur_dim] = i;
      RETURN_IF_NOT_OK(PadEndNumericHelper(src, dst, cur_ind, cur_dim + 1));
    }
  }
  return Status::OK();
}
  288. Status PadEndString(const std::shared_ptr<Tensor> &src, std::shared_ptr<Tensor> *dst,
  289. const std::vector<dsize_t> &pad_shape, const std::string &pad_val) {
  290. CHECK_FAIL_RETURN_UNEXPECTED(src != nullptr && dst != nullptr, "tensor can't be nullptr");
  291. if (src->Rank() == 0 || src->shape().AsVector() == pad_shape) {
  292. (*dst) = src; // if no padding, copy the pointer
  293. } else {
  294. CHECK_FAIL_RETURN_UNEXPECTED(src->Rank() == pad_shape.size(), "Pad to diff rank not allowed");
  295. std::vector<dsize_t> cur_ind(src->Rank(), 0);
  296. std::vector<std::string> strings;
  297. RETURN_IF_NOT_OK(PadEndStringHelper(src, &strings, TensorShape(pad_shape), cur_ind, 0, pad_val));
  298. RETURN_IF_NOT_OK(Tensor::CreateTensor(dst, strings, TensorShape(pad_shape)));
  299. }
  300. return Status::OK();
  301. }
  302. Status PadEndStringHelper(const std::shared_ptr<Tensor> &src, std::vector<std::string> *dst,
  303. const TensorShape &dst_shape, std::vector<dsize_t> cur_ind, size_t cur_dim,
  304. const std::string &pad_value) {
  305. if (cur_dim == src->Rank() - 1) { // if this is the last dimension, copy the data
  306. dsize_t min_ind = std::min(dst_shape[cur_dim], src->shape()[cur_dim]);
  307. for (dsize_t i = 0; i < min_ind; i++) {
  308. cur_ind[cur_dim] = i;
  309. std::string_view item;
  310. RETURN_IF_NOT_OK(src->GetItemAt(&item, cur_ind));
  311. dst->emplace_back(item);
  312. }
  313. for (dsize_t i = min_ind; i < dst_shape[cur_dim]; i++) {
  314. dst->emplace_back(pad_value);
  315. }
  316. } else { // not the last dimension, keep doing recursion
  317. dsize_t min_ind = std::min(dst_shape[cur_dim], src->shape()[cur_dim]);
  318. for (dsize_t i = 0; i < min_ind; i++) {
  319. cur_ind[cur_dim] = i;
  320. RETURN_IF_NOT_OK(PadEndStringHelper(src, dst, dst_shape, cur_ind, cur_dim + 1, pad_value));
  321. }
  322. dsize_t count = (dst_shape[cur_dim] - min_ind) * dst_shape.Strides()[cur_dim];
  323. for (dsize_t i = 0; i < count; i++) {
  324. dst->emplace_back(pad_value);
  325. }
  326. }
  327. return Status::OK();
  328. }
  329. } // namespace dataset
  330. } // namespace mindspore