/**
 * Copyright 2019 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "dataset/kernels/data/data_utils.h"

#include <algorithm>
#include <string>
#include <vector>

#include "dataset/core/constants.h"
#include "dataset/core/tensor.h"
#include "dataset/core/tensor_shape.h"
#include "dataset/core/data_type.h"
#include "dataset/core/pybind_support.h"

namespace mindspore {
namespace dataset {
// Writes a single one-hot row into *output for the unsigned-int element of `input` at `index`.
// The element value selects the column set to 1; rejects values >= num_classes.
Status OneHotEncodingUnsigned(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output,
                              dsize_t num_classes, int64_t index) {
  uint64_t class_idx;
  if (input->Rank() == 0) {
    // Scalar tensor: the single element is addressed with an empty index.
    RETURN_IF_NOT_OK(input->GetItemAt<uint64_t>(&class_idx, {}));
  } else {
    RETURN_IF_NOT_OK(input->GetItemAt<uint64_t>(&class_idx, {index}));
  }
  if (class_idx >= static_cast<uint64_t>(num_classes)) {
    RETURN_STATUS_UNEXPECTED("One_hot index values are not in range");
  }
  // Write the "hot" 1 with the same element width as the input tensor's type.
  if (input->type() == DataType::DE_UINT64) {
    RETURN_IF_NOT_OK((*output)->SetItemAt<uint64_t>({index, static_cast<dsize_t>(class_idx)}, 1));
  } else if (input->type() == DataType::DE_UINT32) {
    RETURN_IF_NOT_OK((*output)->SetItemAt<uint32_t>({index, static_cast<dsize_t>(class_idx)}, 1));
  } else if (input->type() == DataType::DE_UINT16) {
    RETURN_IF_NOT_OK((*output)->SetItemAt<uint16_t>({index, static_cast<dsize_t>(class_idx)}, 1));
  } else if (input->type() == DataType::DE_UINT8) {
    RETURN_IF_NOT_OK((*output)->SetItemAt<uint8_t>({index, static_cast<dsize_t>(class_idx)}, 1));
  } else {
    RETURN_STATUS_UNEXPECTED("One hot unsigned only supports unsigned int as input.");
  }
  return Status::OK();
}

// Writes a single one-hot row into *output for the signed-int element of `input` at `index`.
// Rejects values outside [0, num_classes).
Status OneHotEncodingSigned(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output,
                            dsize_t num_classes, int64_t index) {
  int64_t class_idx;
  if (input->Rank() == 0) {
    RETURN_IF_NOT_OK(input->GetItemAt<int64_t>(&class_idx, {}));
  } else {
    RETURN_IF_NOT_OK(input->GetItemAt<int64_t>(&class_idx, {index}));
  }
  // Also reject negative class indices: a signed input could otherwise slip past the
  // upper-bound check and address the output tensor out of range.
  if (class_idx < 0 || class_idx >= static_cast<int64_t>(num_classes)) {
    RETURN_STATUS_UNEXPECTED("One_hot index values are not in range");
  }
  // Write the "hot" 1 with the same element width as the input tensor's type.
  if (input->type() == DataType::DE_INT64) {
    RETURN_IF_NOT_OK((*output)->SetItemAt<int64_t>({index, static_cast<dsize_t>(class_idx)}, 1));
  } else if (input->type() == DataType::DE_INT32) {
    RETURN_IF_NOT_OK((*output)->SetItemAt<int32_t>({index, static_cast<dsize_t>(class_idx)}, 1));
  } else if (input->type() == DataType::DE_INT16) {
    RETURN_IF_NOT_OK((*output)->SetItemAt<int16_t>({index, static_cast<dsize_t>(class_idx)}, 1));
  } else if (input->type() == DataType::DE_INT8) {
    RETURN_IF_NOT_OK((*output)->SetItemAt<int8_t>({index, static_cast<dsize_t>(class_idx)}, 1));
  } else {
    RETURN_STATUS_UNEXPECTED("One hot signed only supports signed int as input.");
  }
  return Status::OK();
}

// One-hot encodes a scalar or 1-D integer tensor into a zero-filled (num_elements, num_classes)
// tensor with a single 1 per row, then squeezes away any size-1 dimensions.
Status OneHotEncoding(std::shared_ptr<Tensor> input, std::shared_ptr<Tensor> *output, dsize_t num_classes) {
  input->Squeeze();

  if (input->Rank() > 1) {  // We expect the input to be in the first dimension
    RETURN_STATUS_UNEXPECTED("One hot only supports scalars or 1D shape Tensors.");
  }
  if (!input->type().IsInt()) {
    RETURN_STATUS_UNEXPECTED("One hot does not support input of this type.");
  }
  try {
    dsize_t num_elements = 1;
    if (input->Rank() == 1) num_elements = input->shape()[0];
    TensorShape out_shape({num_elements, num_classes});
    std::shared_ptr<Tensor> out;
    RETURN_IF_NOT_OK(Tensor::CreateTensor(&out, TensorImpl::kFlexible, out_shape, input->type()));
    RETURN_IF_NOT_OK(out->Zero());
    for (dsize_t i = 0; i < num_elements; ++i) {
      if (input->type().IsUnsignedInt()) {
        RETURN_IF_NOT_OK(OneHotEncodingUnsigned(input, &out, num_classes, i));
      } else {
        RETURN_IF_NOT_OK(OneHotEncodingSigned(input, &out, num_classes, i));
      }
    }
    out->Squeeze();
    *output = out;
    return Status::OK();
  } catch (const std::exception &e) {
    RETURN_STATUS_UNEXPECTED("Unexpected error in OneHotOp");
  }
}

// Element-wise cast of `input` (elements of type FROM) into the pre-allocated *output
// (elements of type TO). Iteration is bounded by the output's element count.
template <typename FROM, typename TO>
void Cast(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
  auto in_itr = input->begin<FROM>();
  auto out_itr = (*output)->begin<TO>();
  auto out_end = (*output)->end<TO>();

  // static_cast<void> silences "unused result" warnings from the post-increment copies.
  for (; out_itr != out_end; static_cast<void>(in_itr++), static_cast<void>(out_itr++))
    *out_itr = static_cast<TO>(*in_itr);
}

// Dispatches Cast<T, TO> on the destination type of *output; T is the source element type.
template <typename T>
void CastFrom(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
  switch ((*output)->type().value()) {
    case DataType::DE_BOOL:
      Cast<T, bool>(input, output);
      break;
    case DataType::DE_INT8:
      Cast<T, int8_t>(input, output);
      break;
    case DataType::DE_UINT8:
      Cast<T, uint8_t>(input, output);
      break;
    case DataType::DE_INT16:
      Cast<T, int16_t>(input, output);
      break;
    case DataType::DE_UINT16:
      Cast<T, uint16_t>(input, output);
      break;
    case DataType::DE_INT32:
      Cast<T, int32_t>(input, output);
      break;
    case DataType::DE_UINT32:
      Cast<T, uint32_t>(input, output);
      break;
    case DataType::DE_INT64:
      Cast<T, int64_t>(input, output);
      break;
    case DataType::DE_UINT64:
      Cast<T, uint64_t>(input, output);
      break;
    case DataType::DE_FLOAT16:
      Cast<T, float16>(input, output);
      break;
    case DataType::DE_FLOAT32:
      Cast<T, float>(input, output);
      break;
    case DataType::DE_FLOAT64:
      Cast<T, double>(input, output);
      break;
    case DataType::DE_UNKNOWN:
      MS_LOG(ERROR) << "Unknown data type.";
      break;
  }
}

// Type cast operator: allocates *output with `data_type` and the input's shape, then
// dispatches CastFrom on the source element type.
Status TypeCast(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, const DataType &data_type) {
  RETURN_IF_NOT_OK(Tensor::CreateTensor(output, TensorImpl::kFlexible, input->shape(), data_type));
  RETURN_IF_NOT_OK((*output)->AllocateBuffer((*output)->SizeInBytes()));
  switch (input->type().value()) {
    case DataType::DE_BOOL:
      CastFrom<bool>(input, output);
      break;
    case DataType::DE_INT8:
      CastFrom<int8_t>(input, output);
      break;
    case DataType::DE_UINT8:
      CastFrom<uint8_t>(input, output);
      break;
    case DataType::DE_INT16:
      CastFrom<int16_t>(input, output);
      break;
    case DataType::DE_UINT16:
      CastFrom<uint16_t>(input, output);
      break;
    case DataType::DE_INT32:
      CastFrom<int32_t>(input, output);
      break;
    case DataType::DE_UINT32:
      CastFrom<uint32_t>(input, output);
      break;
    case DataType::DE_INT64:
      CastFrom<int64_t>(input, output);
      break;
    case DataType::DE_UINT64:
      CastFrom<uint64_t>(input, output);
      break;
    case DataType::DE_FLOAT16:
      CastFrom<float16>(input, output);
      break;
    case DataType::DE_FLOAT32:
      CastFrom<float>(input, output);
      break;
    case DataType::DE_FLOAT64:
      CastFrom<double>(input, output);
      break;
    case DataType::DE_UNKNOWN:
      // sanity check, unreachable code.
      RETURN_STATUS_UNEXPECTED("TypeCast does not support input of this type.");
  }
  return Status::OK();
}

// Casts a float32 tensor to float16 (Eigen::half), element by element.
Status ToFloat16(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
  // initiate new tensor for type cast
  DataType new_type = DataType("float16");
  RETURN_IF_NOT_OK(Tensor::CreateTensor(output, TensorImpl::kFlexible, input->shape(), new_type));
  RETURN_IF_NOT_OK((*output)->AllocateBuffer((*output)->SizeInBytes()));

  auto in_itr = input->begin<float>();
  auto out_itr = (*output)->begin<float16>();
  auto out_end = (*output)->end<float16>();

  for (; out_itr != out_end; in_itr++, out_itr++) *out_itr = Eigen::half(*in_itr);

  return Status::OK();
}

// Pads `src` out to `pad_shape`, choosing the numeric or string path based on src's type.
// A null pad_val falls back to 0 (numeric) or "" (string).
Status PadEnd(const std::shared_ptr<Tensor> &src, std::shared_ptr<Tensor> *dst,
              const std::vector<dsize_t> &pad_shape, const std::shared_ptr<Tensor> &pad_val) {
  if (pad_val == nullptr) {
    if (src->type().IsNumeric()) {
      return PadEndNumeric(src, dst, pad_shape, 0);
    } else {
      return PadEndString(src, dst, pad_shape, "");
    }
  }
  if (pad_val->type().IsNumeric()) {
    float val = 0;
    RETURN_IF_NOT_OK(pad_val->GetItemAt<float>(&val, {}));
    return PadEndNumeric(src, dst, pad_shape, val);
  }
  std::string_view val;
  RETURN_IF_NOT_OK(pad_val->GetItemAt(&val, {}));
  return PadEndString(src, dst, pad_shape, std::string(val));
}

// Pads a numeric tensor: fills a new pad_shape tensor with pad_val, then copies src's
// contents into its leading region. Scalars and already-correctly-shaped tensors are shared.
Status PadEndNumeric(const std::shared_ptr<Tensor> &src, std::shared_ptr<Tensor> *dst,
                     const std::vector<dsize_t> &pad_shape, float pad_val) {
  CHECK_FAIL_RETURN_UNEXPECTED(src != nullptr && dst != nullptr, "tensor can't be nullptr");
  if (src->Rank() == 0 || src->shape().AsVector() == pad_shape) {
    (*dst) = src;  // if no padding, copy the pointer
  } else {
    CHECK_FAIL_RETURN_UNEXPECTED(src->Rank() == pad_shape.size(), "Pad to diff rank not allowed");
    RETURN_IF_NOT_OK(Tensor::CreateTensor(dst, TensorImpl::kFlexible, TensorShape(pad_shape), src->type()));
    auto tensor_type = src->type().value();
    if (pad_val == 0) {  // if pad with zero, don't care what type it is
      RETURN_IF_NOT_OK((*dst)->Zero());
    } else if (tensor_type == DataType::DE_INT8) {
      RETURN_IF_NOT_OK((*dst)->Fill<int8_t>(pad_val));
    } else if (tensor_type == DataType::DE_BOOL) {
      RETURN_IF_NOT_OK((*dst)->Fill<bool>(pad_val));
    } else if (tensor_type == DataType::DE_UINT8) {
      RETURN_IF_NOT_OK((*dst)->Fill<uint8_t>(pad_val));
    } else if (tensor_type == DataType::DE_INT16) {
      RETURN_IF_NOT_OK((*dst)->Fill<int16_t>(pad_val));
    } else if (tensor_type == DataType::DE_FLOAT16) {
      RETURN_IF_NOT_OK((*dst)->Fill<float16>(static_cast<float16>(pad_val)));
    } else if (tensor_type == DataType::DE_UINT16) {
      RETURN_IF_NOT_OK((*dst)->Fill<uint16_t>(pad_val));
    } else if (tensor_type == DataType::DE_INT32) {
      RETURN_IF_NOT_OK((*dst)->Fill<int32_t>(pad_val));
    } else if (tensor_type == DataType::DE_UINT32) {
      RETURN_IF_NOT_OK((*dst)->Fill<uint32_t>(pad_val));
    } else if (tensor_type == DataType::DE_INT64) {
      RETURN_IF_NOT_OK((*dst)->Fill<int64_t>(pad_val));
    } else if (tensor_type == DataType::DE_UINT64) {
      RETURN_IF_NOT_OK((*dst)->Fill<uint64_t>(pad_val));
    } else if (tensor_type == DataType::DE_FLOAT32) {
      RETURN_IF_NOT_OK((*dst)->Fill<float>(pad_val));
    } else if (tensor_type == DataType::DE_FLOAT64) {
      RETURN_IF_NOT_OK((*dst)->Fill<double>(pad_val));
    } else {
      RETURN_STATUS_UNEXPECTED("Incorrect/Unknown tensor type");
    }
    std::vector<dsize_t> cur_ind(src->Rank(), 0);
    RETURN_IF_NOT_OK(PadEndNumericHelper(src, *dst, cur_ind, 0));
  }
  return Status::OK();
}

// Recursive helper for PadEndNumeric: walks all leading dimensions and copies each of src's
// innermost rows into the (already pad-filled) dst at the same coordinates.
Status PadEndNumericHelper(const std::shared_ptr<Tensor> &src, std::shared_ptr<Tensor> dst,
                           std::vector<dsize_t> cur_ind, size_t cur_dim) {
  if (cur_dim == src->Rank() - 1) {  // if this is the last dimension, copy the data
    dst->CopyLastDimAt(src, cur_ind);
  } else {  // not the last dimension, keep doing recursion
    dsize_t min_ind = std::min(dst->shape()[cur_dim], src->shape()[cur_dim]);
    for (dsize_t i = 0; i < min_ind; i++) {
      cur_ind[cur_dim] = i;
      RETURN_IF_NOT_OK(PadEndNumericHelper(src, dst, cur_ind, cur_dim + 1));
    }
  }
  return Status::OK();
}

// Pads a string tensor: gathers src's strings (plus pad_val fillers) in row-major order,
// then builds a new string tensor with the padded shape.
Status PadEndString(const std::shared_ptr<Tensor> &src, std::shared_ptr<Tensor> *dst,
                    const std::vector<dsize_t> &pad_shape, const std::string &pad_val) {
  CHECK_FAIL_RETURN_UNEXPECTED(src != nullptr && dst != nullptr, "tensor can't be nullptr");
  if (src->Rank() == 0 || src->shape().AsVector() == pad_shape) {
    (*dst) = src;  // if no padding, copy the pointer
  } else {
    CHECK_FAIL_RETURN_UNEXPECTED(src->Rank() == pad_shape.size(), "Pad to diff rank not allowed");
    std::vector<dsize_t> cur_ind(src->Rank(), 0);
    std::vector<std::string> strings;
    RETURN_IF_NOT_OK(PadEndStringHelper(src, &strings, TensorShape(pad_shape), cur_ind, 0, pad_val));
    RETURN_IF_NOT_OK(Tensor::CreateTensor(dst, strings, TensorShape(pad_shape)));
  }
  return Status::OK();
}

// Recursive helper for PadEndString: appends src's strings (and pad_value for the padded
// region) to *dst in row-major order of the destination shape.
Status PadEndStringHelper(const std::shared_ptr<Tensor> &src, std::vector<std::string> *dst,
                          const TensorShape &dst_shape, std::vector<dsize_t> cur_ind, size_t cur_dim,
                          const std::string &pad_value) {
  if (cur_dim == src->Rank() - 1) {  // if this is the last dimension, copy the data
    dsize_t min_ind = std::min(dst_shape[cur_dim], src->shape()[cur_dim]);
    for (dsize_t i = 0; i < min_ind; i++) {
      cur_ind[cur_dim] = i;
      std::string_view item;
      RETURN_IF_NOT_OK(src->GetItemAt(&item, cur_ind));
      dst->emplace_back(item);
    }
    for (dsize_t i = min_ind; i < dst_shape[cur_dim]; i++) {
      dst->emplace_back(pad_value);
    }

  } else {  // not the last dimension, keep doing recursion
    dsize_t min_ind = std::min(dst_shape[cur_dim], src->shape()[cur_dim]);
    for (dsize_t i = 0; i < min_ind; i++) {
      cur_ind[cur_dim] = i;
      RETURN_IF_NOT_OK(PadEndStringHelper(src, dst, dst_shape, cur_ind, cur_dim + 1, pad_value));
    }
    // Entire trailing slices are padding: emit pad_value for every element they contain.
    dsize_t count = (dst_shape[cur_dim] - min_ind) * dst_shape.Strides()[cur_dim];
    for (dsize_t i = 0; i < count; i++) {
      dst->emplace_back(pad_value);
    }
  }
  return Status::OK();
}
}  // namespace dataset
}  // namespace mindspore