You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; can include dashes ('-'); and can be up to 35 characters long.

data_utils.cc 14 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346
  1. /**
  2. * Copyright 2019 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "dataset/kernels/data/data_utils.h"
  17. #include <algorithm>
  18. #include <string>
  19. #include <vector>
  20. #include "dataset/core/constants.h"
  21. #include "dataset/core/tensor.h"
  22. #include "dataset/core/tensor_shape.h"
  23. #include "dataset/core/data_type.h"
  24. #include "dataset/core/pybind_support.h"
  25. namespace mindspore {
  26. namespace dataset {
  27. Status OneHotEncodingUnsigned(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output,
  28. dsize_t num_classes, int64_t index) {
  29. uint64_t class_idx;
  30. if (input->Rank() == 0) {
  31. RETURN_IF_NOT_OK(input->GetItemAt<uint64_t>(&class_idx, {}));
  32. } else {
  33. RETURN_IF_NOT_OK(input->GetItemAt<uint64_t>(&class_idx, {index}));
  34. }
  35. if (class_idx >= static_cast<uint64_t>(num_classes)) {
  36. RETURN_STATUS_UNEXPECTED("One_hot index values are not in range");
  37. }
  38. if (input->type() == DataType::DE_UINT64) {
  39. RETURN_IF_NOT_OK((*output)->SetItemAt<uint64_t>({index, static_cast<dsize_t>(class_idx)}, 1));
  40. } else if (input->type() == DataType::DE_UINT32) {
  41. RETURN_IF_NOT_OK((*output)->SetItemAt<uint32_t>({index, static_cast<dsize_t>(class_idx)}, 1));
  42. } else if (input->type() == DataType::DE_UINT16) {
  43. RETURN_IF_NOT_OK((*output)->SetItemAt<uint16_t>({index, static_cast<dsize_t>(class_idx)}, 1));
  44. } else if (input->type() == DataType::DE_UINT8) {
  45. RETURN_IF_NOT_OK((*output)->SetItemAt<uint8_t>({index, static_cast<dsize_t>(class_idx)}, 1));
  46. } else {
  47. RETURN_STATUS_UNEXPECTED("One hot unsigned only supports unsigned int as input.");
  48. }
  49. return Status::OK();
  50. }
  51. Status OneHotEncodingSigned(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, dsize_t num_classes,
  52. int64_t index) {
  53. int64_t class_idx;
  54. if (input->Rank() == 0) {
  55. RETURN_IF_NOT_OK(input->GetItemAt<int64_t>(&class_idx, {}));
  56. } else {
  57. RETURN_IF_NOT_OK(input->GetItemAt<int64_t>(&class_idx, {index}));
  58. }
  59. if (class_idx >= static_cast<int64_t>(num_classes)) {
  60. RETURN_STATUS_UNEXPECTED("One_hot index values are not in range");
  61. }
  62. if (input->type() == DataType::DE_INT64) {
  63. RETURN_IF_NOT_OK((*output)->SetItemAt<int64_t>({index, static_cast<dsize_t>(class_idx)}, 1));
  64. } else if (input->type() == DataType::DE_INT32) {
  65. RETURN_IF_NOT_OK((*output)->SetItemAt<int32_t>({index, static_cast<dsize_t>(class_idx)}, 1));
  66. } else if (input->type() == DataType::DE_INT16) {
  67. RETURN_IF_NOT_OK((*output)->SetItemAt<int16_t>({index, static_cast<dsize_t>(class_idx)}, 1));
  68. } else if (input->type() == DataType::DE_INT8) {
  69. RETURN_IF_NOT_OK((*output)->SetItemAt<int8_t>({index, static_cast<dsize_t>(class_idx)}, 1));
  70. } else {
  71. RETURN_STATUS_UNEXPECTED("One hot signed only supports signed int as input.");
  72. }
  73. return Status::OK();
  74. }
// One-hot encodes a scalar or 1D integer tensor into a (num_elements, num_classes)
// tensor of the same integer type, with a 1 at each row's label column and 0 elsewhere.
// Note: Squeeze() mutates the input tensor's shape in place before validation.
Status OneHotEncoding(std::shared_ptr<Tensor> input, std::shared_ptr<Tensor> *output, dsize_t num_classes) {
  input->Squeeze();
  if (input->Rank() > 1) {  // We expect the input to be in the first dimension
    RETURN_STATUS_UNEXPECTED("One hot only supports scalars or 1D shape Tensors.");
  }
  if (!input->type().IsInt()) {
    RETURN_STATUS_UNEXPECTED("One hot does not support input of this type.");
  }
  try {
    // Rank-0 input is treated as a single element; 1D input contributes one row per element.
    dsize_t num_elements = 1;
    if (input->Rank() == 1) num_elements = input->shape()[0];
    TensorShape out_shape({num_elements, num_classes});
    std::shared_ptr<Tensor> out;
    RETURN_IF_NOT_OK(Tensor::CreateTensor(&out, TensorImpl::kFlexible, out_shape, input->type()));
    // Start from all zeros; the per-row helpers only set the single 1 per row.
    RETURN_IF_NOT_OK(out->Zero());
    for (dsize_t i = 0; i < num_elements; ++i) {
      if (input->type().IsUnsignedInt()) {
        RETURN_IF_NOT_OK(OneHotEncodingUnsigned(input, &out, num_classes, i));
      } else {
        RETURN_IF_NOT_OK(OneHotEncodingSigned(input, &out, num_classes, i));
      }
    }
    // Drop the leading dimension again for scalar input (shape (1, num_classes) -> (num_classes)).
    out->Squeeze();
    *output = out;
    return Status::OK();
  } catch (const std::exception &e) {
    RETURN_STATUS_UNEXPECTED("Unexpected error in OneHotOp");
  }
}
  104. template <typename FROM, typename TO>
  105. void Cast(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
  106. auto in_itr = input->begin<FROM>();
  107. auto out_itr = (*output)->begin<TO>();
  108. auto out_end = (*output)->end<TO>();
  109. for (; out_itr != out_end; static_cast<void>(in_itr++), static_cast<void>(out_itr++))
  110. *out_itr = static_cast<TO>(*in_itr);
  111. }
// Dispatches on the *output* tensor's dtype and runs Cast<T, TO> for the matching
// destination type TO. T is the (already known) source element type.
template <typename T>
void CastFrom(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
  switch ((*output)->type().value()) {
    case DataType::DE_BOOL:
      Cast<T, bool>(input, output);
      break;
    case DataType::DE_INT8:
      Cast<T, int8_t>(input, output);
      break;
    case DataType::DE_UINT8:
      Cast<T, uint8_t>(input, output);
      break;
    case DataType::DE_INT16:
      Cast<T, int16_t>(input, output);
      break;
    case DataType::DE_UINT16:
      Cast<T, uint16_t>(input, output);
      break;
    case DataType::DE_INT32:
      Cast<T, int32_t>(input, output);
      break;
    case DataType::DE_UINT32:
      Cast<T, uint32_t>(input, output);
      break;
    case DataType::DE_INT64:
      Cast<T, int64_t>(input, output);
      break;
    case DataType::DE_UINT64:
      Cast<T, uint64_t>(input, output);
      break;
    case DataType::DE_FLOAT16:
      Cast<T, float16>(input, output);
      break;
    case DataType::DE_FLOAT32:
      Cast<T, float>(input, output);
      break;
    case DataType::DE_FLOAT64:
      Cast<T, double>(input, output);
      break;
    case DataType::DE_UNKNOWN:
      // This function returns void, so an unknown dtype can only be logged here;
      // TypeCast() below rejects DE_UNKNOWN inputs with a Status before reaching this.
      MS_LOG(ERROR) << "Unknown data type.";
      break;
  }
}
// Type cast operator: creates *output with the input's shape and the requested
// dtype, then converts every element via the CastFrom/Cast helpers above.
// Returns a non-OK Status if the input dtype is DE_UNKNOWN.
Status TypeCast(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, const DataType &data_type) {
  RETURN_IF_NOT_OK(Tensor::CreateTensor(output, TensorImpl::kFlexible, input->shape(), data_type));
  // Ensure the destination buffer exists before Cast iterates over it.
  RETURN_IF_NOT_OK((*output)->AllocateBuffer((*output)->SizeInBytes()));
  // Dispatch on the *source* dtype; CastFrom then dispatches on the destination dtype.
  switch (input->type().value()) {
    case DataType::DE_BOOL:
      CastFrom<bool>(input, output);
      break;
    case DataType::DE_INT8:
      CastFrom<int8_t>(input, output);
      break;
    case DataType::DE_UINT8:
      CastFrom<uint8_t>(input, output);
      break;
    case DataType::DE_INT16:
      CastFrom<int16_t>(input, output);
      break;
    case DataType::DE_UINT16:
      CastFrom<uint16_t>(input, output);
      break;
    case DataType::DE_INT32:
      CastFrom<int32_t>(input, output);
      break;
    case DataType::DE_UINT32:
      CastFrom<uint32_t>(input, output);
      break;
    case DataType::DE_INT64:
      CastFrom<int64_t>(input, output);
      break;
    case DataType::DE_UINT64:
      CastFrom<uint64_t>(input, output);
      break;
    case DataType::DE_FLOAT16:
      CastFrom<float16>(input, output);
      break;
    case DataType::DE_FLOAT32:
      CastFrom<float>(input, output);
      break;
    case DataType::DE_FLOAT64:
      CastFrom<double>(input, output);
      break;
    case DataType::DE_UNKNOWN:
      // sanity check, unreachable code.
      RETURN_STATUS_UNEXPECTED("TypeCast does not support input of this type.");
  }
  return Status::OK();
}
  203. Status ToFloat16(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
  204. // initiate new tensor for type cast
  205. DataType new_type = DataType("float16");
  206. RETURN_IF_NOT_OK(Tensor::CreateTensor(output, TensorImpl::kFlexible, input->shape(), new_type));
  207. RETURN_IF_NOT_OK((*output)->AllocateBuffer((*output)->SizeInBytes()));
  208. auto in_itr = input->begin<float>();
  209. auto out_itr = (*output)->begin<float16>();
  210. auto out_end = (*output)->end<float16>();
  211. for (; out_itr != out_end; in_itr++, out_itr++) *out_itr = Eigen::half(*in_itr);
  212. return Status::OK();
  213. }
  214. Status PadEnd(const std::shared_ptr<Tensor> &src, std::shared_ptr<Tensor> *dst, const std::vector<dsize_t> &pad_shape,
  215. const std::shared_ptr<Tensor> &pad_val) {
  216. if (pad_val == nullptr) {
  217. if (src->type().IsNumeric()) {
  218. return PadEndNumeric(src, dst, pad_shape, 0);
  219. } else {
  220. return PadEndString(src, dst, pad_shape, "");
  221. }
  222. }
  223. if (pad_val->type().IsNumeric()) {
  224. float val = 0;
  225. RETURN_IF_NOT_OK(pad_val->GetItemAt<float>(&val, {}));
  226. return PadEndNumeric(src, dst, pad_shape, val);
  227. }
  228. std::string_view val;
  229. RETURN_IF_NOT_OK(pad_val->GetItemAt(&val, {}));
  230. return PadEndString(src, dst, pad_shape, std::string(val));
  231. }
// Pads a numeric tensor to `pad_shape`, filling new cells with `pad_val`
// (converted to the tensor's own dtype), then copying the original data in.
// If no padding is needed, *dst simply aliases src.
Status PadEndNumeric(const std::shared_ptr<Tensor> &src, std::shared_ptr<Tensor> *dst,
                     const std::vector<dsize_t> &pad_shape, float pad_val) {
  CHECK_FAIL_RETURN_UNEXPECTED(src != nullptr && dst != nullptr, "tensor can't be nullptr");
  if (src->Rank() == 0 || src->shape().AsVector() == pad_shape) {
    (*dst) = src;  // if no padding, copy the pointer
  } else {
    CHECK_FAIL_RETURN_UNEXPECTED(src->Rank() == pad_shape.size(), "Pad to diff rank not allowed");
    RETURN_IF_NOT_OK(Tensor::CreateTensor(dst, TensorImpl::kFlexible, TensorShape(pad_shape), src->type()));
    auto tensor_type = src->type().value();
    // Deliberate fast path: a zero fill is dtype-independent (all-zero bytes),
    // so Zero() avoids the per-type Fill dispatch below.
    if (pad_val == 0) {  // if pad with zero, don't care what type it is
      RETURN_IF_NOT_OK((*dst)->Zero());
    } else if (tensor_type == DataType::DE_INT8) {
      RETURN_IF_NOT_OK((*dst)->Fill<int8_t>(pad_val));
    } else if (tensor_type == DataType::DE_BOOL) {
      RETURN_IF_NOT_OK((*dst)->Fill<bool>(pad_val));
    } else if (tensor_type == DataType::DE_UINT8) {
      RETURN_IF_NOT_OK((*dst)->Fill<uint8_t>(pad_val));
    } else if (tensor_type == DataType::DE_INT16) {
      RETURN_IF_NOT_OK((*dst)->Fill<int16_t>(pad_val));
    } else if (tensor_type == DataType::DE_FLOAT16) {
      RETURN_IF_NOT_OK((*dst)->Fill<float16>(static_cast<float16>(pad_val)));
    } else if (tensor_type == DataType::DE_UINT16) {
      RETURN_IF_NOT_OK((*dst)->Fill<uint16_t>(pad_val));
    } else if (tensor_type == DataType::DE_INT32) {
      RETURN_IF_NOT_OK((*dst)->Fill<int32_t>(pad_val));
    } else if (tensor_type == DataType::DE_UINT32) {
      RETURN_IF_NOT_OK((*dst)->Fill<uint32_t>(pad_val));
    } else if (tensor_type == DataType::DE_INT64) {
      RETURN_IF_NOT_OK((*dst)->Fill<int64_t>(pad_val));
    } else if (tensor_type == DataType::DE_UINT64) {
      RETURN_IF_NOT_OK((*dst)->Fill<uint64_t>(pad_val));
    } else if (tensor_type == DataType::DE_FLOAT32) {
      RETURN_IF_NOT_OK((*dst)->Fill<float>(pad_val));
    } else if (tensor_type == DataType::DE_FLOAT64) {
      RETURN_IF_NOT_OK((*dst)->Fill<double>(pad_val));
    } else {
      RETURN_STATUS_UNEXPECTED("Incorrect/Unknown tensor type");
    }
    // Copy the original data on top of the pre-filled destination, row by row.
    std::vector<dsize_t> cur_ind(src->Rank(), 0);
    RETURN_IF_NOT_OK(PadEndNumericHelper(src, *dst, cur_ind, 0));
  }
  return Status::OK();
}
// Recursive helper for PadEndNumeric: walks all dimensions except the last,
// then copies each source row (last dimension) into the same index of dst.
// `cur_ind` is passed by value so each recursion level owns its own index vector.
Status PadEndNumericHelper(const std::shared_ptr<Tensor> &src, std::shared_ptr<Tensor> dst,
                           std::vector<dsize_t> cur_ind, size_t cur_dim) {
  if (cur_dim == src->Rank() - 1) {  // if this is the last dimension, copy the data
    // NOTE(review): the result of CopyLastDimAt is not checked here — confirm it
    // returns void / cannot fail, otherwise wrap it in RETURN_IF_NOT_OK.
    dst->CopyLastDimAt(src, cur_ind);
  } else {  // not the last dimension, keep doing recursion
    // Only iterate indices that exist in both tensors; padded positions were pre-filled.
    dsize_t min_ind = std::min(dst->shape()[cur_dim], src->shape()[cur_dim]);
    for (dsize_t i = 0; i < min_ind; i++) {
      cur_ind[cur_dim] = i;
      RETURN_IF_NOT_OK(PadEndNumericHelper(src, dst, cur_ind, cur_dim + 1));
    }
  }
  return Status::OK();
}
  288. Status PadEndString(const std::shared_ptr<Tensor> &src, std::shared_ptr<Tensor> *dst,
  289. const std::vector<dsize_t> &pad_shape, const std::string &pad_val) {
  290. CHECK_FAIL_RETURN_UNEXPECTED(src != nullptr && dst != nullptr, "tensor can't be nullptr");
  291. if (src->Rank() == 0 || src->shape().AsVector() == pad_shape) {
  292. (*dst) = src; // if no padding, copy the pointer
  293. } else {
  294. CHECK_FAIL_RETURN_UNEXPECTED(src->Rank() == pad_shape.size(), "Pad to diff rank not allowed");
  295. std::vector<dsize_t> cur_ind(src->Rank(), 0);
  296. std::vector<std::string> strings;
  297. RETURN_IF_NOT_OK(PadEndStringHelper(src, &strings, TensorShape(pad_shape), cur_ind, 0, pad_val));
  298. RETURN_IF_NOT_OK(Tensor::CreateTensor(dst, strings, TensorShape(pad_shape)));
  299. }
  300. return Status::OK();
  301. }
  302. Status PadEndStringHelper(const std::shared_ptr<Tensor> &src, std::vector<std::string> *dst,
  303. const TensorShape &dst_shape, std::vector<dsize_t> cur_ind, size_t cur_dim,
  304. const std::string &pad_value) {
  305. if (cur_dim == src->Rank() - 1) { // if this is the last dimension, copy the data
  306. dsize_t min_ind = std::min(dst_shape[cur_dim], src->shape()[cur_dim]);
  307. for (dsize_t i = 0; i < min_ind; i++) {
  308. cur_ind[cur_dim] = i;
  309. std::string_view item;
  310. RETURN_IF_NOT_OK(src->GetItemAt(&item, cur_ind));
  311. dst->emplace_back(item);
  312. }
  313. for (dsize_t i = min_ind; i < dst_shape[cur_dim]; i++) {
  314. dst->emplace_back(pad_value);
  315. }
  316. } else { // not the last dimension, keep doing recursion
  317. dsize_t min_ind = std::min(dst_shape[cur_dim], src->shape()[cur_dim]);
  318. for (dsize_t i = 0; i < min_ind; i++) {
  319. cur_ind[cur_dim] = i;
  320. RETURN_IF_NOT_OK(PadEndStringHelper(src, dst, dst_shape, cur_ind, cur_dim + 1, pad_value));
  321. }
  322. dsize_t count = (dst_shape[cur_dim] - min_ind) * dst_shape.Strides()[cur_dim];
  323. for (dsize_t i = 0; i < count; i++) {
  324. dst->emplace_back(pad_value);
  325. }
  326. }
  327. return Status::OK();
  328. }
  329. } // namespace dataset
  330. } // namespace mindspore