From 220ec451817b8e573c5c93af65669af33c220afd Mon Sep 17 00:00:00 2001
From: zhaozhenlong
Date: Sun, 16 Aug 2020 16:11:56 +0800
Subject: [PATCH] fp16 concat

---
 .../runtime/kernel/arm/base/concat_base.cc    |   1 +
 .../runtime/kernel/arm/fp16/concat_fp16.cc    | 130 ++++++++++++++++++
 .../src/runtime/kernel/arm/fp16/concat_fp16.h |  54 ++++++++
 .../kernel/arm/nnacl/fp16/concat_fp16.c       |  43 ++++++
 .../kernel/arm/nnacl/fp16/concat_fp16.h       |  30 ++++
 5 files changed, 258 insertions(+)
 create mode 100644 mindspore/lite/src/runtime/kernel/arm/fp16/concat_fp16.cc
 create mode 100644 mindspore/lite/src/runtime/kernel/arm/fp16/concat_fp16.h
 create mode 100644 mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/concat_fp16.c
 create mode 100644 mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/concat_fp16.h

diff --git a/mindspore/lite/src/runtime/kernel/arm/base/concat_base.cc b/mindspore/lite/src/runtime/kernel/arm/base/concat_base.cc
index e28d88b771..e5d7576454 100644
--- a/mindspore/lite/src/runtime/kernel/arm/base/concat_base.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/base/concat_base.cc
@@ -108,6 +108,7 @@ kernel::LiteKernel *CpuConcatFp32KernelCreator(const std::vector

diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/concat_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/concat_fp16.cc
new file mode 100644
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/concat_fp16.cc
@@ -0,0 +1,130 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include <arm_neon.h>
#include "nnacl/fp16/concat_fp16.h"
#include "src/runtime/kernel/arm/fp16/concat_fp16.h"
#include "src/kernel_registry.h"
#include "schema/model_generated.h"
#include "include/errorcode.h"
#include "nnacl/fp16/cast_fp16.h"

using mindspore::kernel::KERNEL_ARCH::kCPU;
using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_Concat;

namespace mindspore::kernel {
int ConcatFp16CPUKernel::Init() {
  auto ret = ConcatBaseCPUKernel::Init();
  if (ret != RET_OK) {
    return ret;
  }
  if (!InferShapeDone()) {
    return RET_OK;
  }

  return ReSize();
}

int ConcatFp16CPUKernel::ReSize() {
  // Release buffers left over from a previous ReSize; the element is taken by
  // reference so the stored pointer itself is reset, not a local copy.
  for (auto &ptr : fp16_inputs_) {
    if (ptr != nullptr) {
      free(ptr);
      ptr = nullptr;
    }
  }
  fp16_inputs_.clear();
  for (size_t i = 0; i < in_tensors_.size(); ++i) {
    auto *ptr = reinterpret_cast<float16_t *>(malloc(sizeof(float16_t) * in_tensors_[i]->ElementsNum()));
    if (ptr == nullptr) {
      MS_LOG(ERROR) << "malloc failed";
      return RET_ERROR;
    }
    fp16_inputs_.push_back(ptr);
  }

  if (fp16_output_ != nullptr) {
    free(fp16_output_);
    fp16_output_ = nullptr;
  }
  fp16_output_ = reinterpret_cast<float16_t *>(malloc(sizeof(float16_t) * out_tensors_[0]->ElementsNum()));
  if (fp16_output_ == nullptr) {
    MS_LOG(ERROR) << "malloc failed";
    return RET_ERROR;
  }
  return ConcatBaseCPUKernel::ReSize();
}

int ConcatFp16CPUKernel::Run() {
  auto prepare_ret = Prepare();
  if (prepare_ret != RET_OK) {
    MS_LOG(ERROR) << "Prepare failed! ret: " << prepare_ret;
    return prepare_ret;
  }
  auto input_num = in_tensors_.size();
  std::vector<float *> inputs_addr(input_num, nullptr);
  std::vector<int *> inputs_output_shape(input_num + 1, nullptr);

  std::vector<std::vector<int>> shapes;
  for (size_t i = 0; i < input_num; ++i) {
    inputs_addr[i] = reinterpret_cast<float *>(in_tensors_[i]->Data());
    if (inputs_addr[i] == nullptr) {
      MS_LOG(ERROR) << "got nullptr when casting in_tensor data to a float pointer";
      return RET_ERROR;
    }

    // Convert each fp32 input into its preallocated fp16 staging buffer.
    Float32ToFloat16(inputs_addr[i], fp16_inputs_[i], in_tensors_[i]->ElementsNum());
    shapes.push_back(in_tensors_[i]->shape());
    inputs_output_shape[i] = shapes[i].data();
  }
  auto output_shape = out_tensors_.at(0)->shape();
  inputs_output_shape[input_num] = output_shape.data();
  auto output_addr = out_tensors_.at(0)->Data();

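  // ConcatFp16 concatenates raw fp16 bytes, so it is handed every input shape
  // plus the output shape; the fp16 result is widened back to fp32 afterwards.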
  ConcatFp16(reinterpret_cast<void **>(fp16_inputs_.data()), input_num, axis_, inputs_output_shape.data(),
             output_shape.size(), reinterpret_cast<void *>(fp16_output_));
  Float16ToFloat32(fp16_output_, reinterpret_cast<float *>(output_addr), out_tensors_.at(0)->ElementsNum());
  return RET_OK;
}

kernel::LiteKernel *CpuConcatFp16KernelCreator(const std::vector<lite::tensor::Tensor *> &inputs,
                                               const std::vector<lite::tensor::Tensor *> &outputs,
                                               OpParameter *opParameter, const Context *ctx,
                                               const kernel::KernelKey &desc, const lite::Primitive *primitive) {
  if (opParameter == nullptr) {
    MS_LOG(ERROR) << "Input opParameter is nullptr!";
    return nullptr;
  }
  MS_ASSERT(desc.type == schema::PrimitiveType_Concat);
  auto *kernel = new (std::nothrow) ConcatFp16CPUKernel(opParameter, inputs, outputs, ctx, primitive);
  if (kernel == nullptr) {
    MS_LOG(ERROR) << "new ConcatFp16CPUKernel failed!";
    return nullptr;
  }
  auto ret = kernel->Init();
  if (ret != RET_OK) {
    delete kernel;
    MS_LOG(ERROR) << "Init kernel failed, name: " << opParameter->name_ << ", type: "
                  << schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(opParameter->type_));
    return nullptr;
  }
  return kernel;
}
REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_Concat, CpuConcatFp16KernelCreator)
}  // namespace mindspore::kernel

diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/concat_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/concat_fp16.h
new file mode 100644
index 0000000000..3f12f1998a
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/concat_fp16.h
@@ -0,0 +1,54 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_CONCAT_FP16_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_CONCAT_FP16_H_

#include <arm_neon.h>
#include <vector>
#include "src/lite_kernel.h"
#include "include/context.h"
#include "src/runtime/kernel/arm/base/concat_base.h"

using mindspore::lite::Context;

namespace mindspore::kernel {
class ConcatFp16CPUKernel : public ConcatBaseCPUKernel {
 public:
  ConcatFp16CPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
                      const std::vector<lite::tensor::Tensor *> &outputs, const lite::Context *ctx,
                      const lite::Primitive *primitive)
      : ConcatBaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {}

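  // Release the fp16 staging buffers allocated in ReSize(); the output buffer
  // must be freed here as well, or it would leak.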
  ~ConcatFp16CPUKernel() {
    for (auto ptr : fp16_inputs_) {
      if (ptr != nullptr) {
        free(ptr);
      }
    }
    if (fp16_output_ != nullptr) {
      free(fp16_output_);
    }
  }

  int Init() override;

  int ReSize() override;

  int Run() override;

 private:
  std::vector<float16_t *> fp16_inputs_;
  float16_t *fp16_output_ = nullptr;
};
}  // namespace mindspore::kernel
#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_CONCAT_FP16_H_

diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/concat_fp16.c b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/concat_fp16.c
new file mode 100644
index 0000000000..25984f82fe
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/concat_fp16.c
@@ -0,0 +1,43 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "nnacl/fp16/concat_fp16.h"
#include <string.h>

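// Each tensor is viewed as (before_axis, axis, after_axis): before_axis_size
// multiplies the dims in front of the concat axis, after_axis_size is the byte
// width of one slice behind it. Every input then contributes one contiguous
// run of input_stride bytes per outer index, placed at a running axis_offset
// inside the output row.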
void ConcatFp16(void **input, int input_num, int axis, int **inputs_output_shape, size_t shape_size, void *output) {
  int before_axis_size = 1;
  for (int i = 0; i < axis; ++i) {
    before_axis_size *= inputs_output_shape[0][i];
  }
  // after_axis_size is counted in bytes, so it starts at sizeof(float16_t) == 2.
  int after_axis_size = 2;
  for (size_t i = axis + 1; i < shape_size; ++i) {
    after_axis_size *= inputs_output_shape[0][i];
  }
  int axis_offset = 0;
  uint8_t *dst_base = (uint8_t *)output;
  size_t output_stride = after_axis_size * inputs_output_shape[input_num][axis];
  for (int i = 0; i < input_num; ++i) {
    uint8_t *src_base = (uint8_t *)input[i];
    size_t input_stride = after_axis_size * inputs_output_shape[i][axis];
    for (int j = 0; j < before_axis_size; ++j) {
      uint8_t *src = src_base + j * input_stride;
      uint8_t *dst = dst_base + j * output_stride + axis_offset * after_axis_size;
      memcpy(dst, src, input_stride);
    }
    axis_offset += inputs_output_shape[i][axis];
  }
}

diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/concat_fp16.h b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/concat_fp16.h
new file mode 100644
index 0000000000..81d272a2f5
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/concat_fp16.h
@@ -0,0 +1,30 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_NNACL_FP16_CONCAT_FP16_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_NNACL_FP16_CONCAT_FP16_H_

#include "nnacl/op_base.h"

#ifdef __cplusplus
extern "C" {
#endif
void ConcatFp16(void **input, int input_num, int axis, int **inputs_output_shape, size_t shape_size, void *output);
#ifdef __cplusplus
}
#endif

#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_NNACL_FP16_CONCAT_FP16_H_
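
For reference, a minimal standalone sketch of how the nnacl entry point could be
driven (not part of the patch; main(), the literal tensor values, and the
assumption of an ARM toolchain where <arm_neon.h> provides float16_t are all
illustrative):

#include <arm_neon.h>
#include <stdio.h>
#include "nnacl/fp16/concat_fp16.h"

int main(void) {
  // Two 2x2 fp16 inputs concatenated along axis 0 into a 4x2 output.
  float16_t in0[4] = {1, 2, 3, 4};
  float16_t in1[4] = {5, 6, 7, 8};
  float16_t out[8] = {0};
  int shape0[2] = {2, 2};
  int shape1[2] = {2, 2};
  int out_shape[2] = {4, 2};
  void *inputs[2] = {in0, in1};
  // All input shapes first, the output shape last, matching the kernel's
  // inputs_output_shape layout.
  int *shapes[3] = {shape0, shape1, out_shape};
  ConcatFp16(inputs, 2, 0, shapes, 2, out);
  for (int i = 0; i < 8; ++i) {
    printf("%.1f ", (float)out[i]);  // expected: 1.0 2.0 3.0 4.0 5.0 6.0 7.0 8.0
  }
  printf("\n");
  return 0;
}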