From 432e81c212e7624fd4204c147ac46eca4e32e22a Mon Sep 17 00:00:00 2001
From: zhaozhenlong <zhaozhenlong1@huawei.com>
Date: Fri, 9 Apr 2021 15:28:41 +0800
Subject: [PATCH] expand dims and tile npu ops

NPU: support selecting a specific output of a multi-output kernel as an input
---
 .../runtime/agent/npu/subgraph_npu_kernel.cc  | 42 ++++++----
 .../src/runtime/kernel/npu/arithmetic_npu.cc  | 20 +++++
 .../src/runtime/kernel/npu/arithmetic_npu.h   |  5 ++
 .../src/runtime/kernel/npu/expand_dims_npu.cc | 55 +++++++++++++
 .../src/runtime/kernel/npu/expand_dims_npu.h  | 42 ++++++++++
 .../lite/src/runtime/kernel/npu/npu_kernel.h  | 10 +++
 .../lite/src/runtime/kernel/npu/tile_npu.cc   | 80 +++++++++++++++++++
 .../lite/src/runtime/kernel/npu/tile_npu.h    | 47 +++++++++++
 .../src/runtime/kernel/npu/transpose_npu.cc   |  6 +-
 9 files changed, 289 insertions(+), 18 deletions(-)
 create mode 100644 mindspore/lite/src/runtime/kernel/npu/expand_dims_npu.cc
 create mode 100644 mindspore/lite/src/runtime/kernel/npu/expand_dims_npu.h
 create mode 100644 mindspore/lite/src/runtime/kernel/npu/tile_npu.cc
 create mode 100644 mindspore/lite/src/runtime/kernel/npu/tile_npu.h
diff --git a/mindspore/lite/src/runtime/agent/npu/subgraph_npu_kernel.cc b/mindspore/lite/src/runtime/agent/npu/subgraph_npu_kernel.cc
index 801126dc83..a9208b0629 100644
--- a/mindspore/lite/src/runtime/agent/npu/subgraph_npu_kernel.cc
+++ b/mindspore/lite/src/runtime/agent/npu/subgraph_npu_kernel.cc
@@ -16,6 +16,8 @@
 
 #include "src/runtime/agent/npu/subgraph_npu_kernel.h"
 #include <set>
+#include <unordered_map>
+#include <utility>
 #include "include/errorcode.h"
 #include "src/runtime/agent/npu/npu_executor.h"
 #include "include/graph/operator.h"
@@ -34,8 +36,10 @@ using mindspore::lite::RET_ERROR;
 using mindspore::lite::RET_OK;
 
 static std::set<mindspore::schema::PrimitiveType> npu_specific_weight_nodes = {
-  schema::PrimitiveType_Conv2DFusion, schema::PrimitiveType_Conv2dTransposeFusion, schema::PrimitiveType_ScaleFusion,
-  schema::PrimitiveType_BatchNorm,    schema::PrimitiveType_FullConnection,        schema::PrimitiveType_InstanceNorm};
+  schema::PrimitiveType_Conv2DFusion,   schema::PrimitiveType_Conv2dTransposeFusion,
+  schema::PrimitiveType_ScaleFusion,    schema::PrimitiveType_BatchNorm,
+  schema::PrimitiveType_FullConnection, schema::PrimitiveType_InstanceNorm,
+  schema::PrimitiveType_TileFusion,     schema::PrimitiveType_PadFusion};
 
 SubGraphNpuKernel::~SubGraphNpuKernel() {
   subgraph_input_op_.clear();
@@ -95,7 +99,9 @@ int SubGraphNpuKernel::BuildNPUInputOp() {
   op_buffer_.clear();
   for (auto node : this->nodes_) {
     std::vector<ge::Operator *> node_input_op;
-    for (auto in_tensor : node->in_tensors()) {
+    std::unordered_map<int, std::pair<ge::Operator *, int>> index2_multi_out_index;
+    for (int i = 0; i < node->in_tensors().size(); ++i) {
+      auto in_tensor = node->in_tensors()[i];
       if (IsSubGraphInputTensor(in_tensor)) {
         auto tensor_name = node->name() + "_" + std::to_string(count++);
         hiai::op::Data *data;
@@ -109,21 +115,24 @@ int SubGraphNpuKernel::BuildNPUInputOp() {
       bool is_weight_tensor = true;
       for (auto in_kernel : node->in_kernels()) {
         if (IsContain(in_kernel->out_tensors(), in_tensor)) {
-          if (in_kernel->desc().arch == mindspore::kernel::kNPU) {
-            // input come from npu
-            auto npu_op = reinterpret_cast<NPUKernel *>(in_kernel)->GetNPUOp();
-            if (npu_op != nullptr) {
-              node_input_op.push_back(npu_op);
-              is_weight_tensor = false;
-              break;
-            } else {
-              MS_LOG(ERROR) << in_kernel->type_str() << "NPU Operator is nullptr.";
-              return RET_ERROR;
-            }
-          } else {
+          if (in_kernel->desc().arch != mindspore::kernel::kNPU) {
             MS_LOG(ERROR) << "The input of the intermediate node comes from the CPU";
             return RET_ERROR;
           }
+          // input come from npu
+          auto npu_op = reinterpret_cast<NPUKernel *>(in_kernel)->GetNPUOp();
+          if (npu_op == nullptr) {
+            MS_LOG(ERROR) << in_kernel->type_str() << "NPU Operator is nullptr.";
+            return RET_ERROR;
+          }
+          node_input_op.push_back(npu_op);
+          if (in_kernel->out_tensors().size() != 1) {  // in_kernel has multi output, we record which output we want.
+            int out_index = std::find(in_kernel->out_tensors().begin(), in_kernel->out_tensors().end(), in_tensor) -
+                            in_kernel->out_tensors().begin();
+            index2_multi_out_index[i] = {npu_op, out_index};
+          }
+          is_weight_tensor = false;
+          break;
         }
       }
 
@@ -144,7 +153,8 @@ int SubGraphNpuKernel::BuildNPUInputOp() {
       }
     }
     // set input to NPU
-    int ret = reinterpret_cast<NPUKernel *>(node)->SetNPUInputs(node->in_tensors(), node->out_tensors(), node_input_op);
+    int ret = reinterpret_cast<NPUKernel *>(node)->SetNPUInputs(node->in_tensors(), node->out_tensors(), node_input_op,
+                                                                index2_multi_out_index);
     if (ret != RET_OK) {
       MS_LOG(ERROR) << node->name() << " set npu inputs failed.";
       return RET_ERROR;
diff --git a/mindspore/lite/src/runtime/kernel/npu/arithmetic_npu.cc b/mindspore/lite/src/runtime/kernel/npu/arithmetic_npu.cc
index f7ac613bf2..221ccfed3d 100644
--- a/mindspore/lite/src/runtime/kernel/npu/arithmetic_npu.cc
+++ b/mindspore/lite/src/runtime/kernel/npu/arithmetic_npu.cc
@@ -15,6 +15,8 @@
  */
 
 #include "src/runtime/kernel/npu/arithmetic_npu.h"
+#include <unordered_map>
+#include <utility>
 #include <string>
 #include "include/graph/op/all_ops.h"
 #include "src/kernel_registry.h"
@@ -165,6 +167,24 @@ int ArithmeticNPUKernel::SetNPUInputs(const std::vector<lite::Tensor *> &inputs,
   return RET_OK;
 }
 
+// Variant of SetNPUInputs for inputs produced by multi-output kernels: wires the inputs through the three-argument
+// overload, then binds each index2_multi_out_index entry (input index -> {producer op, producer output index}).
+int ArithmeticNPUKernel::SetNPUInputs(
+  const std::vector<mindspore::lite::Tensor *> &inputs, const std::vector<lite::Tensor *> &outputs,
+  const std::vector<ge::Operator *> &npu_inputs,
+  const std::unordered_map<int, std::pair<ge::Operator *, int>> &index2_multi_out_index) {
+  auto ret = SetNPUInputs(inputs, outputs, npu_inputs);
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "ArithmeticNPUKernel SetNPUInputs failed";
+    return RET_ERROR;
+  }
+  // An empty map simply skips the loop, so no separate early return is needed.
+  for (const auto &it : index2_multi_out_index) {
+    MS_LOG(INFO) << name_ << " set input " << it.first << " from " << it.second.first << " output " << it.second.second;
+    op_->SetInput(it.first, *it.second.first, it.second.second);
+  }
+  return RET_OK;
+}
 ge::Operator *mindspore::kernel::ArithmeticNPUKernel::GetNPUOp() {
   if (activation_type_ == ActivationType_NO_ACTIVATION) {
     return op_;
diff --git a/mindspore/lite/src/runtime/kernel/npu/arithmetic_npu.h b/mindspore/lite/src/runtime/kernel/npu/arithmetic_npu.h
index 9f48009dbe..8c857ed84a 100644
--- a/mindspore/lite/src/runtime/kernel/npu/arithmetic_npu.h
+++ b/mindspore/lite/src/runtime/kernel/npu/arithmetic_npu.h
@@ -17,6 +17,8 @@
 #ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_NPU_ARITHMETIC_NPU_H_
 #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_NPU_ARITHMETIC_NPU_H_
 #include <vector>
+#include <unordered_map>
+#include <utility>
 #include "nnacl/arithmetic.h"
 #include "src/runtime/kernel/npu/npu_kernel.h"
 #include "include/graph/op/all_ops.h"
@@ -34,6 +36,9 @@ class ArithmeticNPUKernel : public NPUKernel {
                 OpParameter *opParameter) override;
   int SetNPUInputs(const std::vector<lite::Tensor *> &inputs, const std::vector<lite::Tensor *> &outputs,
                    const std::vector<ge::Operator *> &npu_inputs) override;
+  int SetNPUInputs(const std::vector<mindspore::lite::Tensor *> &inputs, const std::vector<lite::Tensor *> &outputs,
+                   const std::vector<ge::Operator *> &npu_inputs,
+                   const std::unordered_map<int, std::pair<ge::Operator *, int>> &index2_multi_out_index) override;
 
   ge::Operator *GetNPUOp() override;
 
diff --git a/mindspore/lite/src/runtime/kernel/npu/expand_dims_npu.cc b/mindspore/lite/src/runtime/kernel/npu/expand_dims_npu.cc
new file mode 100644
index 0000000000..31a3ee506d
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/npu/expand_dims_npu.cc
@@ -0,0 +1,55 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "src/runtime/kernel/npu/expand_dims_npu.h"
+#include "include/graph/op/all_ops.h"
+#include "src/kernel_registry.h"
+#include "src/runtime/agent/npu/npu_converter_utils.h"
+
+using mindspore::kernel::KERNEL_ARCH::kNPU;
+using mindspore::lite::KernelRegistrar;
+using mindspore::schema::PrimitiveType_ExpandDims;
+
+namespace mindspore::kernel {
+int ExpandDimsNPUKernel::IsSupport(const std::vector<lite::Tensor *> &inputs,
+                                   const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter) {
+  return inputs.size() >= 2 ? RET_OK : RET_ERROR;  // SetNPUInputs reads both the x and the axis operator
+}
+
+// Creates the HiAI ExpandDims operator and binds npu_inputs[0] as x and npu_inputs[1] as axis.
+int ExpandDimsNPUKernel::SetNPUInputs(const std::vector<lite::Tensor *> &inputs,
+                                      const std::vector<lite::Tensor *> &outputs,
+                                      const std::vector<ge::Operator *> &npu_inputs) {
+  op_ = new (std::nothrow) hiai::op::ExpandDims(name_);
+  if (op_ != nullptr) {
+    op_->set_input_x(*npu_inputs[0]);
+    op_->set_input_axis(*npu_inputs[1]);
+    return RET_OK;
+  }
+  MS_LOG(ERROR) << name_ << " op is nullptr";
+  return RET_ERROR;
+}
+// Returns the ExpandDims operator created by SetNPUInputs (nullptr before that call).
+ge::Operator *ExpandDimsNPUKernel::GetNPUOp() { return op_; }
+
+// Frees the operator allocated in SetNPUInputs, if any.
+ExpandDimsNPUKernel::~ExpandDimsNPUKernel() {
+  // delete on a null pointer is a no-op, so the explicit guard is unnecessary.
+  delete op_;
+  op_ = nullptr;
+}
+REG_KERNEL(kNPU, kNumberTypeFloat32, PrimitiveType_ExpandDims, NPUKernelCreator<ExpandDimsNPUKernel>)
+}  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/npu/expand_dims_npu.h b/mindspore/lite/src/runtime/kernel/npu/expand_dims_npu.h
new file mode 100644
index 0000000000..68710305ca
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/npu/expand_dims_npu.h
@@ -0,0 +1,42 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_NPU_EXPAND_DIMS_NPU_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_NPU_EXPAND_DIMS_NPU_H_
+#include <vector>
+#include "src/runtime/kernel/npu/npu_kernel.h"
+#include "include/graph/op/all_ops.h"
+
+namespace mindspore::kernel {
+// NPU kernel that wraps the HiAI ExpandDims operator.
+class ExpandDimsNPUKernel : public NPUKernel {
+ public:
+  ExpandDimsNPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
+                      const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
+      : NPUKernel(parameter, inputs, outputs, ctx) {}
+  ~ExpandDimsNPUKernel() override;
+
+  int IsSupport(const std::vector<lite::Tensor *> &inputs, const std::vector<lite::Tensor *> &outputs,
+                OpParameter *opParameter) override;
+  int SetNPUInputs(const std::vector<lite::Tensor *> &inputs, const std::vector<lite::Tensor *> &outputs,
+                   const std::vector<ge::Operator *> &npu_inputs) override;
+  ge::Operator *GetNPUOp() override;
+
+ private:
+  hiai::op::ExpandDims *op_ = nullptr;  // allocated in SetNPUInputs, released in the destructor
+};
+}  // namespace mindspore::kernel
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_NPU_EXPAND_DIMS_NPU_H_
diff --git a/mindspore/lite/src/runtime/kernel/npu/npu_kernel.h b/mindspore/lite/src/runtime/kernel/npu/npu_kernel.h
index 77ec44ebae..546af30166 100644
--- a/mindspore/lite/src/runtime/kernel/npu/npu_kernel.h
+++ b/mindspore/lite/src/runtime/kernel/npu/npu_kernel.h
@@ -18,6 +18,8 @@
 #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_NPU_KERNEL_NPU_H_
 
 #include <vector>
+#include <unordered_map>
+#include <utility>
 #include "src/lite_kernel.h"
 #include "include/errorcode.h"
 #include "include/graph/graph.h"
@@ -46,6 +48,14 @@ class NPUKernel : public LiteKernel {
   virtual int SetNPUInputs(const std::vector<mindspore::lite::Tensor *> &inputs,
                            const std::vector<lite::Tensor *> &outputs,
                            const std::vector<ge::Operator *> &npu_inputs) = 0;
+  // Variant for inputs fed by multi-output kernels: index2_multi_out_index maps an input index to
+  // {producer op, producer output index}. Kernels consuming such inputs must override this overload.
+  virtual int SetNPUInputs(const std::vector<mindspore::lite::Tensor *> &inputs,
+                           const std::vector<lite::Tensor *> &outputs, const std::vector<ge::Operator *> &npu_inputs,
+                           const std::unordered_map<int, std::pair<ge::Operator *, int>> &index2_multi_out_index) {
+    // No override means the selected outputs cannot be wired: fail loudly instead of silently skipping.
+    return index2_multi_out_index.empty() ? SetNPUInputs(inputs, outputs, npu_inputs) : mindspore::lite::RET_ERROR;
+  }
 };
 template <class T>
 kernel::LiteKernel *NPUKernelCreator(const std::vector<lite::Tensor *> &inputs,
diff --git a/mindspore/lite/src/runtime/kernel/npu/tile_npu.cc b/mindspore/lite/src/runtime/kernel/npu/tile_npu.cc
new file mode 100644
index 0000000000..7153f9eb01
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/npu/tile_npu.cc
@@ -0,0 +1,80 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "src/runtime/kernel/npu/tile_npu.h"
+#include <memory>
+#include "include/graph/op/all_ops.h"
+#include "src/kernel_registry.h"
+#include "src/runtime/agent/npu/npu_converter_utils.h"
+
+using mindspore::kernel::KERNEL_ARCH::kNPU;
+using mindspore::lite::KernelRegistrar;
+using mindspore::schema::PrimitiveType_TileFusion;
+
+namespace mindspore::kernel {
+// Accepts the node only when it has exactly two inputs and the second one holds at most four readable
+// multiples; on success the multiples and their count are cached in param_.
+int TileNPUKernel::IsSupport(const std::vector<lite::Tensor *> &inputs, const std::vector<lite::Tensor *> &outputs,
+                             OpParameter *opParameter) {
+  if (inputs.size() != 2 || inputs[1]->ElementsNum() > 4) {
+    return RET_ERROR;
+  }
+  const auto count = inputs[1]->ElementsNum();
+  auto *values = reinterpret_cast<int *>(inputs[1]->data_c());
+  if (values == nullptr) {
+    return RET_ERROR;
+  }
+  // The tensor data is read as int, matching the ge::DT_INT32 constant built in SetNPUInputs.
+  for (int idx = 0; idx < count; ++idx) {
+    param_->multiples_[idx] = values[idx];
+  }
+  param_->multiples_size_ = static_cast<size_t>(count);
+  return RET_OK;
+}
+
+int TileNPUKernel::SetNPUInputs(const std::vector<lite::Tensor *> &inputs, const std::vector<lite::Tensor *> &outputs,
+                                const std::vector<ge::Operator *> &npu_inputs) {
+  // Allocate both operators up front with nothrow new so a failed allocation is reported, not thrown.
+  op_ = new (std::nothrow) hiai::op::Tile(name_);
+  multiple_ = new (std::nothrow) hiai::op::Const(name_ + "multiples");
+  if (op_ == nullptr || multiple_ == nullptr) {
+    MS_LOG(ERROR) << name_ << " op is nullptr";
+    return RET_ERROR;
+  }
+  op_->set_input_x(*npu_inputs[0]);
+  ge::TensorDesc multiple_tensor_desc(ge::Shape({static_cast<int64_t>(param_->multiples_size_)}), ge::FORMAT_NCHW,
+                                      ge::DT_INT32);
+  ge::TensorPtr multiple_tensor = std::make_shared<hiai::Tensor>(multiple_tensor_desc);
+  multiple_tensor->SetData(reinterpret_cast<uint8_t *>(param_->multiples_), param_->multiples_size_ * sizeof(int));
+  multiple_->set_attr_value(multiple_tensor);
+  op_->set_input_multiples(*multiple_);
+  return RET_OK;
+}
+// Returns the Tile operator created by SetNPUInputs (nullptr before that call).
+ge::Operator *TileNPUKernel::GetNPUOp() { return op_; }
+
+// Releases the Tile operator and the Const operator that holds the multiples.
+TileNPUKernel::~TileNPUKernel() {
+  // delete ignores null pointers, so neither member needs an explicit guard.
+  // Release order is unchanged: the Tile operator first, then its Const input.
+  delete op_;
+  op_ = nullptr;
+
+  delete multiple_;
+  multiple_ = nullptr;
+}
+REG_KERNEL(kNPU, kNumberTypeFloat32, PrimitiveType_TileFusion, NPUKernelCreator<TileNPUKernel>)
+}  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/npu/tile_npu.h b/mindspore/lite/src/runtime/kernel/npu/tile_npu.h
new file mode 100644
index 0000000000..9975372edd
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/npu/tile_npu.h
@@ -0,0 +1,47 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_NPU_TILE_NPU_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_NPU_TILE_NPU_H_
+#include <vector>
+#include "src/runtime/kernel/npu/npu_kernel.h"
+#include "include/graph/op/all_ops.h"
+#include "nnacl/base/tile_base.h"
+
+namespace mindspore::kernel {
+// NPU kernel that wraps the HiAI Tile operator; the multiples are supplied through a Const operator.
+class TileNPUKernel : public NPUKernel {
+ public:
+  // param_ aliases the incoming OpParameter, reinterpreted as TileParameter.
+  TileNPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
+                const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
+      : NPUKernel(parameter, inputs, outputs, ctx), param_(reinterpret_cast<TileParameter *>(parameter)) {}
+  ~TileNPUKernel() override;
+
+  int IsSupport(const std::vector<lite::Tensor *> &inputs, const std::vector<lite::Tensor *> &outputs,
+                OpParameter *opParameter) override;
+  int SetNPUInputs(const std::vector<lite::Tensor *> &inputs, const std::vector<lite::Tensor *> &outputs,
+                   const std::vector<ge::Operator *> &npu_inputs) override;
+
+  ge::Operator *GetNPUOp() override;
+
+ private:
+  hiai::op::Tile *op_ = nullptr;
+  hiai::op::Const *multiple_ = nullptr;
+  TileParameter *param_ = nullptr;
+};
+}  // namespace mindspore::kernel
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_NPU_TILE_NPU_H_
diff --git a/mindspore/lite/src/runtime/kernel/npu/transpose_npu.cc b/mindspore/lite/src/runtime/kernel/npu/transpose_npu.cc
index c4ac33c6b3..26717ee2e8 100644
--- a/mindspore/lite/src/runtime/kernel/npu/transpose_npu.cc
+++ b/mindspore/lite/src/runtime/kernel/npu/transpose_npu.cc
@@ -33,10 +33,12 @@ int TransposeNPUKernel::IsSupport(const std::vector<lite::Tensor *> &inputs, con
       perm_.push_back(static_cast<int *>(inputs[1]->data_c())[i]);
     }
   } else {
-    MS_LOG(WARNING) << "NPU perm is attribute.";
+    MS_LOG(WARNING) << "NPU perm is attribute or input[1] data nullptr";
     return RET_ERROR;
   }
-
+  if (inputs[1]->ElementsNum() != 4) {
+    return RET_ERROR;  // reject any perm that does not have exactly four entries
+  }
   return RET_OK;
 }