From a2cdf589e723563036b4c8d34f7a7b283b7d99ce Mon Sep 17 00:00:00 2001
From: zhaozhenlong
Date: Tue, 27 Apr 2021 17:30:10 +0800
Subject: [PATCH] fix npu reuse input tensor

---
 .../lite/src/runtime/agent/npu/npu_executor.cc | 18 ++++++++++++++----
 .../lite/src/runtime/agent/npu/npu_executor.h  |  2 +-
 .../runtime/agent/npu/subgraph_npu_kernel.cc   |  2 +-
 3 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/mindspore/lite/src/runtime/agent/npu/npu_executor.cc b/mindspore/lite/src/runtime/agent/npu/npu_executor.cc
index 384e61b71a..1389eba21b 100644
--- a/mindspore/lite/src/runtime/agent/npu/npu_executor.cc
+++ b/mindspore/lite/src/runtime/agent/npu/npu_executor.cc
@@ -15,6 +15,7 @@
  */
 
 #include "src/runtime/agent/npu/npu_executor.h"
+#include <unordered_map>
 #include "include/errorcode.h"
 #include "src/runtime/agent/npu/npu_manager.h"
 #include "nnacl/pack.h"
@@ -100,15 +101,24 @@ bool IsSameShapeOutTensor(Tensor *tensor, std::shared_ptr<hiai::AiTensor> npu_te
 }
 
 int NPUExecutor::Run(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors,
-                     const std::vector<kernel::LiteKernel *> &out_kernels,
+                     const std::vector<kernel::LiteKernel *> &in_kernels,
                      const std::vector<kernel::LiteKernel *> &kernels, Allocator *allocator,
                      const KernelCallBack &before, const KernelCallBack &after) {
   hiai::AiContext context;
-  std::vector<bool> inputs_visited(in_tensors.size(), false);
+  std::unordered_map<Tensor *, int> tensor_uses;
+  for (const auto ker : in_kernels) {
+    for (const auto ker_input : ker->in_tensors()) {
+      if (tensor_uses.find(ker_input) == tensor_uses.end()) {
+        tensor_uses.insert({ker_input, 1});
+      } else {
+        tensor_uses[ker_input]++;
+      }
+    }
+  }
   for (int i = 0; i < npu_input_tensors_.size(); ++i) {
     int index = 0;
     for (; index < in_tensors.size(); index++) {
-      if (!inputs_visited[index] && IsSameShapeInTensor(in_tensors[index], npu_input_tensors_[i])) {
+      if (tensor_uses[in_tensors[index]] > 0 && IsSameShapeInTensor(in_tensors[index], npu_input_tensors_[i])) {
         void *data = in_tensors[index]->data_c();
         if (data == nullptr) {
           MS_LOG(ERROR) << "For " << model_name_ << ", the " << i << "th input data is nullptr";
@@ -116,7 +126,7 @@ int NPUExecutor::Run(const std::vector<Tensor *> &in_tensors, const std::vector<
         }
 
         memcpy(npu_input_tensors_[i]->GetBuffer(), data, in_tensors[index]->Size());
-        inputs_visited[index] = true;
+        tensor_uses[in_tensors[index]]--;
         in_tensors[index]->DecRefCount();
         break;
       }
diff --git a/mindspore/lite/src/runtime/agent/npu/npu_executor.h b/mindspore/lite/src/runtime/agent/npu/npu_executor.h
index ad325f2934..dc65001ffc 100644
--- a/mindspore/lite/src/runtime/agent/npu/npu_executor.h
+++ b/mindspore/lite/src/runtime/agent/npu/npu_executor.h
@@ -36,7 +36,7 @@ class NPUExecutor : public Executor {
   int Prepare(const std::vector<kernel::LiteKernel *> &kernels) override;
 
   int Run(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors,
-          const std::vector<kernel::LiteKernel *> &out_kernels, const std::vector<kernel::LiteKernel *> &kernels,
+          const std::vector<kernel::LiteKernel *> &in_kernels, const std::vector<kernel::LiteKernel *> &kernels,
           Allocator *allocator = nullptr, const KernelCallBack &before = nullptr,
           const KernelCallBack &after = nullptr);
 
diff --git a/mindspore/lite/src/runtime/agent/npu/subgraph_npu_kernel.cc b/mindspore/lite/src/runtime/agent/npu/subgraph_npu_kernel.cc
index 468ac2aa24..15a3da9138 100644
--- a/mindspore/lite/src/runtime/agent/npu/subgraph_npu_kernel.cc
+++ b/mindspore/lite/src/runtime/agent/npu/subgraph_npu_kernel.cc
@@ -90,7 +90,7 @@ std::shared_ptr<domi::ModelBufferData> SubGraphNpuKernel::BuildIRModel() {
 
 int SubGraphNpuKernel::Run() {
   return reinterpret_cast<lite::NPUExecutor *>(this->executor_)
-    ->Run(in_tensors_, out_tensor_sorted_, out_nodes_, nodes_);
+    ->Run(in_tensors_, out_tensor_sorted_, in_nodes_, nodes_);
 }
 
 int SubGraphNpuKernel::BuildNPUInputOp() {
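
Note on the change: the old Run() marked each lite input tensor as visited after its first match, so a tensor could fill at most one NPU input buffer and any further consumer of the same tensor was skipped. The patch instead counts, per tensor, how many in-kernels consume it and decrements that count on each copy, so a reused input tensor stays eligible until all of its consumers have been served. Below is a minimal standalone sketch of the same counting pattern; FakeTensor and FakeKernel are illustrative placeholders, not the MindSpore Lite Tensor/LiteKernel classes.

    #include <cstdio>
    #include <unordered_map>
    #include <vector>

    // Illustrative placeholder types (not MindSpore Lite classes).
    struct FakeTensor {
      int id;
    };

    struct FakeKernel {
      std::vector<FakeTensor *> inputs;
    };

    int main() {
      FakeTensor a{0};
      FakeTensor b{1};
      FakeKernel k1{{&a, &b}};
      FakeKernel k2{{&a}};  // tensor `a` is consumed by a second kernel
      std::vector<FakeKernel *> in_kernels = {&k1, &k2};

      // Count how many consumers each input tensor has, mirroring the
      // tensor_uses map the patch builds from in_kernels.
      std::unordered_map<FakeTensor *, int> tensor_uses;
      for (const auto ker : in_kernels) {
        for (const auto ker_input : ker->inputs) {
          ++tensor_uses[ker_input];  // operator[] value-initializes missing counts to 0
        }
      }

      // A tensor remains a candidate for filling an NPU input buffer while its
      // count is positive; each copy would then decrement it (tensor_uses[t]--).
      for (const auto &entry : tensor_uses) {
        std::printf("tensor %d: %d use(s)\n", entry.first->id, entry.second);
      }
      return 0;
    }

The diff leaves the shape matching (IsSameShapeInTensor) and the per-copy DecRefCount untouched; only the bookkeeping that decides whether a tensor may still be matched changes.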