| @@ -95,6 +95,8 @@ class LiteKernel { | |||
| virtual int Init() { return mindspore::lite::RET_ERROR; } | |||
| OpParameter *op_parameter() { return op_parameter_; } | |||
| std::string name() const { return this->name_; } | |||
| virtual int Train() { | |||
| @@ -479,12 +479,6 @@ int LiteSession::Init(const Context *context) { | |||
| is_running_.store(false); | |||
| return ret; | |||
| } | |||
| ret = InitNPURuntime(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Init NPU runtime failed."; | |||
| is_running_.store(false); | |||
| return ret; | |||
| } | |||
| executor_ = new (std::nothrow) Executor(); | |||
| if (nullptr == executor_) { | |||
| MS_LOG(ERROR) << "New Executor failed"; | |||
| @@ -661,18 +655,6 @@ int LiteSession::Resize(const std::vector<mindspore::tensor::MSTensor *> &inputs | |||
| return RET_OK; | |||
| } | |||
| int LiteSession::InitNPURuntime() { | |||
| #if SUPPORT_NPU | |||
| if (this->context_->IsNpuEnabled()) { | |||
| if (mindspore::lite::NPUManager::GetInstance()->InitClient() != RET_OK) { | |||
| MS_LOG(ERROR) << "NPU client init error."; | |||
| return RET_ERROR; | |||
| } | |||
| } | |||
| #endif | |||
| return RET_OK; | |||
| } | |||
| int LiteSession::InitGPURuntime() { | |||
| #if SUPPORT_GPU | |||
| if (this->context_->IsGpuEnabled()) { | |||
| @@ -103,8 +103,6 @@ class LiteSession : public session::LiteSession { | |||
| private: | |||
| void ResetInputsShape(const std::vector<std::vector<int>> &dims); | |||
| int InitNPURuntime(); | |||
| int InitGPURuntime(); | |||
| protected: | |||
| @@ -17,10 +17,9 @@ | |||
| #include "src/runtime/agent/npu/npu_executor.h" | |||
| #include "include/errorcode.h" | |||
| #include "src/runtime/agent/npu/npu_manager.h" | |||
| #include "nnacl/pack.h" | |||
| namespace mindspore::lite { | |||
| int NPUExecutor::Prepare(const std::vector<kernel::LiteKernel *> &kernels) { | |||
| this->client_ = mindspore::lite::NPUManager::GetInstance()->GetClient(); | |||
| this->client_ = mindspore::lite::NPUManager::GetInstance()->GetClient(model_name_); | |||
| if (this->client_ == nullptr) { | |||
| MS_LOG(ERROR) << "client is nullptr."; | |||
| return RET_ERROR; | |||
| @@ -33,9 +32,8 @@ int NPUExecutor::Prepare(const std::vector<kernel::LiteKernel *> &kernels) { | |||
| } | |||
| int NPUExecutor::Run(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors, | |||
| const std::vector<kernel::LiteKernel *> &kernels, const std::vector<bool> &inputs_nhwc2nchw, | |||
| const std::vector<bool> &outputs_nchw2nhwc, Allocator *allocator, const KernelCallBack &before, | |||
| const KernelCallBack &after) { | |||
| const std::vector<kernel::LiteKernel *> &kernels, Allocator *allocator, | |||
| const KernelCallBack &before, const KernelCallBack &after) { | |||
| hiai::AiContext context; | |||
| for (int i = 0; i < npu_input_tensors_.size(); ++i) { | |||
| void *data = in_tensors[i]->data_c(); | |||
| @@ -43,12 +41,7 @@ int NPUExecutor::Run(const std::vector<Tensor *> &in_tensors, const std::vector< | |||
| MS_LOG(ERROR) << model_name_ << " inputs data is nullptr"; | |||
| return RET_ERROR; | |||
| } | |||
| if (inputs_nhwc2nchw[i]) { | |||
| PackNHWCToNCHWFp32(data, npu_input_tensors_[i]->GetBuffer(), in_tensors[i]->Batch(), | |||
| in_tensors[i]->Width() * in_tensors[i]->Height(), in_tensors[i]->Channel()); | |||
| } else { | |||
| memcpy(npu_input_tensors_[i]->GetBuffer(), data, in_tensors[i]->Size()); | |||
| } | |||
| memcpy(npu_input_tensors_[i]->GetBuffer(), data, in_tensors[i]->Size()); | |||
| } | |||
| context.AddPara("model_name", model_name_); | |||
| if (this->client_ == nullptr) { | |||
| @@ -68,12 +61,7 @@ int NPUExecutor::Run(const std::vector<Tensor *> &in_tensors, const std::vector< | |||
| MS_LOG(ERROR) << "Malloc buffer failed."; | |||
| return RET_ERROR; | |||
| } | |||
| if (outputs_nchw2nhwc[i]) { | |||
| PackNCHWToNHWCFp32(npu_output_tensors_[i]->GetBuffer(), data, out_tensors[i]->Batch(), | |||
| out_tensors[i]->Width() * out_tensors[i]->Height(), out_tensors[i]->Channel()); | |||
| } else { | |||
| memcpy(data, npu_output_tensors_[i]->GetBuffer(), npu_output_tensors_[i]->GetSize()); | |||
| } | |||
| memcpy(data, npu_output_tensors_[i]->GetBuffer(), npu_output_tensors_[i]->GetSize()); | |||
| out_tensors[i]->ResetRefCount(); | |||
| } | |||
| return RET_OK; | |||
| @@ -32,8 +32,7 @@ class NPUExecutor : public Executor { | |||
| int Prepare(const std::vector<kernel::LiteKernel *> &kernels) override; | |||
| int Run(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors, | |||
| const std::vector<kernel::LiteKernel *> &kernels, const std::vector<bool> &inputs_nhwc2nchw, | |||
| const std::vector<bool> &outputs_nchw2nhwc, Allocator *allocator = nullptr, | |||
| const std::vector<kernel::LiteKernel *> &kernels, Allocator *allocator = nullptr, | |||
| const KernelCallBack &before = nullptr, const KernelCallBack &after = nullptr); | |||
| private: | |||
| @@ -0,0 +1,224 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "src/runtime/agent/npu/npu_fusion_pass.h" | |||
| #include <vector> | |||
| #include "src/lite_kernel.h" | |||
| #include "nnacl/concat_parameter.h" | |||
| namespace mindspore::lite { | |||
| bool CheckFusion(kernel::LiteKernel *kernel) { | |||
| auto pre_flag = | |||
| std::all_of(kernel->in_kernels().begin(), kernel->in_kernels().end(), [](const kernel::LiteKernel *kernel) { | |||
| return kernel->Type() == schema::PrimitiveType_Nchw2Nhwc && kernel->out_kernels().size() == 1; | |||
| }); | |||
| if (!pre_flag) { | |||
| return false; | |||
| } | |||
| auto post_flag = | |||
| std::all_of(kernel->out_kernels().begin(), kernel->out_kernels().end(), [](const kernel::LiteKernel *kernel) { | |||
| return kernel->Type() == schema::PrimitiveType_Nhwc2Nchw && kernel->in_kernels().size() == 1; | |||
| }); | |||
| return post_flag; | |||
| } | |||
| void NPUFusionPass::UpdatePreKernels(kernel::LiteKernel *cur_kernel) { | |||
| for (auto in_kernel : cur_kernel->in_kernels()) { | |||
| auto pre_kernel = in_kernel->in_kernels()[0]; | |||
| auto pre_out_kernels = pre_kernel->out_kernels(); | |||
| for (size_t i = 0; i < pre_out_kernels.size(); i++) { | |||
| if (pre_out_kernels[i] == in_kernel) { | |||
| pre_out_kernels[i] = cur_kernel; | |||
| break; | |||
| } | |||
| } | |||
| pre_kernel->set_out_kernels(pre_out_kernels); | |||
| auto cur_in_kernels = cur_kernel->in_kernels(); | |||
| for (size_t i = 0; i < cur_in_kernels.size(); i++) { | |||
| if (cur_in_kernels[i] == in_kernel) { | |||
| cur_in_kernels[i] = pre_kernel; | |||
| break; | |||
| } | |||
| } | |||
| cur_kernel->set_in_kernels(cur_in_kernels); | |||
| kernels->erase(find(kernels->begin(), kernels->end(), in_kernel)); | |||
| } | |||
| } | |||
| void NPUFusionPass::UpdatePostKernels(kernel::LiteKernel *cur_kernel) { | |||
| for (auto out_kernel : cur_kernel->out_kernels()) { | |||
| auto post_kernel = out_kernel->out_kernels()[0]; | |||
| auto post_in_kernels = post_kernel->in_kernels(); | |||
| for (size_t i = 0; i < post_in_kernels.size(); i++) { | |||
| if (post_in_kernels[i] == out_kernel) { | |||
| post_in_kernels[i] = cur_kernel; | |||
| break; | |||
| } | |||
| } | |||
| post_kernel->set_in_kernels(post_in_kernels); | |||
| auto cur_out_kernels = cur_kernel->out_kernels(); | |||
| for (size_t i = 0; i < cur_out_kernels.size(); i++) { | |||
| if (cur_out_kernels[i] == out_kernel) { | |||
| cur_out_kernels[i] = post_kernel; | |||
| break; | |||
| } | |||
| } | |||
| cur_kernel->set_out_kernels(cur_out_kernels); | |||
| kernels->erase(find(kernels->begin(), kernels->end(), out_kernel)); | |||
| } | |||
| } | |||
| void UpdatePreTensors(kernel::LiteKernel *cur_kernel) { | |||
| auto tensors_vec = cur_kernel->in_tensors(); | |||
| for (auto in_kernel : cur_kernel->in_kernels()) { | |||
| lite::Tensor *cur_tensor = nullptr; | |||
| auto in_tensor = in_kernel->in_tensors()[0]; | |||
| auto out_tensor = in_kernel->out_tensors()[0]; | |||
| auto pre_kernel = in_kernel->in_kernels()[0]; | |||
| for (size_t i = 0; i < pre_kernel->out_tensors().size(); i++) { | |||
| if (pre_kernel->out_tensors()[i] == in_tensor) { | |||
| cur_tensor = pre_kernel->out_tensors()[i]; | |||
| } | |||
| } | |||
| for (size_t i = 0; i < tensors_vec.size(); i++) { | |||
| if (tensors_vec[i] == out_tensor) { | |||
| tensors_vec[i] = cur_tensor; | |||
| } | |||
| } | |||
| } | |||
| cur_kernel->set_in_tensors(tensors_vec); | |||
| } | |||
| void UpdatePostTensors(kernel::LiteKernel *cur_kernel) { | |||
| auto tensors_vec = cur_kernel->out_tensors(); | |||
| for (auto out_kernel : cur_kernel->out_kernels()) { | |||
| auto in_tensor = out_kernel->in_tensors()[0]; | |||
| auto out_tensor = out_kernel->out_tensors()[0]; | |||
| auto post_kernel = out_kernel->out_kernels()[0]; | |||
| lite::Tensor *cur_tensor = nullptr; | |||
| for (size_t i = 0; i < post_kernel->in_tensors().size(); i++) { | |||
| if (post_kernel->in_tensors()[i] == out_tensor) { | |||
| cur_tensor = post_kernel->in_tensors()[i]; | |||
| } | |||
| } | |||
| for (size_t i = 0; i < tensors_vec.size(); i++) { | |||
| if (tensors_vec[i] == in_tensor) { | |||
| tensors_vec[i] = cur_tensor; | |||
| } | |||
| } | |||
| } | |||
| cur_kernel->set_out_tensors(tensors_vec); | |||
| } | |||
// Maps an NHWC axis index to its NCHW equivalent: N->0, H->2, W->3, C->1.
// Both 3 and -1 denote the channel axis. Any other value yields the
// sentinel -2 (unsupported axis).
int TransFormAxis(int axis) {
  if (axis == 0) {
    return 0;
  }
  if (axis == 1) {
    return 2;
  }
  if (axis == 2) {
    return 3;
  }
  if (axis == 3 || axis == -1) {
    return 1;
  }
  return -2;
}
| int NPUFusionPass::AddFusion(kernel::LiteKernel *kernel) { | |||
| if (!CheckFusion(kernel)) { | |||
| return RET_OK; | |||
| } | |||
| UpdatePreTensors(kernel); | |||
| UpdatePostTensors(kernel); | |||
| UpdatePreKernels(kernel); | |||
| UpdatePostKernels(kernel); | |||
| return RET_OK; | |||
| } | |||
| int NPUFusionPass::ConcatFusion(kernel::LiteKernel *kernel) { | |||
| if (!CheckFusion(kernel)) { | |||
| return RET_OK; | |||
| } | |||
| UpdatePreTensors(kernel); | |||
| UpdatePostTensors(kernel); | |||
| UpdatePreKernels(kernel); | |||
| UpdatePostKernels(kernel); | |||
| auto concat_param = reinterpret_cast<ConcatParameter *>(kernel->op_parameter()); | |||
| concat_param->axis_ = TransFormAxis(concat_param->axis_); | |||
| return RET_OK; | |||
| } | |||
| int NPUFusionPass::FormatFusion(kernel::LiteKernel *kernel) { | |||
| if (kernel->out_kernels().empty()) { | |||
| return RET_OK; | |||
| } | |||
| if (!std::all_of(kernel->out_kernels().begin(), kernel->out_kernels().end(), [](const kernel::LiteKernel *kernel) { | |||
| return kernel->Type() == schema::PrimitiveType_Nhwc2Nchw; | |||
| })) { | |||
| return RET_OK; | |||
| } | |||
| auto pre_kernel = kernel->in_kernels()[0]; | |||
| auto pre_out_kernels = pre_kernel->out_kernels(); | |||
| for (size_t i = 0; i < pre_out_kernels.size(); i++) { | |||
| if (pre_out_kernels[i] == kernel) { | |||
| pre_out_kernels.erase(pre_out_kernels.begin() + i); | |||
| break; | |||
| } | |||
| } | |||
| for (const auto &nc2nh : kernel->out_kernels()) { | |||
| for (const auto &post_kernel : nc2nh->out_kernels()) { | |||
| auto post_in_kernels = post_kernel->in_kernels(); | |||
| for (size_t i = 0; i < post_in_kernels.size(); i++) { | |||
| if (post_in_kernels[i] == nc2nh) { | |||
| post_in_kernels[i] = pre_kernel; | |||
| break; | |||
| } | |||
| } | |||
| post_kernel->set_in_kernels(post_in_kernels); | |||
| pre_out_kernels.push_back(post_kernel); | |||
| } | |||
| kernels->erase(find(kernels->begin(), kernels->end(), nc2nh)); | |||
| } | |||
| pre_kernel->set_out_kernels(pre_out_kernels); | |||
| kernels->erase(find(kernels->begin(), kernels->end(), kernel)); | |||
| return RET_OK; | |||
| } | |||
| int NPUFusionPass::Fusion() { | |||
| for (auto kernel : *kernels) { | |||
| switch (kernel->Type()) { | |||
| case schema::PrimitiveType_Concat: | |||
| ConcatFusion(kernel); | |||
| continue; | |||
| case schema::PrimitiveType_Add: | |||
| AddFusion(kernel); | |||
| continue; | |||
| case schema::PrimitiveType_Nchw2Nhwc: | |||
| FormatFusion(kernel); | |||
| continue; | |||
| default: | |||
| continue; | |||
| } | |||
| } | |||
| return RET_OK; | |||
| } | |||
| } // namespace mindspore::lite | |||
| @@ -0,0 +1,40 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_LITE_SRC_RUNTIME_AGENT_NPU_NPU_FUSION_PASS_H_ | |||
| #define MINDSPORE_LITE_SRC_RUNTIME_AGENT_NPU_NPU_FUSION_PASS_H_ | |||
| #include <vector> | |||
| #include "src/lite_kernel.h" | |||
| #include "src/ops/primitive_c.h" | |||
| namespace mindspore::lite { | |||
// Graph pass that removes cancelling Nchw2Nhwc/Nhwc2Nchw transpose kernels
// around fusable ops (Concat, Add) and collapses back-to-back transposes.
// Operates in place on the kernel list handed to the constructor.
class NPUFusionPass {
 public:
  // Keeps a raw pointer to the caller-owned kernel list; the pass mutates it.
  explicit NPUFusionPass(std::vector<kernel::LiteKernel *> *dst_kernels) { kernels = dst_kernels; }
  ~NPUFusionPass() = default;
  // Applies every fusion rule over the kernel list; returns RET_OK.
  int Fusion();

 protected:
  // Fuses transposes around a Concat kernel and remaps its axis parameter.
  int ConcatFusion(kernel::LiteKernel *kernel);
  // Fuses transposes around an Add kernel.
  int AddFusion(kernel::LiteKernel *kernel);
  // Cancels a Nchw2Nhwc kernel whose consumers are all Nhwc2Nchw.
  int FormatFusion(kernel::LiteKernel *kernel);
  // Relink helpers: splice transpose kernels out of the in/out edges.
  void UpdatePreKernels(kernel::LiteKernel *kernel);
  void UpdatePostKernels(kernel::LiteKernel *kernel);

 private:
  // Not owned; points at the session's kernel list.
  std::vector<kernel::LiteKernel *> *kernels;
};
| } // namespace mindspore::lite | |||
| #endif // MINDSPORE_LITE_SRC_RUNTIME_AGENT_NPU_NPU_FUSION_PASS_H_ | |||
| @@ -15,57 +15,65 @@ | |||
| */ | |||
| #include "src/runtime/agent/npu/npu_manager.h" | |||
| #include <sys/system_properties.h> | |||
| #include <sys/fcntl.h> | |||
| #include <unistd.h> | |||
| #include "include/hiai_ir_build.h" | |||
| #include "include/HiAiModelManagerService.h" | |||
| #include "include/errorcode.h" | |||
| #include "include/graph/op/all_ops.h" | |||
| #include "src/common/file_utils.h" | |||
| namespace mindspore::lite { | |||
| #define MAX_MODEL_NUM 20 | |||
| int NPUManager::CompareVersion(const string &version1, const string &version2) { | |||
| std::istringstream iss1(version1); | |||
| std::istringstream iss2(version2); | |||
| string string1; | |||
| string string2; | |||
| while (!iss1.eof() || !iss2.eof()) { | |||
| getline(iss1, string1, '.'); | |||
| getline(iss2, string2, '.'); | |||
| if (stoi(string1) > stoi(string2)) return 1; | |||
| if (stoi(string1) < stoi(string2)) return -1; | |||
| string1 = string2 = "0"; | |||
| } | |||
| return 0; | |||
| } | |||
| bool NPUManager::IsSupportNPU() { | |||
| if (!is_npu_check_executor) { | |||
| CheckSupportNPU(); | |||
| } | |||
| if (is_support_npu) { | |||
| MS_LOG(INFO) << "The current device support NPU."; | |||
| return true; | |||
| } else { | |||
| MS_LOG(INFO) << "The current device NOT SUPPORT NPU."; | |||
| return false; | |||
| bool NPUManager::CheckEMUIVersion() { | |||
| char emui[128] = {0x00}; | |||
| __system_property_get("ro.build.version.emui", emui); | |||
| std::string emui_str = emui; | |||
| int pos = emui_str.find('_'); | |||
| if (pos != std::string::npos) { | |||
| auto version = emui_str.substr(pos + 1); | |||
| int ret = CompareVersion(version, "11.0.0"); | |||
| if (ret < 0) { | |||
| return false; | |||
| } | |||
| } | |||
| return true; | |||
| } | |||
| std::string NPUManager::GetExecutorPath() { | |||
| std::string executor_path; | |||
| char cmdline[1024] = {0}; | |||
| int fd = open("/proc/self/cmdline", O_RDONLY); | |||
| if (fd >= 0) { | |||
| char ch; | |||
| int i = 0; | |||
| while (read(fd, &ch, sizeof(ch)) > 0 && !isspace(ch)) { | |||
| if (':' == ch) { | |||
| break; | |||
| } | |||
| cmdline[i] = ch; | |||
| i++; | |||
| bool NPUManager::CheckDDKVersion() { | |||
| auto client = std::make_shared<hiai::AiModelMngerClient>(); | |||
| if (client->GetVersion() != nullptr) { | |||
| std::string version = client->GetVersion(); | |||
| int ret = CompareVersion(version, "100.330.010.011"); | |||
| if (ret < 0) { | |||
| return false; | |||
| } | |||
| close(fd); | |||
| } | |||
| executor_path = std::string(cmdline); | |||
| if (executor_path.empty()) { | |||
| executor_path = "./"; | |||
| } | |||
| // android | |||
| if (executor_path.substr(0, 11) == "/data/data/") { | |||
| executor_path = executor_path + '/'; | |||
| return true; | |||
| } | |||
| bool NPUManager::IsSupportNPU() { | |||
| if (IsKirinChip() && CheckEMUIVersion() && CheckDDKVersion()) { | |||
| MS_LOG(INFO) << "The current device support NPU."; | |||
| return true; | |||
| } else { | |||
| // Linux | |||
| executor_path = executor_path.substr(0, executor_path.rfind('/')) + "/"; | |||
| MS_LOG(INFO) << "The current device NOT SUPPORT NPU."; | |||
| return false; | |||
| } | |||
| return executor_path; | |||
| } | |||
| bool NPUManager::IsKirinChip() { | |||
| @@ -96,86 +104,6 @@ bool NPUManager::IsKirinChip() { | |||
| return false; | |||
| } | |||
| bool WriteToOMFile(domi::ModelBufferData om_model_buff, const std::string &om_file_path) { | |||
| FILE *fp; | |||
| fp = fopen(om_file_path.c_str(), "wb"); | |||
| if (fp == nullptr) { | |||
| MS_LOG(ERROR) << om_file_path.c_str() << " open failed."; | |||
| return false; | |||
| } | |||
| auto write_size = (uint32_t)fwrite(om_model_buff.data, 1, om_model_buff.length, fp); | |||
| if (write_size != om_model_buff.length) { | |||
| fclose(fp); | |||
| MS_LOG(ERROR) << "Write om file failed."; | |||
| return false; | |||
| } | |||
| fclose(fp); | |||
| return true; | |||
| } | |||
| bool NPUManager::CheckOmBuildIr(const std::string &path) { | |||
| // build test om model | |||
| std::shared_ptr<hiai::op::Add> add_op(new (std::nothrow) hiai::op::Add("add")); | |||
| if (add_op == nullptr) { | |||
| MS_LOG(ERROR) << "new add_op failed."; | |||
| return false; | |||
| } | |||
| ge::TensorDesc desc(ge::Shape({1}), ge::FORMAT_NCHW, ge::DT_FLOAT); | |||
| std::shared_ptr<hiai::op::Data> data = std::make_shared<hiai::op::Data>("data"); | |||
| data->update_input_desc_x(desc); | |||
| add_op->set_input_x1(*data); | |||
| add_op->set_input_x2(*data); | |||
| domi::HiaiIrBuild ir_build; | |||
| ge::Graph ir_graph("graph"); | |||
| std::vector<ge::Operator> inputs{*data, *data}; | |||
| std::vector<ge::Operator> outputs{*add_op}; | |||
| ir_graph.SetInputs(inputs).SetOutputs(outputs); | |||
| ge::Model om_model("test_model", "test_version"); | |||
| om_model.SetGraph(ir_graph); | |||
| domi::ModelBufferData om_model_buff; | |||
| if (!ir_build.CreateModelBuff(om_model, om_model_buff)) { | |||
| MS_LOG(ERROR) << "Create model buffer failed."; | |||
| return false; | |||
| } | |||
| if (!ir_build.BuildIRModel(om_model, om_model_buff)) { | |||
| MS_LOG(ERROR) << "Build IR model failed."; | |||
| return false; | |||
| } | |||
| // save test om model | |||
| remove(path.c_str()); | |||
| bool ret = WriteToOMFile(om_model_buff, path); | |||
| ir_build.ReleaseModelBuff(om_model_buff); | |||
| return ret; | |||
| } | |||
| void NPUManager::CheckSupportNPU() { | |||
| is_npu_check_executor = true; | |||
| std::string path_string = GetExecutorPath(); | |||
| std::string test_model_path = path_string + "/mindspore_lite_test_npu.om"; | |||
| std::ifstream ifs(test_model_path); | |||
| if (ifs.good() && ifs.is_open()) { | |||
| ifs.close(); | |||
| is_support_npu = true; | |||
| return; | |||
| } | |||
| if (!IsKirinChip()) { | |||
| MS_LOG(ERROR) << "The current device chip NOT SUPPORT NPU"; | |||
| is_support_npu = false; | |||
| return; | |||
| } | |||
| if (!CheckOmBuildIr(test_model_path)) { | |||
| MS_LOG(ERROR) << "Build OM IR error."; | |||
| is_support_npu = false; | |||
| return; | |||
| } | |||
| is_support_npu = true; | |||
| } | |||
| int NPUManager::AddModel(void *model_buf, uint32_t size, const std::string &model_name, int frequency) { | |||
| hiai::MemBuffer *buffer = mc_builder_->InputMemBufferCreate(model_buf, size); | |||
| if (buffer == nullptr) { | |||
| @@ -188,33 +116,42 @@ int NPUManager::AddModel(void *model_buf, uint32_t size, const std::string &mode | |||
| model_desc_.push_back(desc); | |||
| mc_builder_->MemBufferDestroy(buffer); | |||
| model_map_.insert({model_name, index_}); | |||
| index_++; | |||
| return RET_OK; | |||
| } | |||
| int NPUManager::InitClient() { | |||
| this->client_ = std::make_shared<hiai::AiModelMngerClient>(); | |||
| if (this->client_ == nullptr) { | |||
| return RET_ERROR; | |||
| } | |||
| int ret = this->client_->Init(nullptr); | |||
| if (ret != hiai::AI_SUCCESS) { | |||
| return RET_ERROR; | |||
| } | |||
| mc_builder_ = std::make_shared<hiai::AiModelBuilder>(this->client_); | |||
| return RET_OK; | |||
| } | |||
| int NPUManager::LoadOMModel() { | |||
| int ret = this->client_->Load(model_desc_); | |||
| if (ret != hiai::AI_SUCCESS) { | |||
| MS_LOG(ERROR) << "Client load model failed." << ret; | |||
| return RET_ERROR; | |||
| for (int i = 0; i < index_ / MAX_MODEL_NUM + 1; i++) { | |||
| auto client = std::make_shared<hiai::AiModelMngerClient>(); | |||
| if (client == nullptr) { | |||
| MS_LOG(ERROR) << "NPU client is nullptr."; | |||
| return RET_ERROR; | |||
| } | |||
| int ret = client->Init(nullptr); | |||
| if (ret != hiai::AI_SUCCESS) { | |||
| MS_LOG(ERROR) << "NPU client init failed. code is " << ret; | |||
| return RET_ERROR; | |||
| } | |||
| mc_builder_ = std::make_shared<hiai::AiModelBuilder>(client); | |||
| vector<std::shared_ptr<hiai::AiModelDescription>> desc(model_desc_.begin() + i * MAX_MODEL_NUM, | |||
| ((i + 1) * MAX_MODEL_NUM > index_) | |||
| ? model_desc_.begin() + index_ | |||
| : model_desc_.begin() + (i + 1) * MAX_MODEL_NUM); | |||
| ret = client->Load(desc); | |||
| if (ret != hiai::AI_SUCCESS) { | |||
| MS_LOG(ERROR) << "Client load model failed." << ret; | |||
| return RET_ERROR; | |||
| } | |||
| clients_.push_back(client); | |||
| } | |||
| return RET_OK; | |||
| } | |||
| std::shared_ptr<hiai::AiModelMngerClient> NPUManager::GetClient() { return client_; } | |||
| std::shared_ptr<hiai::AiModelMngerClient> NPUManager::GetClient(const std::string &model_name) { | |||
| return clients_[model_map_[model_name] / MAX_MODEL_NUM]; | |||
| } | |||
| int NPUManager::index() { return index_; } | |||
| int NPUManager::index() const { return index_; } | |||
| } // namespace mindspore::lite | |||
| @@ -14,15 +14,21 @@ | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_LITE_SRC_RUNTIME_AGENT_NPU_NPU_UTILS_H_ | |||
| #define MINDSPORE_LITE_SRC_RUNTIME_AGENT_NPU_NPU_UTILS_H_ | |||
| #ifndef MINDSPORE_LITE_SRC_RUNTIME_AGENT_NPU_NPU_MANAGER_H_ | |||
| #define MINDSPORE_LITE_SRC_RUNTIME_AGENT_NPU_NPU_MANAGER_H_ | |||
| #include <string> | |||
| #include <memory> | |||
| #include <vector> | |||
| #include <unordered_map> | |||
| #include <set> | |||
| #include "schema/model_generated.h" | |||
| #include "include/HiAiModelManagerService.h" | |||
| namespace mindspore::lite { | |||
| static std::set<mindspore::schema::PrimitiveType> npu_trans_nodes = { | |||
| schema::PrimitiveType_Conv2D, schema::PrimitiveType_DeConv2D, | |||
| schema::PrimitiveType_DepthwiseConv2D, schema::PrimitiveType_DeDepthwiseConv2D, | |||
| schema::PrimitiveType_Resize, schema::PrimitiveType_Pooling}; | |||
| class NPUManager { | |||
| public: | |||
| static NPUManager *GetInstance() { | |||
| @@ -32,8 +38,6 @@ class NPUManager { | |||
| bool IsSupportNPU(); | |||
| int InitClient(); | |||
| // provide to subgraph to add model. | |||
| int AddModel(void *model_buf, uint32_t size, const std::string &model_name, int frequency); | |||
| @@ -41,18 +45,18 @@ class NPUManager { | |||
| int LoadOMModel(); | |||
| // provide to executor. | |||
| std::shared_ptr<hiai::AiModelMngerClient> GetClient(); | |||
| std::shared_ptr<hiai::AiModelMngerClient> GetClient(const std::string &model_name); | |||
| int index(); | |||
| int index() const; | |||
| private: | |||
| void CheckSupportNPU(); | |||
| bool IsKirinChip(); | |||
| bool CheckOmBuildIr(const std::string &path); | |||
| bool CheckEMUIVersion(); | |||
| std::string GetExecutorPath(); | |||
| bool CheckDDKVersion(); | |||
| int CompareVersion(const std::string &version1, const std::string &version2); | |||
| private: | |||
| int index_ = 0; | |||
| @@ -61,12 +65,14 @@ class NPUManager { | |||
| bool is_support_npu = false; | |||
| std::shared_ptr<hiai::AiModelMngerClient> client_ = nullptr; | |||
| std::vector<std::shared_ptr<hiai::AiModelMngerClient>> clients_; | |||
| std::vector<std::shared_ptr<hiai::AiModelDescription>> model_desc_; | |||
| std::shared_ptr<hiai::AiModelBuilder> mc_builder_ = nullptr; | |||
| std::unordered_map<std::string, int> model_map_; | |||
| }; | |||
| } // namespace mindspore::lite | |||
| #endif // MINDSPORE_LITE_SRC_RUNTIME_AGENT_NPU_NPU_UTILS_H_ | |||
| #endif // MINDSPORE_LITE_SRC_RUNTIME_AGENT_NPU_NPU_MANAGER_H_ | |||
| @@ -0,0 +1,102 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "src/kernel_registry.h" | |||
| #include "src/ops/nhwc2nchw.h" | |||
| #include "src/ops/nchw2nhwc.h" | |||
| #include "src/runtime/agent/npu/npu_pass_utils.h" | |||
| namespace mindspore::lite { | |||
| using kernel::KERNEL_ARCH::kCPU; | |||
| using kernel::KERNEL_ARCH::kNPU; | |||
| PrimitiveC *NPUPassUtils::CreateNchw2NhwcPrimitive() { | |||
| flatbuffers::FlatBufferBuilder fbb(1024); | |||
| auto val_offset = schema::CreateNchw2Nhwc(fbb); | |||
| auto prim_offset = schema::CreatePrimitive(fbb, schema::PrimitiveType_Nchw2Nhwc, val_offset.o); | |||
| fbb.Finish(prim_offset); | |||
| auto buf = fbb.GetBufferPointer(); | |||
| if (buf == nullptr) { | |||
| MS_LOG(ERROR) << "GetBufferPointer return nullptr"; | |||
| fbb.Clear(); | |||
| return nullptr; | |||
| } | |||
| auto primitive_buf = reinterpret_cast<char *>(malloc(fbb.GetSize())); | |||
| if (primitive_buf == nullptr) { | |||
| MS_LOG(ERROR) << "Malloc primitive_buf_ failed."; | |||
| fbb.Clear(); | |||
| return nullptr; | |||
| } | |||
| memcpy(primitive_buf, buf, fbb.GetSize()); | |||
| auto *primitive = PrimitiveC::NewPrimitiveC<Nchw2Nhwc>(flatbuffers::GetRoot<schema::Primitive>(primitive_buf)); | |||
| free(primitive_buf); | |||
| fbb.Clear(); | |||
| return primitive; | |||
| } | |||
| PrimitiveC *NPUPassUtils::CreateNhwc2NchwPrimitive() { | |||
| flatbuffers::FlatBufferBuilder fbb(1024); | |||
| auto val_offset = schema::CreateNhwc2Nchw(fbb); | |||
| auto prim_offset = schema::CreatePrimitive(fbb, schema::PrimitiveType_Nhwc2Nchw, val_offset.o); | |||
| fbb.Finish(prim_offset); | |||
| auto buf = fbb.GetBufferPointer(); | |||
| if (buf == nullptr) { | |||
| MS_LOG(ERROR) << "GetBufferPointer return nullptr"; | |||
| fbb.Clear(); | |||
| return nullptr; | |||
| } | |||
| auto primitive_buf = reinterpret_cast<char *>(malloc(fbb.GetSize())); | |||
| if (primitive_buf == nullptr) { | |||
| MS_LOG(ERROR) << "Malloc primitive_buf_ failed."; | |||
| fbb.Clear(); | |||
| return nullptr; | |||
| } | |||
| memcpy(primitive_buf, buf, fbb.GetSize()); | |||
| auto *primitive = PrimitiveC::NewPrimitiveC<Nhwc2Nchw>(flatbuffers::GetRoot<schema::Primitive>(primitive_buf)); | |||
| free(primitive_buf); | |||
| fbb.Clear(); | |||
| return primitive; | |||
| } | |||
| kernel::LiteKernel *NPUPassUtils::CreateNchw2NhwcKernel(const std::vector<Tensor *> &in_tensors, | |||
| const std::vector<Tensor *> &out_tensors, | |||
| const InnerContext *ctx, const std::string &name) { | |||
| kernel::KernelKey key{kCPU, kNumberTypeFloat32, schema::PrimitiveType_Nchw2Nhwc}; | |||
| auto nchw2nhwc_primitive = CreateNchw2NhwcPrimitive(); | |||
| auto *nchw2nhwc_kernel = | |||
| KernelRegistry::GetInstance()->GetKernel(in_tensors, out_tensors, nchw2nhwc_primitive, ctx, key); | |||
| nchw2nhwc_kernel->set_name(name); | |||
| return nchw2nhwc_kernel; | |||
| } | |||
| kernel::LiteKernel *NPUPassUtils::CreateNhwc2NchwKernel(const std::vector<Tensor *> &in_tensors, | |||
| const std::vector<Tensor *> &out_tensors, | |||
| const InnerContext *ctx, const std::string &name) { | |||
| kernel::KernelKey key{kCPU, kNumberTypeFloat32, schema::PrimitiveType_Nhwc2Nchw}; | |||
| auto nhwc2nchw_primitive = CreateNhwc2NchwPrimitive(); | |||
| auto *nhwc2nchw_kernel = | |||
| KernelRegistry::GetInstance()->GetKernel(in_tensors, out_tensors, nhwc2nchw_primitive, ctx, key); | |||
| nhwc2nchw_kernel->set_name(name); | |||
| return nhwc2nchw_kernel; | |||
| } | |||
| void NPUPassUtils::UpdateKernel(kernel::LiteKernel *kernel, const std::vector<kernel::LiteKernel *> &in_kernels, | |||
| const std::vector<kernel::LiteKernel *> &out_kernels, | |||
| const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors) { | |||
| kernel->set_in_tensors(in_tensors); | |||
| kernel->set_out_tensors(out_tensors); | |||
| kernel->set_in_kernels(in_kernels); | |||
| kernel->set_out_kernels(out_kernels); | |||
| } | |||
| } // namespace mindspore::lite | |||
| @@ -0,0 +1,44 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_LITE_SRC_RUNTIME_AGENT_NPU_NPU_PASS_UTILS_H_ | |||
| #define MINDSPORE_LITE_SRC_RUNTIME_AGENT_NPU_NPU_PASS_UTILS_H_ | |||
| #include <vector> | |||
| #include <string> | |||
| #include "src/ops/primitive_c.h" | |||
| #include "src/lite_kernel.h" | |||
| namespace mindspore::lite { | |||
// Stateless helpers used by the NPU graph passes to create format-conversion
// kernels and rewire graph links.
class NPUPassUtils {
 public:
  // Creates a CPU Nchw2Nhwc transpose kernel named `name` over the given
  // tensors. Caller takes ownership of the returned kernel.
  static kernel::LiteKernel *CreateNchw2NhwcKernel(const std::vector<Tensor *> &in_tensors,
                                                   const std::vector<Tensor *> &out_tensors, const InnerContext *ctx,
                                                   const std::string &name);
  // Creates a CPU Nhwc2Nchw transpose kernel named `name` over the given
  // tensors. Caller takes ownership of the returned kernel.
  static kernel::LiteKernel *CreateNhwc2NchwKernel(const std::vector<Tensor *> &in_tensors,
                                                   const std::vector<Tensor *> &out_tensors, const InnerContext *ctx,
                                                   const std::string &name);
  // Replaces the kernel's in/out kernel links and in/out tensor lists.
  static void UpdateKernel(kernel::LiteKernel *kernel, const std::vector<kernel::LiteKernel *> &in_kernels,
                           const std::vector<kernel::LiteKernel *> &out_kernels,
                           const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors);

 private:
  // Flatbuffer round-trips that materialize the transpose primitives.
  static PrimitiveC *CreateNchw2NhwcPrimitive();
  static PrimitiveC *CreateNhwc2NchwPrimitive();
};
| } // namespace mindspore::lite | |||
| #endif // MINDSPORE_LITE_SRC_RUNTIME_AGENT_NPU_NPU_PASS_UTILS_H_ | |||
| @@ -0,0 +1,201 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
#include "src/runtime/agent/npu/npu_transform_pass.h"
#include <new>
#include <vector>
#include "src/lite_kernel.h"
#include "src/runtime/agent/npu/npu_manager.h"
#include "src/runtime/agent/npu/npu_pass_utils.h"
| namespace mindspore::lite { | |||
| using kernel::KERNEL_ARCH::kCPU; | |||
| using kernel::KERNEL_ARCH::kNPU; | |||
| int NPUTransformPass::UpdateNH2NCTransNodePreKernel(kernel::LiteKernel *kernel, kernel::LiteKernel *trans_kernel, | |||
| kernel::LiteKernel *after_kernel) { | |||
| std::vector<kernel::LiteKernel *> out_kernels; | |||
| for (auto out_kernel : kernel->out_kernels()) { | |||
| if (out_kernel == after_kernel) { | |||
| out_kernels.push_back(trans_kernel); | |||
| } else { | |||
| out_kernels.push_back(out_kernel); | |||
| } | |||
| } | |||
| NPUPassUtils::UpdateKernel(kernel, kernel->in_kernels(), out_kernels, kernel->in_tensors(), kernel->out_tensors()); | |||
| return RET_OK; | |||
| } | |||
| int NPUTransformPass::UpdateNH2NCTransNodeAfterKernel(kernel::LiteKernel *kernel, kernel::LiteKernel *trans_kernel, | |||
| kernel::LiteKernel *before_kernel) { | |||
| std::vector<lite::Tensor *> cur_kernel_in_tensors = {trans_kernel->out_tensors()[0]}; | |||
| for (int i = 1; i < kernel->in_tensors().size(); i++) { | |||
| cur_kernel_in_tensors.push_back(kernel->in_tensors()[i]); | |||
| } | |||
| std::vector<kernel::LiteKernel *> cur_in_kernels = {trans_kernel}; | |||
| for (int i = 0; i < kernel->in_kernels().size(); i++) { | |||
| auto in_kernel = kernel->in_kernels()[i]; | |||
| if (in_kernel != kernel) { | |||
| cur_in_kernels.push_back(in_kernel); | |||
| } | |||
| } | |||
| NPUPassUtils::UpdateKernel(kernel, cur_in_kernels, kernel->out_kernels(), cur_kernel_in_tensors, | |||
| kernel->out_tensors()); | |||
| return RET_OK; | |||
| } | |||
| int NPUTransformPass::InsertPreNode(const InnerContext *context, std::vector<kernel::LiteKernel *>::iterator it, | |||
| std::vector<kernel::LiteKernel *> *all_kernels, | |||
| std::vector<Tensor *> *all_tensors) { | |||
| auto kernel = *it; | |||
| bool is_input_kernel = kernel->in_kernels().empty(); | |||
| if (is_input_kernel || kernel->in_kernels()[0]->desc().arch != kNPU || | |||
| npu_trans_nodes.find(kernel->in_kernels()[0]->Type()) == npu_trans_nodes.end()) { | |||
| kernel::LiteKernel *before_kernel = nullptr; | |||
| if (!is_input_kernel) { | |||
| before_kernel = kernel->in_kernels()[0]; | |||
| } | |||
| // Create pre transform kernel out tensors. | |||
| std::vector<int> shapes{kernel->in_tensors()[0]->shape()[0], kernel->in_tensors()[0]->shape()[3], | |||
| kernel->in_tensors()[0]->shape()[1], kernel->in_tensors()[0]->shape()[2]}; | |||
| auto tensor = new Tensor(kernel->in_tensors()[0]->data_type(), shapes, schema::Format_NCHW, Tensor::VAR); | |||
| std::vector<Tensor *> pre_trans_out_tensors = {tensor}; | |||
| all_tensors->push_back(pre_trans_out_tensors[0]); | |||
| // Replace the output tensor of the previous node | |||
| auto name = kernel->name() + "_pre_trans" + "_Nhwc2Nchw_" + std::to_string(total++); | |||
| auto *pre_trans_kernel = | |||
| NPUPassUtils::CreateNhwc2NchwKernel({kernel->in_tensors()[0]}, pre_trans_out_tensors, context, name); | |||
| // Insert Nhwc2Nchw into the front of the current queue | |||
| all_kernels->push_back(pre_trans_kernel); | |||
| // Replace the output kernel of the previous node | |||
| std::vector<kernel::LiteKernel *> pre_trans_in_kernel; | |||
| if (is_input_kernel) { | |||
| pre_trans_in_kernel = {}; | |||
| } else { | |||
| pre_trans_in_kernel = {before_kernel}; | |||
| } | |||
| NPUPassUtils::UpdateKernel(pre_trans_kernel, pre_trans_in_kernel, {kernel}, {kernel->in_tensors()[0]}, | |||
| pre_trans_out_tensors); | |||
| if (before_kernel != nullptr) { | |||
| UpdateNH2NCTransNodePreKernel(before_kernel, pre_trans_kernel, kernel); | |||
| } | |||
| UpdateNH2NCTransNodeAfterKernel(kernel, pre_trans_kernel, before_kernel); | |||
| } | |||
| return RET_OK; | |||
| } | |||
| int NPUTransformPass::InsertPostNode(const InnerContext *context, std::vector<kernel::LiteKernel *>::iterator it, | |||
| std::vector<kernel::LiteKernel *> *all_kernels, | |||
| std::vector<Tensor *> *all_tensors) { | |||
| auto kernel = *it; | |||
| // Single output multiple references | |||
| for (int i = 0; i < kernel->out_kernels().size(); i++) { | |||
| auto next_kernel = kernel->out_kernels().at(i); | |||
| if (next_kernel->desc().arch == kNPU && npu_trans_nodes.find(next_kernel->Type()) != npu_trans_nodes.end()) { | |||
| continue; | |||
| } | |||
| // Change format the output of the current kernel nhwc->nchw | |||
| auto shapes = {kernel->out_tensors()[0]->shape()[0], kernel->out_tensors()[0]->shape()[1], | |||
| kernel->out_tensors()[0]->shape()[2], kernel->out_tensors()[0]->shape()[3]}; | |||
| auto tensor = new Tensor(kernel->out_tensors()[0]->data_type(), shapes, schema::Format_NHWC, Tensor::VAR); | |||
| std::vector<Tensor *> post_trans_out_tensors = {tensor}; | |||
| all_tensors->push_back(post_trans_out_tensors[0]); | |||
| // Use the output tensor of the current node as the input tensor of the post-conversion operator | |||
| auto name = kernel->name() + "_post_trans" + "_Nchw2Nhwc" + std::to_string(total++); | |||
| auto *post_trans_kernel = | |||
| NPUPassUtils::CreateNchw2NhwcKernel(kernel->out_tensors(), post_trans_out_tensors, context, name); | |||
| // Replace the input tensor of the next node | |||
| NPUPassUtils::UpdateKernel(post_trans_kernel, {kernel}, {next_kernel}, kernel->out_tensors(), | |||
| post_trans_out_tensors); | |||
| // Directly insert in the back, will not affect the topological sort | |||
| all_kernels->push_back(post_trans_kernel); | |||
| UpdateNC2NHTransNodePreKernel(kernel, post_trans_kernel, next_kernel); | |||
| UpdateNC2NHTransNodeAfterKernel(kernel, post_trans_kernel, next_kernel); | |||
| } | |||
| return RET_OK; | |||
| } | |||
| int NPUTransformPass::UpdateNC2NHTransNodePreKernel(kernel::LiteKernel *kernel, kernel::LiteKernel *trans_kernel, | |||
| kernel::LiteKernel *next_kernel) { | |||
| std::vector<kernel::LiteKernel *> cur_out_kernels; | |||
| for (auto out_kernel : kernel->out_kernels()) { | |||
| if (out_kernel == next_kernel) { | |||
| cur_out_kernels.push_back(trans_kernel); | |||
| } else { | |||
| cur_out_kernels.push_back(out_kernel); | |||
| } | |||
| } | |||
| auto kernel_out_tensor = kernel->out_tensors()[0]; | |||
| // Change format the output of the current kernel nhwc->nchw | |||
| std::vector<int> kernel_out_new_shapes = {kernel_out_tensor->shape()[0], kernel_out_tensor->shape()[3], | |||
| kernel_out_tensor->shape()[1], kernel_out_tensor->shape()[2]}; | |||
| kernel_out_tensor->set_format(schema::Format_NCHW); | |||
| kernel_out_tensor->set_shape(kernel_out_new_shapes); | |||
| NPUPassUtils::UpdateKernel(kernel, kernel->in_kernels(), cur_out_kernels, kernel->in_tensors(), {kernel_out_tensor}); | |||
| return RET_OK; | |||
| } | |||
| int NPUTransformPass::UpdateNC2NHTransNodeAfterKernel(kernel::LiteKernel *kernel, kernel::LiteKernel *trans_kernel, | |||
| kernel::LiteKernel *next_kernel) { | |||
| std::vector<Tensor *> next_in_tensors; | |||
| for (auto next_in_tensor : next_kernel->in_tensors()) { | |||
| if (next_in_tensor != kernel->out_tensors()[0]) { | |||
| next_in_tensors.push_back(next_in_tensor); | |||
| } else { | |||
| next_in_tensors.push_back(trans_kernel->out_tensors()[0]); | |||
| } | |||
| } | |||
| next_kernel->set_in_tensors(next_in_tensors); | |||
| std::vector<kernel::LiteKernel *> next_in_kernels; | |||
| for (auto in_kernel : next_kernel->in_kernels()) { | |||
| if (in_kernel == kernel) { | |||
| next_in_kernels.push_back(trans_kernel); | |||
| } else { | |||
| next_in_kernels.push_back(in_kernel); | |||
| } | |||
| } | |||
| NPUPassUtils::UpdateKernel(next_kernel, next_in_kernels, next_kernel->out_kernels(), next_in_tensors, | |||
| next_kernel->out_tensors()); | |||
| return RET_OK; | |||
| } | |||
| int NPUTransformPass::FormatTransformPass(const InnerContext *context, std::vector<kernel::LiteKernel *> *all_kernels, | |||
| std::vector<Tensor *> *all_tensors) { | |||
| if (context->IsNpuEnabled()) { | |||
| std::vector<kernel::LiteKernel *> new_kernels; | |||
| for (auto it = all_kernels->begin(); it != all_kernels->end(); it++) { | |||
| auto kernel = *it; | |||
| if (kernel->desc().arch != kNPU) { | |||
| new_kernels.push_back(kernel); | |||
| continue; | |||
| } | |||
| if (npu_trans_nodes.find(kernel->Type()) != npu_trans_nodes.end()) { | |||
| InsertPreNode(context, it, &new_kernels, all_tensors); | |||
| new_kernels.push_back(kernel); | |||
| InsertPostNode(context, it, &new_kernels, all_tensors); | |||
| } else { | |||
| new_kernels.push_back(kernel); | |||
| } | |||
| } | |||
| all_kernels->clear(); | |||
| for (int i = 0; i < new_kernels.size(); i++) { | |||
| all_kernels->push_back(new_kernels[i]); | |||
| } | |||
| } | |||
| return RET_OK; | |||
| } | |||
| } // namespace mindspore::lite | |||
| @@ -0,0 +1,51 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_LITE_SRC_RUNTIME_AGENT_NPU_NPU_TRANSFORM_PASS_H_ | |||
| #define MINDSPORE_LITE_SRC_RUNTIME_AGENT_NPU_NPU_TRANSFORM_PASS_H_ | |||
| #include <vector> | |||
| #include "src/lite_kernel.h" | |||
| #include "src/ops/primitive_c.h" | |||
| namespace mindspore::lite { | |||
// Graph pass that makes CPU(NHWC) and NPU(NCHW) kernels interoperable: for each
// NPU kernel whose type is in npu_trans_nodes it inserts Nhwc2Nchw / Nchw2Nhwc
// transpose kernels on the surrounding edges and rewires kernel/tensor links.
class NPUTransformPass {
 public:
  // Runs the pass over all_kernels; new intermediate tensors go into all_tensors.
  // Returns RET_OK on success.
  int FormatTransformPass(const InnerContext *context, std::vector<kernel::LiteKernel *> *all_kernels,
                          std::vector<Tensor *> *all_tensors);

 private:
  // Edge-rewiring helpers for an inserted Nhwc2Nchw (NH2NC) transpose kernel.
  int UpdateNH2NCTransNodePreKernel(kernel::LiteKernel *kernel, kernel::LiteKernel *trans_kernel,
                                    kernel::LiteKernel *after_kernel);
  int UpdateNH2NCTransNodeAfterKernel(kernel::LiteKernel *kernel, kernel::LiteKernel *trans_kernel,
                                      kernel::LiteKernel *before_kernel);
  // Edge-rewiring helpers for an inserted Nchw2Nhwc (NC2NH) transpose kernel.
  int UpdateNC2NHTransNodePreKernel(kernel::LiteKernel *kernel, kernel::LiteKernel *trans_kernel,
                                    kernel::LiteKernel *after_kernel);
  int UpdateNC2NHTransNodeAfterKernel(kernel::LiteKernel *kernel, kernel::LiteKernel *trans_kernel,
                                      kernel::LiteKernel *next_kernel);
  // Insert a Nhwc2Nchw kernel before / Nchw2Nhwc kernel(s) after the kernel at `it`.
  int InsertPreNode(const InnerContext *context, std::vector<kernel::LiteKernel *>::iterator it,
                    std::vector<kernel::LiteKernel *> *all_kernels, std::vector<Tensor *> *all_tensors);
  int InsertPostNode(const InnerContext *context, std::vector<kernel::LiteKernel *>::iterator it,
                     std::vector<kernel::LiteKernel *> *all_kernels, std::vector<Tensor *> *all_tensors);

 private:
  // Monotonic counter used to give inserted transpose kernels unique names.
  int total = 0;
};
| } // namespace mindspore::lite | |||
| #endif // MINDSPORE_LITE_SRC_RUNTIME_AGENT_NPU_NPU_TRANSFORM_PASS_H_ | |||
| @@ -24,7 +24,6 @@ | |||
| #include "include/graph/model.h" | |||
| #include "include/hiai_ir_build.h" | |||
| #include "include/HiAiModelManagerType.h" | |||
| #include "include/context.h" | |||
| #include "include/version.h" | |||
| #include "src/common/utils.h" | |||
| #include "src/runtime/agent/npu/npu_converter_utils.h" | |||
| @@ -34,10 +33,6 @@ namespace mindspore::kernel { | |||
| using mindspore::lite::RET_ERROR; | |||
| using mindspore::lite::RET_OK; | |||
| std::set<schema::PrimitiveType> trans_nodes = {schema::PrimitiveType_Conv2D, schema::PrimitiveType_DeConv2D, | |||
| schema::PrimitiveType_DepthwiseConv2D, | |||
| schema::PrimitiveType_DeDepthwiseConv2D, schema::PrimitiveType_Resize}; | |||
| domi::ModelBufferData *SubGraphNpuKernel::BuildIRModel() { | |||
| ge::Graph graph("NPUGraph"); | |||
| @@ -75,8 +70,7 @@ domi::ModelBufferData *SubGraphNpuKernel::BuildIRModel() { | |||
| } | |||
| int SubGraphNpuKernel::Run() { | |||
| return reinterpret_cast<lite::NPUExecutor *>(this->executor_) | |||
| ->Run(in_tensors_, out_tensors_, nodes_, inputs_nhwc2nchw_, outputs_nchw2nhwc_); | |||
| return reinterpret_cast<lite::NPUExecutor *>(this->executor_)->Run(in_tensors_, out_tensors_, nodes_); | |||
| } | |||
| int SubGraphNpuKernel::BuildNPUInputOp() { | |||
| @@ -88,21 +82,7 @@ int SubGraphNpuKernel::BuildNPUInputOp() { | |||
| if (IsSubGraphInputTensor(in_tensor)) { | |||
| auto tensor_name = node->name() + "_" + std::to_string(count++); | |||
| hiai::op::Data *data; | |||
| if (trans_nodes.find(node->Type()) != trans_nodes.end()) { | |||
| auto shape = in_tensor->shape(); | |||
| data = new (std::nothrow) hiai::op::Data(tensor_name); | |||
| if (data == nullptr) { | |||
| MS_LOG(ERROR) << "New data failed."; | |||
| return RET_ERROR; | |||
| } | |||
| ge::TensorDesc tensor_desc(lite::ConverterToNPUShape({shape[0], shape[3], shape[1], shape[2]}), | |||
| ge::FORMAT_NCHW, lite::ConverterToNPUDataType(in_tensor->data_type())); | |||
| data->update_input_desc_x(tensor_desc); | |||
| inputs_nhwc2nchw_.push_back(true); | |||
| } else { | |||
| data = mindspore::lite::ConverterToNPUData(in_tensor, tensor_name); | |||
| inputs_nhwc2nchw_.push_back(false); | |||
| } | |||
| data = mindspore::lite::ConverterToNPUData(in_tensor, tensor_name); | |||
| subgraph_input_op_.push_back(*data); | |||
| node_input_op.push_back(data); | |||
| continue; | |||
| @@ -132,7 +112,7 @@ int SubGraphNpuKernel::BuildNPUInputOp() { | |||
| // weight tensor | |||
| if (is_weight_tensor) { | |||
| if (trans_nodes.find(node->Type()) == trans_nodes.end()) { | |||
| if (lite::npu_trans_nodes.find(node->Type()) == lite::npu_trans_nodes.end()) { | |||
| auto name = node->name() + "_" + std::to_string(count++); | |||
| auto weight_const = new (std::nothrow) hiai::op::Const(node->name() + "_" + std::to_string(count++)); | |||
| if (weight_const == nullptr) { | |||
| @@ -162,11 +142,6 @@ std::vector<ge::Operator> SubGraphNpuKernel::GetNPUNodes(const vector<kernel::Li | |||
| ops.reserve(nodes.size()); | |||
| for (int i = 0; i < nodes.size(); i++) { | |||
| ops.push_back(*reinterpret_cast<NPUKernel *>(nodes[i])->GetNPUOp()); | |||
| if (trans_nodes.find(schema::PrimitiveType(nodes[i]->GetPrimitive()->Type())) != trans_nodes.end()) { | |||
| outputs_nchw2nhwc_.push_back(true); | |||
| } else { | |||
| outputs_nchw2nhwc_.push_back(false); | |||
| } | |||
| } | |||
| return ops; | |||
| } | |||
| @@ -69,10 +69,6 @@ class SubGraphNpuKernel : public SubGraphKernel { | |||
| std::string GetOMModelName(); | |||
| private: | |||
| std::vector<bool> inputs_nhwc2nchw_; | |||
| std::vector<bool> outputs_nchw2nhwc_; | |||
| domi::ModelBufferData *model_buffer_data_; | |||
| std::vector<ge::Operator> subgraph_input_op_; | |||
| @@ -16,6 +16,7 @@ | |||
| #include "src/runtime/kernel/npu/convolution_base_npu.h" | |||
| #include "src/runtime/agent/npu/npu_converter_utils.h" | |||
| #include "nnacl/pack.h" | |||
| namespace mindspore::kernel { | |||
| ConvolutionBaseNPUKernel::~ConvolutionBaseNPUKernel() { | |||
| @@ -39,14 +40,27 @@ int ConvolutionBaseNPUKernel::InitWeightBiasConst(const std::vector<lite::Tensor | |||
| MS_LOG(ERROR) << "New weight const failed."; | |||
| return RET_ERROR; | |||
| } | |||
| auto weight_shape = inputs[1]->shape(); | |||
| inputs[1]->set_shape({weight_shape[0], weight_shape[3], weight_shape[1], weight_shape[2]}); | |||
| inputs[1]->set_format(schema::Format_NCHW); | |||
| auto weight_tensor = mindspore::lite::ConverterToNPUTensor(inputs[1]); | |||
| weight_->set_attr_value(weight_tensor); | |||
| auto w_shape = inputs[1]->shape(); | |||
| auto nhwc_data = inputs[1]->data_c(); | |||
| auto nchw_data = reinterpret_cast<float *>(malloc(inputs[1]->ElementsNum() * sizeof(float))); | |||
| if (nchw_data == nullptr) { | |||
| MS_LOG(ERROR) << "Malloc buffer failed."; | |||
| return RET_ERROR; | |||
| } | |||
| PackNHWCToNCHWFp32(nhwc_data, nchw_data, w_shape[0], w_shape[1] * w_shape[2], w_shape[3]); | |||
| inputs[1]->set_shape(weight_shape); | |||
| inputs[1]->set_format(schema::Format_NHWC); | |||
| std::shared_ptr<ge::Tensor> weight_tensor = std::shared_ptr<ge::Tensor>(new (std::nothrow) ge::Tensor()); | |||
| if (weight_tensor == nullptr) { | |||
| MS_LOG(ERROR) << "new weight_tensor failed."; | |||
| return RET_ERROR; | |||
| } | |||
| ge::TensorDesc tensor_desc(lite::ConverterToNPUShape({w_shape[0], w_shape[3], w_shape[1], w_shape[2]}), | |||
| ge::FORMAT_NCHW, lite::ConverterToNPUDataType(inputs[1]->data_type())); | |||
| weight_tensor->SetTensorDesc(tensor_desc); | |||
| weight_tensor->SetData(reinterpret_cast<const uint8_t *>(nchw_data), inputs[1]->Size()); | |||
| weight_->set_attr_value(weight_tensor); | |||
| free(nchw_data); | |||
| if (inputs.size() >= 3) { | |||
| bias_ = new (std::nothrow) hiai::op::Const(name_ + "_b"); | |||
| @@ -17,17 +17,18 @@ | |||
| #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_NPU_CONVOLUTION_BASE_NPU_H_ | |||
| #include <vector> | |||
| #include <memory> | |||
| #include "include/graph/op/all_ops.h" | |||
| #include "src/runtime/kernel/npu/transpose_base_npu.h" | |||
| #include "nnacl/conv_parameter.h" | |||
| namespace mindspore::kernel { | |||
| class ConvolutionBaseNPUKernel : public TransposeBaseNPUKernel { | |||
| class ConvolutionBaseNPUKernel : public NPUKernel { | |||
| public: | |||
| ConvolutionBaseNPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs, | |||
| const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx, | |||
| const mindspore::lite::PrimitiveC *primitive) | |||
| : TransposeBaseNPUKernel(parameter, inputs, outputs, ctx, primitive) {} | |||
| : NPUKernel(parameter, inputs, outputs, ctx, primitive) {} | |||
| ~ConvolutionBaseNPUKernel() override; | |||
| protected: | |||
| @@ -25,7 +25,7 @@ using mindspore::schema::PrimitiveType_DepthwiseConv2D; | |||
| namespace mindspore::kernel { | |||
| int ConvolutionDepthwiseNPUKernel::IsSupport(const std::vector<lite::Tensor *> &inputs, | |||
| const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter) { | |||
| return RET_ERROR; | |||
| return RET_OK; | |||
| } | |||
| int ConvolutionDepthwiseNPUKernel::SetConvDwParam() { | |||
| @@ -49,19 +49,13 @@ int ConvolutionDepthwiseNPUKernel::SetConvDwParam() { | |||
| int ConvolutionDepthwiseNPUKernel::SetNPUInputs(const std::vector<lite::Tensor *> &inputs, | |||
| const std::vector<lite::Tensor *> &outputs, | |||
| const std::vector<ge::Operator *> &npu_inputs) { | |||
| auto ret = SetPreTranspose(npu_inputs[0]); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "New pre transpose npu operator (NHWC -> NCHW) for op " << name_ << " failed."; | |||
| return RET_ERROR; | |||
| } | |||
| // set conv attr param | |||
| conv_dw_ = new (std::nothrow) hiai::op::ConvolutionDepthwise(name_ + "_conv_depthwise"); | |||
| if (conv_dw_ == nullptr) { | |||
| MS_LOG(ERROR) << "New convolution depthwise operator for op " << name_ << " failed."; | |||
| return RET_ERROR; | |||
| } | |||
| ret = SetConvDwParam(); | |||
| auto ret = SetConvDwParam(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Set npu op parameter for convolution depthwise op " << name_ << " failed."; | |||
| return RET_ERROR; | |||
| @@ -76,7 +70,7 @@ int ConvolutionDepthwiseNPUKernel::SetNPUInputs(const std::vector<lite::Tensor * | |||
| if (inputs.size() == 3) { | |||
| conv_dw_->set_input_bias(*bias_); | |||
| } | |||
| conv_dw_->set_input_x(*pre_trans_); | |||
| conv_dw_->set_input_x(*npu_inputs[0]); | |||
| if (conv_param_->act_type_ != ActType_No) { | |||
| ret = SetActivation(conv_dw_, conv_param_->act_type_); | |||
| @@ -85,21 +79,17 @@ int ConvolutionDepthwiseNPUKernel::SetNPUInputs(const std::vector<lite::Tensor * | |||
| return RET_ERROR; | |||
| } | |||
| } | |||
| return RET_OK; | |||
| } | |||
| ge::Operator *mindspore::kernel::ConvolutionDepthwiseNPUKernel::GetNPUOp() { | |||
| if (conv_param_->act_type_ == ActType_No) { | |||
| ret = SetPostTranspose(conv_dw_); | |||
| return conv_dw_; | |||
| } else { | |||
| ret = SetPostTranspose(act_); | |||
| } | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "New post transpose npu operator (NCHW -> NHWC) for op " << name_ << " failed."; | |||
| return RET_ERROR; | |||
| return act_; | |||
| } | |||
| return RET_OK; | |||
| } | |||
| ge::Operator *mindspore::kernel::ConvolutionDepthwiseNPUKernel::GetNPUOp() { return post_trans_; } | |||
| ConvolutionDepthwiseNPUKernel::~ConvolutionDepthwiseNPUKernel() { | |||
| if (conv_dw_ != nullptr) { | |||
| delete conv_dw_; | |||
| @@ -24,7 +24,7 @@ using mindspore::schema::PrimitiveType_Conv2D; | |||
| namespace mindspore::kernel { | |||
| int ConvolutionNPUKernel::IsSupport(const std::vector<lite::Tensor *> &inputs, | |||
| const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter) { | |||
| return RET_ERROR; | |||
| return RET_OK; | |||
| } | |||
| int ConvolutionNPUKernel::SetConvParam() { | |||
| @@ -49,19 +49,13 @@ int ConvolutionNPUKernel::SetConvParam() { | |||
| int ConvolutionNPUKernel::SetNPUInputs(const std::vector<lite::Tensor *> &inputs, | |||
| const std::vector<lite::Tensor *> &outputs, | |||
| const std::vector<ge::Operator *> &npu_inputs) { | |||
| auto ret = SetPreTranspose(npu_inputs[0]); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "New pre transpose npu operator (NHWC -> NCHW) for op " << name_ << " failed."; | |||
| return RET_ERROR; | |||
| } | |||
| // set conv attr param | |||
| conv_ = new (std::nothrow) hiai::op::Convolution(name_ + "_conv"); | |||
| if (conv_ == nullptr) { | |||
| MS_LOG(ERROR) << "New convolution operator for convolution op " << name_ << " failed."; | |||
| return RET_ERROR; | |||
| } | |||
| ret = SetConvParam(); | |||
| auto ret = SetConvParam(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Set npu op parameter for convolution op " << name_ << " failed."; | |||
| return RET_ERROR; | |||
| @@ -76,7 +70,7 @@ int ConvolutionNPUKernel::SetNPUInputs(const std::vector<lite::Tensor *> &inputs | |||
| if (inputs.size() == 3) { | |||
| conv_->set_input_bias(*bias_); | |||
| } | |||
| conv_->set_input_x(*pre_trans_); | |||
| conv_->set_input_x(*npu_inputs[0]); | |||
| if (conv_param_->act_type_ != ActType_No) { | |||
| ret = SetActivation(conv_, conv_param_->act_type_); | |||
| @@ -85,21 +79,17 @@ int ConvolutionNPUKernel::SetNPUInputs(const std::vector<lite::Tensor *> &inputs | |||
| return RET_ERROR; | |||
| } | |||
| } | |||
| return RET_OK; | |||
| } | |||
| ge::Operator *mindspore::kernel::ConvolutionNPUKernel::GetNPUOp() { | |||
| if (conv_param_->act_type_ == ActType_No) { | |||
| ret = SetPostTranspose(conv_); | |||
| return conv_; | |||
| } else { | |||
| ret = SetPostTranspose(act_); | |||
| } | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "New post transpose npu operator (NCHW -> NHWC) for op " << name_ << " failed."; | |||
| return RET_ERROR; | |||
| return act_; | |||
| } | |||
| return RET_OK; | |||
| } | |||
| ge::Operator *mindspore::kernel::ConvolutionNPUKernel::GetNPUOp() { return post_trans_; } | |||
| ConvolutionNPUKernel::~ConvolutionNPUKernel() { | |||
| if (conv_ != nullptr) { | |||
| delete conv_; | |||
| @@ -62,23 +62,17 @@ int PoolingNPUKernel::SetPoolingParam() { | |||
| int PoolingNPUKernel::SetNPUInputs(const std::vector<lite::Tensor *> &inputs, | |||
| const std::vector<lite::Tensor *> &outputs, | |||
| const std::vector<ge::Operator *> &npu_inputs) { | |||
| auto ret = SetPreTranspose(npu_inputs[0]); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "New pre transpose npu operator (NHWC -> NCHW) for op " << name_ << " failed."; | |||
| return RET_ERROR; | |||
| } | |||
| pooling_ = new (std::nothrow) hiai::op::PoolingD(name_ + "_pooling"); | |||
| if (pooling_ == nullptr) { | |||
| MS_LOG(ERROR) << "New pooling npu operator for op " << name_ << " failed."; | |||
| return RET_ERROR; | |||
| } | |||
| ret = SetPoolingParam(); | |||
| auto ret = SetPoolingParam(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Set npu op parameter for convolution op " << name_ << " failed."; | |||
| return RET_ERROR; | |||
| } | |||
| pooling_->set_input_x(*pre_trans_); | |||
| pooling_->set_input_x(*npu_inputs[0]); | |||
| if (pooling_param_->act_type_ != ActType_No) { | |||
| ret = SetActivation(pooling_, pooling_param_->act_type_); | |||
| @@ -87,21 +81,17 @@ int PoolingNPUKernel::SetNPUInputs(const std::vector<lite::Tensor *> &inputs, | |||
| return RET_ERROR; | |||
| } | |||
| } | |||
| return RET_OK; | |||
| } | |||
| ge::Operator *mindspore::kernel::PoolingNPUKernel::GetNPUOp() { | |||
| if (pooling_param_->act_type_ == ActType_No) { | |||
| ret = SetPostTranspose(pooling_); | |||
| return pooling_; | |||
| } else { | |||
| ret = SetPostTranspose(act_); | |||
| return act_; | |||
| } | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "New post transpose npu operator (NCHW -> NHWC) for op " << name_ << " failed."; | |||
| return RET_ERROR; | |||
| } | |||
| return RET_OK; | |||
| } | |||
| ge::Operator *mindspore::kernel::PoolingNPUKernel::GetNPUOp() { return post_trans_; } | |||
| PoolingNPUKernel::~PoolingNPUKernel() { | |||
| if (pooling_ != nullptr) { | |||
| delete pooling_; | |||
| @@ -36,12 +36,6 @@ int ResizeNPUKernel::IsSupport(const std::vector<lite::Tensor *> &inputs, const | |||
| int ResizeNPUKernel::SetNPUInputs(const std::vector<lite::Tensor *> &inputs, const std::vector<lite::Tensor *> &outputs, | |||
| const std::vector<ge::Operator *> &npu_inputs) { | |||
| auto ret = SetPreTranspose(npu_inputs[0]); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "New pre transpose npu operator (NHWC -> NCHW) for op " << name_ << " failed."; | |||
| return RET_ERROR; | |||
| } | |||
| ge::TensorDesc sizeTensorDesc(ge::Shape({2}), ge::FORMAT_NCHW, ge::DT_INT32); | |||
| ge::TensorPtr sizeTensor = std::make_shared<hiai::Tensor>(sizeTensorDesc); | |||
| vector<int32_t> dataValue = {static_cast<int32_t>(new_height_), static_cast<int32_t>(new_width_)}; | |||
| @@ -55,7 +49,7 @@ int ResizeNPUKernel::SetNPUInputs(const std::vector<lite::Tensor *> &inputs, con | |||
| return RET_ERROR; | |||
| } | |||
| op->set_attr_align_corners(align_corners_); | |||
| op->set_input_x(*pre_trans_); | |||
| op->set_input_x(*npu_inputs[0]); | |||
| op->set_input_size(*out_size); | |||
| op->set_attr_half_pixel_centers(preserve_aspect_ratio_); | |||
| op_ = op; | |||
| @@ -66,21 +60,14 @@ int ResizeNPUKernel::SetNPUInputs(const std::vector<lite::Tensor *> &inputs, con | |||
| return RET_ERROR; | |||
| } | |||
| op->set_attr_align_corners(align_corners_); | |||
| op->set_input_x(*pre_trans_); | |||
| op->set_input_x(*npu_inputs[0]); | |||
| op->set_input_size(*out_size); | |||
| op_ = op; | |||
| } | |||
| ret = SetPostTranspose(op_); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "New post transpose npu operator (NCHW -> NHWC) for op " << name_ << " failed."; | |||
| return RET_ERROR; | |||
| } | |||
| return RET_OK; | |||
| } | |||
| ge::Operator *mindspore::kernel::ResizeNPUKernel::GetNPUOp() { return this->post_trans_; } | |||
| ge::Operator *mindspore::kernel::ResizeNPUKernel::GetNPUOp() { return this->op_; } | |||
| ResizeNPUKernel::~ResizeNPUKernel() { | |||
| if (op_ != nullptr) { | |||
| @@ -24,12 +24,12 @@ | |||
| #include "include/graph/op/all_ops.h" | |||
| #include "src/runtime/kernel/npu/transpose_base_npu.h" | |||
| namespace mindspore::kernel { | |||
| class ResizeNPUKernel : public TransposeBaseNPUKernel { | |||
| class ResizeNPUKernel : public NPUKernel { | |||
| public: | |||
| ResizeNPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs, | |||
| const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx, | |||
| const mindspore::lite::PrimitiveC *primitive) | |||
| : TransposeBaseNPUKernel(parameter, inputs, outputs, ctx, primitive) { | |||
| : NPUKernel(parameter, inputs, outputs, ctx, primitive) { | |||
| auto resize_parameter = reinterpret_cast<ResizeParameter *>(parameter); | |||
| method_ = resize_parameter->method_; | |||
| new_height_ = resize_parameter->new_height_; | |||
| @@ -33,6 +33,8 @@ | |||
| #if SUPPORT_NPU | |||
| #include "src/runtime/agent/npu/subgraph_npu_kernel.h" | |||
| #include "src/runtime/agent/npu/npu_manager.h" | |||
| #include "src/runtime/agent/npu/npu_transform_pass.h" | |||
| #include "src/runtime/agent/npu/npu_fusion_pass.h" | |||
| #endif | |||
| namespace mindspore::lite { | |||
| using kernel::KERNEL_ARCH::kCPU; | |||
| @@ -63,6 +65,11 @@ int Scheduler::Schedule(std::vector<kernel::LiteKernel *> *dst_kernels) { | |||
| return ret; | |||
| } | |||
| FindAllInoutKernels(*dst_kernels); | |||
| ret = RunPass(dst_kernels); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Schedule run pass failed."; | |||
| return ret; | |||
| } | |||
| ret = ConstructSubGraphs(dst_kernels); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "ConstructSubGraphs failed."; | |||
| @@ -514,4 +521,25 @@ void Scheduler::FindAllInoutKernels(const std::vector<kernel::LiteKernel *> &ker | |||
| kernel->FindInoutKernels(kernels); | |||
| } | |||
| } | |||
| int Scheduler::RunPass(std::vector<kernel::LiteKernel *> *dst_kernels) { | |||
| int ret = RET_OK; | |||
| #if SUPPORT_NPU | |||
| auto transform_pass = new NPUTransformPass; | |||
| ret = transform_pass->FormatTransformPass(context_, dst_kernels, &src_tensors_); | |||
| delete transform_pass; | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Run npu format transform pass failed."; | |||
| return ret; | |||
| } | |||
| auto fusion_pass = new NPUFusionPass(dst_kernels); | |||
| ret = fusion_pass->Fusion(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Run npu fussion transform pass failed."; | |||
| return ret; | |||
| } | |||
| delete fusion_pass; | |||
| #endif | |||
| return ret; | |||
| } | |||
| } // namespace mindspore::lite | |||
| @@ -77,6 +77,8 @@ class Scheduler { | |||
| static kernel::SubGraphType GetKernelSubGraphType(const kernel::LiteKernel *kernel); | |||
| int RunPass(std::vector<kernel::LiteKernel *> *dst_kernels); | |||
| protected: | |||
| const InnerContext *context_ = nullptr; | |||
| Model *src_model_ = nullptr; | |||