| @@ -18,6 +18,9 @@ | |||
| #include "src/runtime/agent/npu/optimizer/npu_pass_utils.h" | |||
| #include "src/lite_kernel.h" | |||
| #include "nnacl/concat_parameter.h" | |||
| #include "nnacl/split_parameter.h" | |||
| #include "nnacl/pad_parameter.h" | |||
| #include "nnacl/strided_slice_parameter.h" | |||
| namespace mindspore::lite { | |||
| bool CheckFusion(kernel::LiteKernel *kernel) { | |||
| @@ -119,7 +122,7 @@ void NPUFusionPass::UpdatePostKernels(kernel::LiteKernel *cur_kernel) { | |||
| } | |||
| void UpdatePreTensors(kernel::LiteKernel *cur_kernel) { | |||
| auto tensors_vec = cur_kernel->in_tensors(); | |||
| auto tensors_vec = NPUPassUtils::GetNonConstInputs(cur_kernel); | |||
| for (auto in_kernel : cur_kernel->in_kernels()) { | |||
| lite::Tensor *cur_tensor = nullptr; | |||
| auto in_tensor = in_kernel->in_tensors()[0]; | |||
| @@ -136,6 +139,15 @@ void UpdatePreTensors(kernel::LiteKernel *cur_kernel) { | |||
| } | |||
| } | |||
| } | |||
| // add constant inputs back | |||
| if (nodes2const_index.find(static_cast<schema::PrimitiveType>(cur_kernel->op_parameter()->type_)) != | |||
| nodes2const_index.end()) { | |||
| tensors_vec.resize(cur_kernel->in_tensors().size()); | |||
| auto const_index = nodes2const_index[static_cast<schema::PrimitiveType>(cur_kernel->op_parameter()->type_)]; | |||
| for (auto index : const_index) { | |||
| tensors_vec[index] = cur_kernel->in_tensors()[index]; | |||
| } | |||
| } | |||
| cur_kernel->set_in_tensors(tensors_vec); | |||
| } | |||
| @@ -275,15 +287,75 @@ int NPUFusionPass::FormatFusion(kernel::LiteKernel *kernel) { | |||
| return RET_OK; | |||
| } | |||
| int NPUFusionPass::SplitFusion(kernel::LiteKernel *kernel) { | |||
| UpdateKernel(kernel); | |||
| auto split_param = reinterpret_cast<SplitParameter *>(kernel->op_parameter()); | |||
| split_param->split_dim_ = TransFormAxis(split_param->split_dim_); | |||
| return RET_OK; | |||
| } | |||
| int NPUFusionPass::PadFusion(kernel::LiteKernel *kernel) { | |||
| UpdateKernel(kernel); | |||
| auto pad_param = reinterpret_cast<PadParameter *>(kernel->op_parameter()); | |||
| int c1 = pad_param->paddings_[6]; | |||
| int c2 = pad_param->paddings_[7]; | |||
| // 0 1 2 3 4 5 6 7 | |||
| // n n h h w w c c | |||
| // n n c c h h w w | |||
| pad_param->paddings_[6] = pad_param->paddings_[4]; | |||
| pad_param->paddings_[7] = pad_param->paddings_[5]; | |||
| pad_param->paddings_[4] = pad_param->paddings_[2]; | |||
| pad_param->paddings_[5] = pad_param->paddings_[3]; | |||
| pad_param->paddings_[2] = c1; | |||
| pad_param->paddings_[3] = c2; | |||
| return RET_OK; | |||
| } | |||
| int NPUFusionPass::StridedSliceFusion(kernel::LiteKernel *kernel) { | |||
| // basic requirement: input is nhwc 4d | |||
| UpdateKernel(kernel); | |||
| auto param = reinterpret_cast<StridedSliceParameter *>(kernel->op_parameter()); | |||
| auto begin_tensor = kernel->in_tensors().at(1); | |||
| int *begin = reinterpret_cast<int *>(begin_tensor->data_c()); | |||
| (void)NPUPassUtils::AssistDataNHWC2NCHW(begin, 1); | |||
| auto end_tensor = kernel->in_tensors().at(2); | |||
| int *end = reinterpret_cast<int *>(end_tensor->data_c()); | |||
| NPUPassUtils::AssistDataNHWC2NCHW(end, 1); | |||
| auto stride_tensor = kernel->in_tensors().at(3); | |||
| if (kernel->in_tensors().size() == 5) { | |||
| stride_tensor = kernel->in_tensors().at(4); | |||
| } | |||
| int *stride = reinterpret_cast<int *>(stride_tensor->data_c()); | |||
| NPUPassUtils::AssistDataNHWC2NCHW(stride, 1); | |||
| param->begins_mask_ = NPUPassUtils::MaskDataNHWC2NCHW(param->begins_mask_); | |||
| param->ends_mask_ = NPUPassUtils::MaskDataNHWC2NCHW(param->ends_mask_); | |||
| param->ellipsisMask_ = NPUPassUtils::MaskDataNHWC2NCHW(param->ellipsisMask_); | |||
| param->newAxisMask_ = NPUPassUtils::MaskDataNHWC2NCHW(param->newAxisMask_); | |||
| param->shrinkAxisMask_ = NPUPassUtils::MaskDataNHWC2NCHW(param->shrinkAxisMask_); | |||
| return RET_OK; | |||
| } | |||
| int NPUFusionPass::Run() { | |||
| for (size_t i = 0; i < kernels->size(); i++) { | |||
| auto kernel = (*kernels)[i]; | |||
| if (CheckFusion(kernel)) { | |||
| switch (kernel->Type()) { | |||
| case schema::PrimitiveType_Split: | |||
| i -= kernel->in_kernels().size(); | |||
| SplitFusion(kernel); | |||
| continue; | |||
| case schema::PrimitiveType_Concat: | |||
| i -= kernel->in_kernels().size(); | |||
| ConcatFusion(kernel); | |||
| continue; | |||
| case schema::PrimitiveType_PadFusion: | |||
| i -= kernel->in_kernels().size(); | |||
| PadFusion(kernel); | |||
| continue; | |||
| case schema::PrimitiveType_StridedSlice: | |||
| i -= kernel->in_kernels().size(); | |||
| StridedSliceFusion(kernel); | |||
| continue; | |||
| case schema::PrimitiveType_AddFusion: | |||
| case schema::PrimitiveType_Activation: | |||
| case schema::PrimitiveType_Eltwise: | |||
| @@ -39,6 +39,9 @@ class NPUFusionPass : public NPUBasePass { | |||
| int CommonFusion(kernel::LiteKernel *kernel); | |||
| int ConcatFusion(kernel::LiteKernel *kernel); | |||
| int FormatFusion(kernel::LiteKernel *kernel); | |||
| int SplitFusion(kernel::LiteKernel *kernel); | |||
| int PadFusion(kernel::LiteKernel *kernel); | |||
| int StridedSliceFusion(kernel::LiteKernel *kernel); | |||
| private: | |||
| std::vector<kernel::LiteKernel *> *kernels; | |||
| @@ -23,8 +23,10 @@ namespace mindspore::lite { | |||
| using kernel::KERNEL_ARCH::kNPU; | |||
| enum InsertState { InsertNone, PreInsert, PostInsert, BothInsert }; | |||
| std::set<mindspore::schema::PrimitiveType> npu_insert_nodes = { | |||
| schema::PrimitiveType_Concat, schema::PrimitiveType_AddFusion, schema::PrimitiveType_Eltwise, | |||
| schema::PrimitiveType_Activation}; | |||
| schema::PrimitiveType_Concat, schema::PrimitiveType_AddFusion, schema::PrimitiveType_Eltwise, | |||
| schema::PrimitiveType_Activation, schema::PrimitiveType_Split, schema::PrimitiveType_PadFusion, | |||
| schema::PrimitiveType_StridedSlice, schema::PrimitiveType_Activation}; | |||
| // this pass goal is to minimize subgraphs generated | |||
| // by inserting nchw2nhwc or nhwc2nchw before or after the operator (e.g. concat, add, etc..) together with | |||
| // fusion pass. If transpose inserted are more than half of input output, we will insert remaining input | |||
| @@ -44,7 +46,7 @@ std::set<mindspore::schema::PrimitiveType> npu_insert_nodes = { | |||
| // so we won't insert nc2nh or nh2nc when op's in kernels and out kernels contains no nc2nh or nh2nc. | |||
| // This pass should be run after npu_transform_pass, which insert transpose for nchw-input-limited op like conv2d. | |||
| int GetInsertState(kernel::LiteKernel *kernel) { | |||
| int NPUInsertTransformPass::GetInsertState(kernel::LiteKernel *kernel) { | |||
| // filter out irrelevant kernel | |||
| if (npu_insert_nodes.find(kernel->Type()) == npu_insert_nodes.end()) { | |||
| return InsertNone; | |||
| @@ -52,15 +54,17 @@ int GetInsertState(kernel::LiteKernel *kernel) { | |||
| // current kernel is target kernel | |||
| // use out kernels to count how many out lines from current kernel | |||
| std::vector<Tensor *> in_tensors = NPUPassUtils::GetNonConstInputs(kernel); | |||
| size_t in_out_tensor_num = | |||
| kernel->in_tensors().size() + std::max(kernel->out_kernels().size(), static_cast<size_t>(1)); | |||
| in_tensors.size() + | |||
| std::max(std::max(kernel->out_kernels().size(), static_cast<size_t>(1)), kernel->out_tensors().size()); | |||
| size_t transpose_input_num = 0; | |||
| size_t transpose_output_num = 0; | |||
| bool need_pre_insert = false; | |||
| bool need_post_insert = false; | |||
| // count number of input tensor from nc2nh and output tensor to nh2nc | |||
| for (size_t i = 0; i < kernel->in_tensors().size(); ++i) { | |||
| auto in_kernel = NPUPassUtils::KernelInputFromKernel(kernel, i); | |||
| for (size_t i = 0; i < in_tensors.size(); ++i) { | |||
| auto in_kernel = NPUPassUtils::KernelInputFromKernel(kernel, in_tensors.at(i)); | |||
| if (NPUPassUtils::IsNchw2Nhwc(in_kernel)) { | |||
| transpose_input_num++; | |||
| } else { | |||
| @@ -81,21 +85,22 @@ int GetInsertState(kernel::LiteKernel *kernel) { | |||
| // won't insert any thing if num of transpose tensor is smaller than half of total input output. | |||
| // won't insert if total input output are all transpose tensor, the fusion pass will handle this. | |||
| size_t transpose_tensor_num = transpose_input_num + transpose_output_num; | |||
| if (transpose_tensor_num <= in_out_tensor_num / 2 || transpose_tensor_num == in_out_tensor_num) { | |||
| if (transpose_tensor_num == 0 || transpose_tensor_num * 2 < in_out_tensor_num || | |||
| transpose_tensor_num == in_out_tensor_num) { | |||
| return InsertNone; | |||
| } | |||
| InsertState ret; | |||
| if (need_pre_insert && !need_post_insert) { | |||
| return PreInsert; | |||
| } | |||
| if (need_pre_insert && need_post_insert) { | |||
| return BothInsert; | |||
| } | |||
| if (!need_pre_insert && need_post_insert) { | |||
| return PostInsert; | |||
| ret = PreInsert; | |||
| } else if (need_pre_insert && need_post_insert) { | |||
| ret = BothInsert; | |||
| } else if (!need_pre_insert && need_post_insert) { | |||
| ret = PostInsert; | |||
| } else { | |||
| ret = InsertNone; | |||
| } | |||
| return InsertNone; | |||
| return ret; | |||
| } | |||
| int NPUInsertTransformPass::InsertNode(kernel::LiteKernel *kernel, kernel::LiteKernel *post_kernel, | |||
| @@ -200,13 +205,20 @@ int NPUInsertTransformPass::InsertForOutputTensor(kernel::LiteKernel *kernel, ke | |||
| int NPUInsertTransformPass::InsertPreNodes(kernel::LiteKernel *kernel, | |||
| std::vector<kernel::LiteKernel *> *trans_kernels) { | |||
| int ret = RET_OK; | |||
| for (size_t i = 0; i < kernel->in_tensors().size(); ++i) { | |||
| auto pre_kernel = NPUPassUtils::KernelInputFromKernel(kernel, i); | |||
| auto in_tensors = NPUPassUtils::GetNonConstInputs(kernel); | |||
| for (auto tensor : in_tensors) { | |||
| auto pre_kernel = NPUPassUtils::KernelInputFromKernel(kernel, tensor); | |||
| if (NPUPassUtils::IsNchw2Nhwc(pre_kernel)) { | |||
| continue; | |||
| } | |||
| // if this tensor is input of graph, pre_kernel is nullptr. | |||
| ret = InsertForInputTensor(kernel, i, pre_kernel, trans_kernels); | |||
| auto it = find(kernel->in_tensors().begin(), kernel->in_tensors().end(), tensor); | |||
| if (it == kernel->in_tensors().end()) { | |||
| MS_LOG(ERROR) << "Find in tensor index error"; | |||
| return RET_ERROR; | |||
| } | |||
| size_t index = it - kernel->in_tensors().begin(); | |||
| ret = InsertForInputTensor(kernel, index, pre_kernel, trans_kernels); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Insert nhwc2nchw kernel and nchw2nhwc kernel before kernel " << kernel->name() << " failed."; | |||
| return ret; | |||
| @@ -249,59 +261,63 @@ int NPUInsertTransformPass::InsertPostNodes(kernel::LiteKernel *kernel, | |||
| int NPUInsertTransformPass::Run() { | |||
| std::vector<kernel::LiteKernel *> insert_kernels; | |||
| for (size_t i = 0; i < all_kernels_->size(); i++) { | |||
| auto kernel = (*all_kernels_)[i]; | |||
| if (kernel->desc().arch != kNPU) { | |||
| continue; | |||
| } | |||
| auto insert_state = GetInsertState(kernel); | |||
| insert_kernels.clear(); | |||
| // If the every output kernel is nhwc2nchw, insert | |||
| // modify loop index add post_kernels.size() to the next kernel in the origin vector | |||
| switch (insert_state) { | |||
| case PreInsert: { | |||
| auto ret = InsertPreNodes(kernel, &insert_kernels); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Insert nhwc2nchw kernel and nchw2nhwc kernel before kernel " << kernel->name() | |||
| << " failed."; | |||
| return RET_ERROR; | |||
| } | |||
| all_kernels_->insert(all_kernels_->begin() + i, insert_kernels.begin(), insert_kernels.end()); | |||
| i += insert_kernels.size(); | |||
| break; | |||
| for (int j = 0; j < 2; ++j) { | |||
| for (size_t i = 0; i < all_kernels_->size(); i++) { | |||
| auto kernel = (*all_kernels_)[i]; | |||
| if (kernel->desc().arch != kNPU) { | |||
| continue; | |||
| } | |||
| case PostInsert: { | |||
| auto ret = InsertPostNodes(kernel, &insert_kernels); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Insert nhwc2nchw kernel and nchw2nhwc kernel after kernel " << kernel->name() << " failed."; | |||
| return RET_ERROR; | |||
| auto insert_state = GetInsertState(kernel); | |||
| insert_kernels.clear(); | |||
| // If the every output kernel is nhwc2nchw, insert | |||
| // modify loop index add post_kernels.size() to the next kernel in the origin vector | |||
| switch (insert_state) { | |||
| case PreInsert: { | |||
| auto ret = InsertPreNodes(kernel, &insert_kernels); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Insert nhwc2nchw kernel and nchw2nhwc kernel before kernel " << kernel->name() | |||
| << " failed."; | |||
| return RET_ERROR; | |||
| } | |||
| all_kernels_->insert(all_kernels_->begin() + i, insert_kernels.begin(), insert_kernels.end()); | |||
| i += insert_kernels.size(); | |||
| break; | |||
| } | |||
| all_kernels_->insert(all_kernels_->begin() + i + 1, insert_kernels.begin(), insert_kernels.end()); | |||
| i += insert_kernels.size(); | |||
| break; | |||
| } | |||
| case BothInsert: { | |||
| auto ret = InsertPreNodes(kernel, &insert_kernels); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Insert nhwc2nchw kernel and nchw2nhwc kernel before kernel " << kernel->name() | |||
| << " failed."; | |||
| return RET_ERROR; | |||
| case PostInsert: { | |||
| auto ret = InsertPostNodes(kernel, &insert_kernels); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Insert nhwc2nchw kernel and nchw2nhwc kernel after kernel " << kernel->name() | |||
| << " failed."; | |||
| return RET_ERROR; | |||
| } | |||
| all_kernels_->insert(all_kernels_->begin() + i + 1, insert_kernels.begin(), insert_kernels.end()); | |||
| i += insert_kernels.size(); | |||
| break; | |||
| } | |||
| all_kernels_->insert(all_kernels_->begin() + i, insert_kernels.begin(), insert_kernels.end()); | |||
| i += insert_kernels.size(); | |||
| case BothInsert: { | |||
| auto ret = InsertPreNodes(kernel, &insert_kernels); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Insert nhwc2nchw kernel and nchw2nhwc kernel before kernel " << kernel->name() | |||
| << " failed."; | |||
| return RET_ERROR; | |||
| } | |||
| all_kernels_->insert(all_kernels_->begin() + i, insert_kernels.begin(), insert_kernels.end()); | |||
| i += insert_kernels.size(); | |||
| insert_kernels.clear(); | |||
| ret = InsertPostNodes(kernel, &insert_kernels); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Insert nhwc2nchw kernel and nchw2nhwc kernel after kernel " << kernel->name() << " failed."; | |||
| return RET_ERROR; | |||
| insert_kernels.clear(); | |||
| ret = InsertPostNodes(kernel, &insert_kernels); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Insert nhwc2nchw kernel and nchw2nhwc kernel after kernel " << kernel->name() | |||
| << " failed."; | |||
| return RET_ERROR; | |||
| } | |||
| all_kernels_->insert(all_kernels_->begin() + i + 1, insert_kernels.begin(), insert_kernels.end()); | |||
| i += insert_kernels.size(); | |||
| break; | |||
| } | |||
| all_kernels_->insert(all_kernels_->begin() + i + 1, insert_kernels.begin(), insert_kernels.end()); | |||
| i += insert_kernels.size(); | |||
| break; | |||
| default: | |||
| MS_LOG(DEBUG) << "Insert Nothing on kernel " << kernel->name(); | |||
| } | |||
| default: | |||
| MS_LOG(DEBUG) << "Insert Nothing on kernel " << kernel->name(); | |||
| } | |||
| } | |||
| return RET_OK; | |||
| @@ -34,6 +34,7 @@ class NPUInsertTransformPass : public NPUBasePass { | |||
| int Run() override; | |||
| private: | |||
| int GetInsertState(kernel::LiteKernel *kernel); | |||
| int InsertPreNodes(kernel::LiteKernel *kernel, std::vector<kernel::LiteKernel *> *trans_kernels); | |||
| int InsertPostNodes(kernel::LiteKernel *kernel, std::vector<kernel::LiteKernel *> *trans_kernels); | |||
| @@ -25,7 +25,10 @@ | |||
| namespace mindspore::lite { | |||
| using kernel::KERNEL_ARCH::kCPU; | |||
| using kernel::KERNEL_ARCH::kNPU; | |||
// Per-op indices of constant input tensors that the NPU passes must not treat
// as activation inputs when rewiring tensors:
//   Split: 1; PadFusion: 1; StridedSlice: 1, 2, 3 (begin / end / stride).
std::unordered_map<schema::PrimitiveType, std::set<int>> nodes2const_index{
  {schema::PrimitiveType_Split, {1}},
  {schema::PrimitiveType_PadFusion, {1}},
  {schema::PrimitiveType_StridedSlice, {1, 2, 3}}};
| kernel::LiteKernel *NPUPassUtils::CreateNchw2NhwcKernel(const std::vector<Tensor *> &in_tensors, | |||
| const std::vector<Tensor *> &out_tensors, | |||
| const InnerContext *ctx, const std::string &name) { | |||
| @@ -125,8 +128,8 @@ void NPUPassUtils::UpdateNC2NHTransNodePreKernel(kernel::LiteKernel *pre_kernel, | |||
| } | |||
| std::copy(trans_kernels.begin(), trans_kernels.end(), std::back_inserter(cur_out_kernels)); | |||
| pre_kernel->set_out_kernels(cur_out_kernels); | |||
| // For kernel before trans, the output tensor is used for output tensor of trans, so replace the output tensor with | |||
| // the input tensor of trans. | |||
| // For kernel before trans, the output tensor is used for output tensor of trans, so replace the output tensor | |||
| // with the input tensor of trans. | |||
| pre_kernel->set_out_tensors({trans_kernels.at(0)->in_tensors().at(0)}); | |||
| } | |||
| @@ -158,7 +161,7 @@ void NPUPassUtils::UpdateNC2NHTransNodePostKernel(kernel::LiteKernel *kernel, ke | |||
| Tensor *old_in_tensor = nullptr; | |||
| // find out which input tensor of post_kernel should be updated | |||
| for (size_t i = 0; i < post_in_tensors.size(); ++i) { | |||
| if (KernelInputFromKernel(post_kernel, i) == kernel) { | |||
| if (KernelInputFromKernel(post_kernel, post_in_tensors.at(i)) == kernel) { | |||
| old_in_tensor = post_in_tensors.at(i); | |||
| break; | |||
| } | |||
| @@ -219,17 +222,16 @@ bool NPUPassUtils::IsNchw2Nhwc(const kernel::LiteKernel *kernel) { | |||
| } | |||
| return false; | |||
| } | |||
| kernel::LiteKernel *NPUPassUtils::KernelInputFromKernel(const kernel::LiteKernel *kernel, size_t in_tensor_index) { | |||
| kernel::LiteKernel *NPUPassUtils::KernelInputFromKernel(const kernel::LiteKernel *kernel, Tensor *in_tensor) { | |||
| // given kernel and input tensor index, get which kernel output this tensor. | |||
| // If input tensor is graph input, return nullptr. | |||
| if (kernel == nullptr) { | |||
| return nullptr; | |||
| } | |||
| auto tensor = kernel->in_tensors().at(in_tensor_index); | |||
| auto in_kernels = kernel->in_kernels(); | |||
| auto output_contain = [tensor](const kernel::LiteKernel *kernel) { | |||
| auto output_contain = [in_tensor](const kernel::LiteKernel *kernel) { | |||
| auto out_tensors = kernel->out_tensors(); | |||
| return std::find(out_tensors.begin(), out_tensors.end(), tensor) != out_tensors.end(); | |||
| return std::find(out_tensors.begin(), out_tensors.end(), in_tensor) != out_tensors.end(); | |||
| }; | |||
| auto it = std::find_if(in_kernels.begin(), in_kernels.end(), output_contain); | |||
| if (it == in_kernels.end()) { | |||
| @@ -238,10 +240,57 @@ kernel::LiteKernel *NPUPassUtils::KernelInputFromKernel(const kernel::LiteKernel | |||
| return *it; | |||
| } | |||
| std::vector<Tensor *> NPUPassUtils::GetNonConstInputs(kernel::LiteKernel *kernel) { | |||
| if (kernel == nullptr) { | |||
| return std::vector<Tensor *>{}; | |||
| } | |||
| auto type = static_cast<schema::PrimitiveType>(kernel->op_parameter()->type_); | |||
| auto it = nodes2const_index.find(type); | |||
| if (it != nodes2const_index.end()) { | |||
| auto const_input_indices = it->second; | |||
| std::vector<Tensor *> non_const_in_tensors; | |||
| auto in_tensors = kernel->in_tensors(); | |||
| for (auto i = 0; i < in_tensors.size(); ++i) { | |||
| if (const_input_indices.find(i) == const_input_indices.end()) { | |||
| non_const_in_tensors.push_back(in_tensors[i]); | |||
| } | |||
| } | |||
| return non_const_in_tensors; | |||
| } | |||
| return kernel->in_tensors(); | |||
| } | |||
| bool NPUPassUtils::Scale4dCase(const kernel::LiteKernel *kernel) { | |||
| MS_ASSERT(kernel != nullptr && kernel->op_parameter() != nullptr); | |||
| auto scale_param = reinterpret_cast<ScaleParameter *>(kernel->op_parameter()); | |||
| auto in_tensor = kernel->in_tensors().at(1); | |||
| return in_tensor->shape().size() == 1 && (scale_param->axis_ == 3 || scale_param->axis_ == -1); | |||
| } | |||
| void NPUPassUtils::AssistDataNHWC2NCHW(int *data, size_t unit_size) { | |||
| MS_ASSERT(data != nullptr); | |||
| for (size_t i = 0; i < unit_size; ++i) { | |||
| int c = data[3 * unit_size + i]; | |||
| // n h w c | |||
| // n c h w | |||
| data[3 * unit_size + i] = data[2 * unit_size + i]; | |||
| data[2 * unit_size + i] = data[unit_size + i]; | |||
| data[unit_size + i] = c; | |||
| } | |||
| } | |||
| int NPUPassUtils::MaskDataNHWC2NCHW(int mask) { | |||
| int mask_vec[4]; | |||
| for (int i = 0; i < 4; ++i) { | |||
| mask_vec[i] = (uint32_t)(mask) & (1 << i); | |||
| } | |||
| AssistDataNHWC2NCHW(mask_vec, 1); | |||
| int ret = 0; | |||
| for (int i = 0; i < 4; ++i) { | |||
| if (mask_vec[i]) { | |||
| ret += 1 << i; | |||
| } | |||
| } | |||
| return ret; | |||
| } | |||
| } // namespace mindspore::lite | |||
| @@ -17,9 +17,12 @@ | |||
| #ifndef MINDSPORE_LITE_SRC_RUNTIME_AGENT_NPU_OPTIMIZER_NPU_PASS_UTILS_H_ | |||
| #define MINDSPORE_LITE_SRC_RUNTIME_AGENT_NPU_OPTIMIZER_NPU_PASS_UTILS_H_ | |||
| #include <vector> | |||
| #include <set> | |||
| #include <string> | |||
| #include <unordered_map> | |||
| #include "src/lite_kernel.h" | |||
| namespace mindspore::lite { | |||
| extern std::unordered_map<schema::PrimitiveType, std::set<int>> nodes2const_index; | |||
| class NPUPassUtils { | |||
| public: | |||
| static kernel::LiteKernel *CreateNchw2NhwcKernel(const std::vector<Tensor *> &in_tensors, | |||
| @@ -52,8 +55,11 @@ class NPUPassUtils { | |||
| static bool IsNhwc2Nchw(const kernel::LiteKernel *kernel); | |||
| static bool IsNchw2Nhwc(const kernel::LiteKernel *kernel); | |||
| static kernel::LiteKernel *KernelInputFromKernel(const kernel::LiteKernel *kernel, size_t in_tensor_index); | |||
| static kernel::LiteKernel *KernelInputFromKernel(const kernel::LiteKernel *kernel, Tensor *in_tensor); | |||
| static std::vector<Tensor *> GetNonConstInputs(kernel::LiteKernel *kernel); | |||
| static bool Scale4dCase(const kernel::LiteKernel *kernel); | |||
| static void AssistDataNHWC2NCHW(int *data, size_t unit_size); | |||
| static int MaskDataNHWC2NCHW(int mask); | |||
| }; | |||
| } // namespace mindspore::lite | |||
| #endif // MINDSPORE_LITE_SRC_RUNTIME_AGENT_NPU_OPTIMIZER_NPU_PASS_UTILS_H_ | |||
| @@ -14,7 +14,6 @@ | |||
| * limitations under the License. | |||
| */ | |||
| #include "src/runtime/agent/npu/optimizer/npu_transform_pass.h" | |||
| #include <set> | |||
| #include <vector> | |||
| #include "src/lite_kernel.h" | |||
| #include "src/runtime/agent/npu/npu_manager.h" | |||
| @@ -22,7 +21,7 @@ | |||
| namespace mindspore::lite { | |||
| using kernel::KERNEL_ARCH::kNPU; | |||
| static std::set<mindspore::schema::PrimitiveType> npu_trans_nodes = { | |||
| std::set<mindspore::schema::PrimitiveType> npu_trans_nodes = { | |||
| schema::PrimitiveType_Conv2DFusion, schema::PrimitiveType_Conv2dTransposeFusion, schema::PrimitiveType_Resize, | |||
| schema::PrimitiveType_MaxPoolFusion, schema::PrimitiveType_AvgPoolFusion, schema::PrimitiveType_ScaleFusion}; | |||
| @@ -16,11 +16,14 @@ | |||
| #ifndef MINDSPORE_LITE_SRC_RUNTIME_AGENT_NPU_OPTIMIZER_NPU_TRANSFORM_PASS_H_ | |||
| #define MINDSPORE_LITE_SRC_RUNTIME_AGENT_NPU_OPTIMIZER_NPU_TRANSFORM_PASS_H_ | |||
| #include <set> | |||
| #include <vector> | |||
| #include "src/lite_kernel.h" | |||
| #include "src/runtime/agent/npu/optimizer/npu_base_pass.h" | |||
| namespace mindspore::lite { | |||
| extern std::set<mindspore::schema::PrimitiveType> npu_trans_nodes; | |||
| class NPUTransformPass : public NPUBasePass { | |||
| public: | |||
| int Run() override; | |||
| @@ -31,7 +31,7 @@ int PadNPUKernel::IsSupport(const std::vector<lite::Tensor *> &inputs, const std | |||
| } | |||
| if (inputs.size() >= 2 && inputs[1]->data_c() != nullptr) { | |||
| for (int i = 0; i < inputs[1]->ElementsNum(); i++) { | |||
| paddings_.push_back(static_cast<int *>(inputs[1]->data_c())[i]); | |||
| param_->paddings_[i] = static_cast<int *>(inputs[1]->data_c())[i]; | |||
| } | |||
| } else { | |||
| MS_LOG(WARNING) << "NPU axis is attribute."; | |||
| @@ -50,7 +50,7 @@ int PadNPUKernel::SetNPUInputs(const std::vector<lite::Tensor *> &inputs, const | |||
| int size = static_cast<int>(param_->padding_length / 2); | |||
| ge::TensorDesc padding_tensor_desc(ge::Shape({size, 2}), ge::FORMAT_NCHW, ge::DT_INT32); | |||
| ge::TensorPtr padding_tensor = std::make_shared<hiai::Tensor>(padding_tensor_desc); | |||
| padding_tensor->SetData(reinterpret_cast<uint8_t *>(paddings_.data()), 2 * size * sizeof(int)); | |||
| padding_tensor->SetData(reinterpret_cast<uint8_t *>(param_->paddings_), 2 * size * sizeof(int)); | |||
| hiai_paddings_ = new hiai::op::Const(name_ + "paddings"); | |||
| hiai_paddings_->set_attr_value(padding_tensor); | |||
| @@ -39,7 +39,6 @@ class PadNPUKernel : public NPUKernel { | |||
| private: | |||
| hiai::op::PadV2 *op_ = nullptr; | |||
| PadParameter *param_; | |||
| std::vector<int> paddings_; | |||
| hiai::op::Const *hiai_paddings_ = nullptr; | |||
| hiai::op::Const *hiai_constant_ = nullptr; | |||
| }; | |||
| @@ -77,3 +77,5 @@ ml_video_edit_img_segment_adaptise_pb2tflite.tflite 0.5 2 | |||
| ml_video_edit_imitate_filter.onnx 103 | |||
| hdc_mobilenet_1w_class.onnx 20 | |||
| hdc_age_medium 504 | |||
| posenet_mobilenet_float_075_1_default_1.tflite 395 | |||
| nasnet_mobile.tflite 1 | |||