diff --git a/mindspore/lite/src/runtime/agent/npu/npu_manager.cc b/mindspore/lite/src/runtime/agent/npu/npu_manager.cc index e841db7a33..6cda1cdc29 100644 --- a/mindspore/lite/src/runtime/agent/npu/npu_manager.cc +++ b/mindspore/lite/src/runtime/agent/npu/npu_manager.cc @@ -49,6 +49,7 @@ bool NPUManager::CheckEMUIVersion() { auto version = emui_str.substr(pos + 1); int ret = CompareVersion(version, "10.0.0"); if (ret < 0) { + MS_LOG(WARNING) << "EMUI version " << version << " less than 10.0.0"; return false; } } @@ -80,8 +81,9 @@ bool NPUManager::CheckDDKVersion() { auto client = std::make_shared(); if (client->GetVersion() != nullptr) { std::string version = client->GetVersion(); - int ret = CompareVersion(version, "100.330.010.011"); + int ret = CompareVersion(version, "100.320.010.023"); if (ret < 0) { + MS_LOG(WARNING) << "DDK Version " << version << " less than 100.320.010.023"; return false; } } @@ -96,7 +98,7 @@ bool NPUManager::IsSupportNPU() { MS_LOG(INFO) << "The current device support NPU."; } else { is_support_ = false; - MS_LOG(INFO) << "The current device NOT SUPPORT NPU."; + MS_LOG(WARNING) << "The current device NOT SUPPORT NPU."; } return is_support_; } else { @@ -130,6 +132,7 @@ bool NPUManager::IsKirinChip() { cpu_info.close(); return true; } else { + MS_LOG(WARNING) << "Unsupported KirinChip " << kirin_number; cpu_info.close(); return false; } diff --git a/mindspore/lite/src/runtime/agent/npu/optimizer/npu_fusion_pass.cc b/mindspore/lite/src/runtime/agent/npu/optimizer/npu_fusion_pass.cc index beab4db676..65d8c6bfb4 100644 --- a/mindspore/lite/src/runtime/agent/npu/optimizer/npu_fusion_pass.cc +++ b/mindspore/lite/src/runtime/agent/npu/optimizer/npu_fusion_pass.cc @@ -15,6 +15,7 @@ */ #include "src/runtime/agent/npu/optimizer/npu_fusion_pass.h" #include +#include "src/runtime/agent/npu/optimizer/npu_pass_utils.h" #include "src/lite_kernel.h" #include "nnacl/concat_parameter.h" @@ -22,14 +23,16 @@ namespace mindspore::lite { bool CheckFusion(kernel::LiteKernel *kernel) { auto pre_flag = std::all_of(kernel->in_kernels().begin(), kernel->in_kernels().end(), [](const kernel::LiteKernel *in_kernel) { - return in_kernel->Type() == schema::PrimitiveType_Nchw2Nhwc && in_kernel->out_kernels().size() == 1; + return NPUPassUtils::IsNchw2Nhwc(const_cast(in_kernel)) && + in_kernel->out_kernels().size() == 1; }); if (!pre_flag) { return false; } - auto post_flag = std::all_of( - kernel->out_kernels().begin(), kernel->out_kernels().end(), - [](const kernel::LiteKernel *out_kernel) { return out_kernel->Type() == schema::PrimitiveType_Nhwc2Nchw; }); + auto post_flag = + std::all_of(kernel->out_kernels().begin(), kernel->out_kernels().end(), [](const kernel::LiteKernel *out_kernel) { + return NPUPassUtils::IsNhwc2Nchw(const_cast(out_kernel)); + }); return post_flag; } @@ -37,15 +40,17 @@ bool CheckFormatFusion(kernel::LiteKernel *kernel) { if (kernel->out_kernels().empty()) { return false; } - if (kernel->Type() == schema::PrimitiveType_Nhwc2Nchw) { - return std::all_of( - kernel->out_kernels().begin(), kernel->out_kernels().end(), - [](const kernel::LiteKernel *kernel) { return kernel->Type() == schema::PrimitiveType_Nchw2Nhwc; }); + if (NPUPassUtils::IsNhwc2Nchw(const_cast(kernel))) { + return std::all_of(kernel->out_kernels().begin(), kernel->out_kernels().end(), + [](const kernel::LiteKernel *kernel) { + return NPUPassUtils::IsNchw2Nhwc(const_cast(kernel)); + }); } - if (kernel->Type() == schema::PrimitiveType_Nchw2Nhwc) { - return std::all_of( - kernel->out_kernels().begin(), kernel->out_kernels().end(), - [](const kernel::LiteKernel *kernel) { return kernel->Type() == schema::PrimitiveType_Nhwc2Nchw; }); + if (NPUPassUtils::IsNchw2Nhwc(const_cast(kernel))) { + return std::all_of(kernel->out_kernels().begin(), kernel->out_kernels().end(), + [](const kernel::LiteKernel *kernel) { + return NPUPassUtils::IsNhwc2Nchw(const_cast(kernel)); + }); } return false; } @@ -60,6 +65,10 @@ void NPUFusionPass::RemoveAndFreeKernel(kernel::LiteKernel *cur_kernel) { void NPUFusionPass::UpdatePreKernels(kernel::LiteKernel *cur_kernel) { for (auto in_kernel : cur_kernel->in_kernels()) { + // graph in kernel + if (in_kernel->in_kernels().empty()) { + continue; + } auto pre_kernel = in_kernel->in_kernels()[0]; auto pre_out_kernels = pre_kernel->out_kernels(); @@ -85,6 +94,10 @@ void NPUFusionPass::UpdatePreKernels(kernel::LiteKernel *cur_kernel) { void NPUFusionPass::UpdatePostKernels(kernel::LiteKernel *cur_kernel) { for (auto out_kernel : cur_kernel->out_kernels()) { + // graph out kernel + if (out_kernel->out_kernels().empty()) { + continue; + } auto post_kernel = out_kernel->out_kernels()[0]; auto post_in_kernels = post_kernel->in_kernels(); @@ -183,22 +196,13 @@ int NPUFusionPass::ConcatFusion(kernel::LiteKernel *kernel) { int NPUFusionPass::FormatFusion(kernel::LiteKernel *kernel) { auto pre_kernel = kernel->in_kernels()[0]; auto in_tensor = kernel->in_tensors()[0]; - auto out_tensor = kernel->out_tensors()[0]; - auto tensor_itr = std::find(pre_kernel->out_tensors().begin(), pre_kernel->out_tensors().end(), in_tensor); - if (tensor_itr != pre_kernel->out_tensors().end()) { - in_tensor = *tensor_itr; - } else { - MS_LOG(ERROR) << "Can't find the connneted tensor between kernel " << kernel->name() << " and it's pre_kernel."; - return RET_ERROR; - } - std::vector pre_insert_kernels; for (const auto &trans_kernel : kernel->out_kernels()) { for (const auto &post_kernel : trans_kernel->out_kernels()) { // update tensor auto tensors_vec = post_kernel->in_tensors(); for (size_t i = 0; i < tensors_vec.size(); i++) { - if (tensors_vec[i] == out_tensor) { + if (tensors_vec[i] == trans_kernel->out_tensors()[0]) { tensors_vec[i] = in_tensor; break; } @@ -218,10 +222,7 @@ int NPUFusionPass::FormatFusion(kernel::LiteKernel *kernel) { RemoveAndFreeKernel(trans_kernel); } } - auto pre_out_kernels = pre_kernel->out_kernels(); - auto itr = find(pre_out_kernels.begin(), pre_out_kernels.end(), kernel); - pre_out_kernels.insert(itr, pre_insert_kernels.begin(), pre_insert_kernels.end()); - pre_kernel->set_in_kernels(pre_out_kernels); + pre_kernel->set_out_kernels(pre_insert_kernels); RemoveAndFreeKernel(kernel); return RET_OK; } @@ -229,7 +230,8 @@ int NPUFusionPass::FormatFusion(kernel::LiteKernel *kernel) { int NPUFusionPass::Run() { for (size_t i = 0; i < kernels->size(); i++) { auto kernel = (*kernels)[i]; - if (kernel->Type() == schema::PrimitiveType_Nchw2Nhwc || kernel->Type() == schema::PrimitiveType_Nchw2Nhwc) { + if (NPUPassUtils::IsNchw2Nhwc(const_cast(kernel)) || + NPUPassUtils::IsNhwc2Nchw(const_cast(kernel))) { if (CheckFormatFusion(kernel)) { i--; FormatFusion(kernel); diff --git a/mindspore/lite/src/runtime/agent/npu/optimizer/npu_insert_transform_pass.cc b/mindspore/lite/src/runtime/agent/npu/optimizer/npu_insert_transform_pass.cc index e321787f4a..1cc6a93dc3 100644 --- a/mindspore/lite/src/runtime/agent/npu/optimizer/npu_insert_transform_pass.cc +++ b/mindspore/lite/src/runtime/agent/npu/optimizer/npu_insert_transform_pass.cc @@ -30,11 +30,13 @@ int GetInsertState(kernel::LiteKernel *kernel) { return InsertNone; } auto pre_flag = - std::all_of(kernel->in_kernels().begin(), kernel->in_kernels().end(), - [](const kernel::LiteKernel *kernel) { return kernel->Type() == schema::PrimitiveType_Nchw2Nhwc; }); + std::all_of(kernel->in_kernels().begin(), kernel->in_kernels().end(), [](const kernel::LiteKernel *kernel) { + return NPUPassUtils::IsNchw2Nhwc(const_cast(kernel)); + }); auto post_flag = - std::all_of(kernel->out_kernels().begin(), kernel->out_kernels().end(), - [](const kernel::LiteKernel *kernel) { return kernel->Type() == schema::PrimitiveType_Nhwc2Nchw; }); + std::all_of(kernel->out_kernels().begin(), kernel->out_kernels().end(), [](const kernel::LiteKernel *kernel) { + return NPUPassUtils::IsNhwc2Nchw(const_cast(kernel)); + }); if (pre_flag && !post_flag) { return PostInsert; } @@ -48,7 +50,7 @@ int NPUInsertTransformPass::InsertPreNode(const InnerContext *context, kernel::L std::vector *trans_kernels, std::vector *all_tensors) { for (auto in_kernel : kernel->in_kernels()) { - if (in_kernel->Type() == schema::PrimitiveType_Nchw2Nhwc) { + if (NPUPassUtils::IsNchw2Nhwc(const_cast(in_kernel))) { continue; } auto nhwc_shape = in_kernel->out_tensors()[0]->shape(); @@ -86,7 +88,7 @@ int NPUInsertTransformPass::InsertPostNode(const InnerContext *context, kernel:: std::vector *trans_kernels, std::vector *all_tensors) { for (auto out_kernel : kernel->out_kernels()) { - if (out_kernel->Type() == schema::PrimitiveType_Nhwc2Nchw) { + if (NPUPassUtils::IsNhwc2Nchw(const_cast(out_kernel))) { continue; } auto nhwc_shape = kernel->out_tensors()[0]->shape(); diff --git a/mindspore/lite/src/runtime/agent/npu/optimizer/npu_pass_utils.cc b/mindspore/lite/src/runtime/agent/npu/optimizer/npu_pass_utils.cc index 1e91038d86..ac0d2d686f 100644 --- a/mindspore/lite/src/runtime/agent/npu/optimizer/npu_pass_utils.cc +++ b/mindspore/lite/src/runtime/agent/npu/optimizer/npu_pass_utils.cc @@ -14,17 +14,19 @@ * limitations under the License. */ -#include "src/kernel_registry.h" -#include "src/ops/nhwc2nchw.h" -#include "src/ops/nchw2nhwc.h" #include "src/runtime/agent/npu/optimizer/npu_pass_utils.h" +#include "src/ops/transpose.h" +#include "nnacl/transpose.h" +#include "src/ops/populate/populate_register.h" +#include "src/runtime/kernel/arm/fp32/transpose_fp32.h" + namespace mindspore::lite { using kernel::KERNEL_ARCH::kCPU; using kernel::KERNEL_ARCH::kNPU; -PrimitiveC *NPUPassUtils::CreateNchw2NhwcPrimitive() { +PrimitiveC *NPUPassUtils::CreateTransposePrimitive() { flatbuffers::FlatBufferBuilder fbb(1024); auto val_offset = schema::CreateNchw2Nhwc(fbb); - auto prim_offset = schema::CreatePrimitive(fbb, schema::PrimitiveType_Nchw2Nhwc, val_offset.o); + auto prim_offset = schema::CreatePrimitive(fbb, schema::PrimitiveType_Transpose, val_offset.o); fbb.Finish(prim_offset); auto buf = fbb.GetBufferPointer(); if (buf == nullptr) { @@ -39,56 +41,72 @@ PrimitiveC *NPUPassUtils::CreateNchw2NhwcPrimitive() { return nullptr; } memcpy(primitive_buf, buf, fbb.GetSize()); - auto *primitive = PrimitiveC::NewPrimitiveC(flatbuffers::GetRoot(primitive_buf)); + auto *primitive = PrimitiveC::NewPrimitiveC(flatbuffers::GetRoot(primitive_buf)); free(primitive_buf); fbb.Clear(); return primitive; } -PrimitiveC *NPUPassUtils::CreateNhwc2NchwPrimitive() { - flatbuffers::FlatBufferBuilder fbb(1024); - auto val_offset = schema::CreateNhwc2Nchw(fbb); - auto prim_offset = schema::CreatePrimitive(fbb, schema::PrimitiveType_Nhwc2Nchw, val_offset.o); - fbb.Finish(prim_offset); - auto buf = fbb.GetBufferPointer(); - if (buf == nullptr) { - MS_LOG(ERROR) << "GetBufferPointer return nullptr"; - fbb.Clear(); +kernel::LiteKernel *NPUPassUtils::CreateNchw2NhwcKernel(const std::vector &in_tensors, + const std::vector &out_tensors, + const InnerContext *ctx, const std::string &name) { + kernel::KernelKey key{kCPU, kNumberTypeFloat32, schema::PrimitiveType_Transpose}; + auto nchw2nhwc_primitive = CreateTransposePrimitive(); + auto *transpose_param = reinterpret_cast(malloc(sizeof(TransposeParameter))); + if (transpose_param == nullptr) { + MS_LOG(ERROR) << "malloc TransposeParameter failed."; return nullptr; } - auto primitive_buf = reinterpret_cast(malloc(fbb.GetSize())); - if (primitive_buf == nullptr) { - MS_LOG(ERROR) << "Malloc primitive buffer failed."; - fbb.Clear(); + memset(transpose_param, 0, sizeof(TransposeParameter)); + transpose_param->op_parameter_.type_ = nchw2nhwc_primitive->Type(); + transpose_param->perm_[0] = 0; + transpose_param->perm_[1] = 2; + transpose_param->perm_[2] = 3; + transpose_param->perm_[3] = 1; + transpose_param->num_axes_ = 4; + + auto kernel = new (std::nothrow) kernel::TransposeCPUKernel(reinterpret_cast(transpose_param), + in_tensors, out_tensors, ctx, nchw2nhwc_primitive); + if (kernel != nullptr) { + kernel->set_desc(key); + } else { + MS_LOG(ERROR) << "New Nchw2Nhwc Kernel failed."; return nullptr; } - memcpy(primitive_buf, buf, fbb.GetSize()); - auto *primitive = PrimitiveC::NewPrimitiveC(flatbuffers::GetRoot(primitive_buf)); - free(primitive_buf); - fbb.Clear(); - return primitive; -} -kernel::LiteKernel *NPUPassUtils::CreateNchw2NhwcKernel(const std::vector &in_tensors, - const std::vector &out_tensors, - const InnerContext *ctx, const std::string &name) { - kernel::KernelKey key{kCPU, kNumberTypeFloat32, schema::PrimitiveType_Nchw2Nhwc}; - auto nchw2nhwc_primitive = CreateNchw2NhwcPrimitive(); - auto *nchw2nhwc_kernel = - KernelRegistry::GetInstance()->GetKernel(in_tensors, out_tensors, nchw2nhwc_primitive, ctx, key); - nchw2nhwc_kernel->set_name(name); - return nchw2nhwc_kernel; + kernel->set_name(name); + return kernel; } kernel::LiteKernel *NPUPassUtils::CreateNhwc2NchwKernel(const std::vector &in_tensors, const std::vector &out_tensors, const InnerContext *ctx, const std::string &name) { - kernel::KernelKey key{kCPU, kNumberTypeFloat32, schema::PrimitiveType_Nhwc2Nchw}; - auto nhwc2nchw_primitive = CreateNhwc2NchwPrimitive(); - auto *nhwc2nchw_kernel = - KernelRegistry::GetInstance()->GetKernel(in_tensors, out_tensors, nhwc2nchw_primitive, ctx, key); - nhwc2nchw_kernel->set_name(name); - return nhwc2nchw_kernel; + kernel::KernelKey key{kCPU, kNumberTypeFloat32, schema::PrimitiveType_Transpose}; + auto nhwc2nchw_primitive = CreateTransposePrimitive(); + auto *transpose_param = reinterpret_cast(malloc(sizeof(TransposeParameter))); + if (transpose_param == nullptr) { + MS_LOG(ERROR) << "malloc TransposeParameter failed."; + return nullptr; + } + memset(transpose_param, 0, sizeof(TransposeParameter)); + transpose_param->op_parameter_.type_ = nhwc2nchw_primitive->Type(); + transpose_param->perm_[0] = 0; + transpose_param->perm_[1] = 3; + transpose_param->perm_[2] = 1; + transpose_param->perm_[3] = 2; + transpose_param->num_axes_ = 4; + + auto kernel = new (std::nothrow) kernel::TransposeCPUKernel(reinterpret_cast(transpose_param), + in_tensors, out_tensors, ctx, nhwc2nchw_primitive); + if (kernel != nullptr) { + kernel->set_desc(key); + } else { + MS_LOG(ERROR) << "New Nhwc2Nchw Kernel failed."; + return nullptr; + } + + kernel->set_name(name); + return kernel; } void NPUPassUtils::UpdateKernel(kernel::LiteKernel *kernel, const std::vector &in_kernels, @@ -173,4 +191,39 @@ void NPUPassUtils::UpdateNC2NHTransNodeAfterKernel(kernel::LiteKernel *kernel, k post_kernel->set_in_kernels(post_in_kernels); post_kernel->set_in_tensors({post_in_tensors}); } + +bool NPUPassUtils::IsNhwc2Nchw(kernel::LiteKernel *kernel) { + if (kernel->Type() != schema::PrimitiveType_Transpose) { + return false; + } + auto parameter = reinterpret_cast(kernel->op_parameter()); + if (parameter->num_axes_ != 4) { + return false; + } + + std::vector perm = {parameter->perm_[0], parameter->perm_[1], parameter->perm_[2], parameter->perm_[3]}; + std::vector nh2nc_perm = {0, 3, 1, 2}; + if (nh2nc_perm == perm) { + return true; + } + return false; +} + +bool NPUPassUtils::IsNchw2Nhwc(kernel::LiteKernel *kernel) { + if (kernel->Type() != schema::PrimitiveType_Transpose) { + return false; + } + auto parameter = reinterpret_cast(kernel->op_parameter()); + if (parameter->num_axes_ != 4) { + return false; + } + + std::vector perm = {parameter->perm_[0], parameter->perm_[1], parameter->perm_[2], parameter->perm_[3]}; + std::vector nh2nc_perm = {0, 2, 3, 1}; + if (nh2nc_perm == perm) { + return true; + } + return false; +} + } // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/agent/npu/optimizer/npu_pass_utils.h b/mindspore/lite/src/runtime/agent/npu/optimizer/npu_pass_utils.h index c5a3cc1eab..d7bdedf0e8 100644 --- a/mindspore/lite/src/runtime/agent/npu/optimizer/npu_pass_utils.h +++ b/mindspore/lite/src/runtime/agent/npu/optimizer/npu_pass_utils.h @@ -47,10 +47,12 @@ class NPUPassUtils { static void UpdateNC2NHTransNodeAfterKernel(kernel::LiteKernel *kernel, kernel::LiteKernel *trans_kernel, kernel::LiteKernel *post_kernel); - private: - static PrimitiveC *CreateNchw2NhwcPrimitive(); + static bool IsNhwc2Nchw(kernel::LiteKernel *kernel); + + static bool IsNchw2Nhwc(kernel::LiteKernel *kernel); - static PrimitiveC *CreateNhwc2NchwPrimitive(); + private: + static PrimitiveC *CreateTransposePrimitive(); }; } // namespace mindspore::lite #endif // MINDSPORE_LITE_SRC_RUNTIME_AGENT_NPU_OPTIMIZER_NPU_PASS_UTILS_H_ diff --git a/mindspore/lite/src/runtime/agent/npu/subgraph_npu_kernel.cc b/mindspore/lite/src/runtime/agent/npu/subgraph_npu_kernel.cc index 2718732209..f8f4641b6d 100644 --- a/mindspore/lite/src/runtime/agent/npu/subgraph_npu_kernel.cc +++ b/mindspore/lite/src/runtime/agent/npu/subgraph_npu_kernel.cc @@ -103,7 +103,6 @@ int SubGraphNpuKernel::BuildNPUInputOp() { // input come from npu auto npu_op = reinterpret_cast(in_kernel)->GetNPUOp(); if (npu_op != nullptr) { - npu_op->GetOutputDesc(0).GetName(); node_input_op.push_back(npu_op); is_weight_tensor = false; break; @@ -168,14 +167,13 @@ std::string SubGraphNpuKernel::GetOMModelName() { return this->name_ + ".om"; } int SubGraphNpuKernel::Init() { if (!is_compiled_) { + name_ = "kNpuSubGraph" + std::to_string(mindspore::lite::NPUManager::GetInstance()->index()); auto model_buffer_data = BuildIRModel(); if (model_buffer_data == nullptr) { MS_LOG(ERROR) << "Build IR model failed."; return RET_ERROR; } - name_ = "kNpuSubGraph" + std::to_string(mindspore::lite::NPUManager::GetInstance()->index()); - mindspore::lite::NPUManager::GetInstance()->AddModel(model_buffer_data, GetOMModelName(), context_->GetNpuInfo().frequency_); diff --git a/mindspore/lite/src/runtime/kernel/npu/cast_npu.cc b/mindspore/lite/src/runtime/kernel/npu/cast_npu.cc index 338694ac01..3e9bff42ac 100644 --- a/mindspore/lite/src/runtime/kernel/npu/cast_npu.cc +++ b/mindspore/lite/src/runtime/kernel/npu/cast_npu.cc @@ -36,7 +36,7 @@ int CastNPUKernel::SetNPUInputs(const std::vector &inputs, const } op_->set_input_x(*npu_inputs[0]); op_->set_attr_dst_dtype(lite::ConverterToNPUDataType(static_cast(cast_parameter_->dst_type_))); - op_->set_attr_src_dtype(lite::ConverterToNPUDataType(static_cast(cast_parameter_->src_type_))); + op_->set_attr_src_dtype(lite::ConverterToNPUDataType(static_cast(inputs[0]->data_type()))); return RET_OK; } diff --git a/mindspore/lite/test/models_npu.cfg b/mindspore/lite/test/models_npu.cfg index 91ea4773c0..f189ebf502 100644 --- a/mindspore/lite/test/models_npu.cfg +++ b/mindspore/lite/test/models_npu.cfg @@ -1,3 +1,3 @@ -mobilenet_v1_1.0_224.tflite 1.5 -squeezenet.tflite 1.5 -inception_v3.tflite 0.5 +mobilenet_v1_1.0_224.tflite 2.5 +squeezenet.tflite 2.5 +inception_v3.tflite 1