| @@ -18,6 +18,9 @@ | |||
| #include "src/runtime/agent/npu/optimizer/npu_pass_utils.h" | |||
| #include "src/lite_kernel.h" | |||
| #include "nnacl/concat_parameter.h" | |||
| #include "nnacl/split_parameter.h" | |||
| #include "nnacl/pad_parameter.h" | |||
| #include "nnacl/strided_slice_parameter.h" | |||
| namespace mindspore::lite { | |||
| bool CheckFusion(kernel::LiteKernel *kernel) { | |||
| @@ -119,7 +122,7 @@ void NPUFusionPass::UpdatePostKernels(kernel::LiteKernel *cur_kernel) { | |||
| } | |||
| void UpdatePreTensors(kernel::LiteKernel *cur_kernel) { | |||
| auto tensors_vec = cur_kernel->in_tensors(); | |||
| auto tensors_vec = NPUPassUtils::GetNonConstInputs(cur_kernel); | |||
| for (auto in_kernel : cur_kernel->in_kernels()) { | |||
| lite::Tensor *cur_tensor = nullptr; | |||
| auto in_tensor = in_kernel->in_tensors()[0]; | |||
| @@ -136,6 +139,15 @@ void UpdatePreTensors(kernel::LiteKernel *cur_kernel) { | |||
| } | |||
| } | |||
| } | |||
| // add constant inputs back | |||
| if (nodes2const_index.find(static_cast<schema::PrimitiveType>(cur_kernel->op_parameter()->type_)) != | |||
| nodes2const_index.end()) { | |||
| tensors_vec.resize(cur_kernel->in_tensors().size()); | |||
| auto const_index = nodes2const_index[static_cast<schema::PrimitiveType>(cur_kernel->op_parameter()->type_)]; | |||
| for (auto index : const_index) { | |||
| tensors_vec[index] = cur_kernel->in_tensors()[index]; | |||
| } | |||
| } | |||
| cur_kernel->set_in_tensors(tensors_vec); | |||
| } | |||
| @@ -275,15 +287,75 @@ int NPUFusionPass::FormatFusion(kernel::LiteKernel *kernel) { | |||
| return RET_OK; | |||
| } | |||
| int NPUFusionPass::SplitFusion(kernel::LiteKernel *kernel) { | |||
| UpdateKernel(kernel); | |||
| auto split_param = reinterpret_cast<SplitParameter *>(kernel->op_parameter()); | |||
| split_param->split_dim_ = TransFormAxis(split_param->split_dim_); | |||
| return RET_OK; | |||
| } | |||
| int NPUFusionPass::PadFusion(kernel::LiteKernel *kernel) { | |||
| UpdateKernel(kernel); | |||
| auto pad_param = reinterpret_cast<PadParameter *>(kernel->op_parameter()); | |||
| int c1 = pad_param->paddings_[6]; | |||
| int c2 = pad_param->paddings_[7]; | |||
| // 0 1 2 3 4 5 6 7 | |||
| // n n h h w w c c | |||
| // n n c c h h w w | |||
| pad_param->paddings_[6] = pad_param->paddings_[4]; | |||
| pad_param->paddings_[7] = pad_param->paddings_[5]; | |||
| pad_param->paddings_[4] = pad_param->paddings_[2]; | |||
| pad_param->paddings_[5] = pad_param->paddings_[3]; | |||
| pad_param->paddings_[2] = c1; | |||
| pad_param->paddings_[3] = c2; | |||
| return RET_OK; | |||
| } | |||
| int NPUFusionPass::StridedSliceFusion(kernel::LiteKernel *kernel) { | |||
| // basic requirement: input is nhwc 4d | |||
| UpdateKernel(kernel); | |||
| auto param = reinterpret_cast<StridedSliceParameter *>(kernel->op_parameter()); | |||
| auto begin_tensor = kernel->in_tensors().at(1); | |||
| int *begin = reinterpret_cast<int *>(begin_tensor->data_c()); | |||
| (void)NPUPassUtils::AssistDataNHWC2NCHW(begin, 1); | |||
| auto end_tensor = kernel->in_tensors().at(2); | |||
| int *end = reinterpret_cast<int *>(end_tensor->data_c()); | |||
| NPUPassUtils::AssistDataNHWC2NCHW(end, 1); | |||
| auto stride_tensor = kernel->in_tensors().at(3); | |||
| if (kernel->in_tensors().size() == 5) { | |||
| stride_tensor = kernel->in_tensors().at(4); | |||
| } | |||
| int *stride = reinterpret_cast<int *>(stride_tensor->data_c()); | |||
| NPUPassUtils::AssistDataNHWC2NCHW(stride, 1); | |||
| param->begins_mask_ = NPUPassUtils::MaskDataNHWC2NCHW(param->begins_mask_); | |||
| param->ends_mask_ = NPUPassUtils::MaskDataNHWC2NCHW(param->ends_mask_); | |||
| param->ellipsisMask_ = NPUPassUtils::MaskDataNHWC2NCHW(param->ellipsisMask_); | |||
| param->newAxisMask_ = NPUPassUtils::MaskDataNHWC2NCHW(param->newAxisMask_); | |||
| param->shrinkAxisMask_ = NPUPassUtils::MaskDataNHWC2NCHW(param->shrinkAxisMask_); | |||
| return RET_OK; | |||
| } | |||
| int NPUFusionPass::Run() { | |||
| for (size_t i = 0; i < kernels->size(); i++) { | |||
| auto kernel = (*kernels)[i]; | |||
| if (CheckFusion(kernel)) { | |||
| switch (kernel->Type()) { | |||
| case schema::PrimitiveType_Split: | |||
| i -= kernel->in_kernels().size(); | |||
| SplitFusion(kernel); | |||
| continue; | |||
| case schema::PrimitiveType_Concat: | |||
| i -= kernel->in_kernels().size(); | |||
| ConcatFusion(kernel); | |||
| continue; | |||
| case schema::PrimitiveType_PadFusion: | |||
| i -= kernel->in_kernels().size(); | |||
| PadFusion(kernel); | |||
| continue; | |||
| case schema::PrimitiveType_StridedSlice: | |||
| i -= kernel->in_kernels().size(); | |||
| StridedSliceFusion(kernel); | |||
| continue; | |||
| case schema::PrimitiveType_AddFusion: | |||
| case schema::PrimitiveType_Activation: | |||
| case schema::PrimitiveType_Eltwise: | |||
| @@ -39,6 +39,9 @@ class NPUFusionPass : public NPUBasePass { | |||
| int CommonFusion(kernel::LiteKernel *kernel); | |||
| int ConcatFusion(kernel::LiteKernel *kernel); | |||
| int FormatFusion(kernel::LiteKernel *kernel); | |||
| int SplitFusion(kernel::LiteKernel *kernel); | |||
| int PadFusion(kernel::LiteKernel *kernel); | |||
| int StridedSliceFusion(kernel::LiteKernel *kernel); | |||
| private: | |||
| std::vector<kernel::LiteKernel *> *kernels; | |||
| @@ -23,8 +23,10 @@ namespace mindspore::lite { | |||
| using kernel::KERNEL_ARCH::kNPU; | |||
| enum InsertState { InsertNone, PreInsert, PostInsert, BothInsert }; | |||
| std::set<mindspore::schema::PrimitiveType> npu_insert_nodes = { | |||
| schema::PrimitiveType_Concat, schema::PrimitiveType_AddFusion, schema::PrimitiveType_Eltwise, | |||
| schema::PrimitiveType_Activation}; | |||
| schema::PrimitiveType_Concat, schema::PrimitiveType_AddFusion, schema::PrimitiveType_Eltwise, | |||
| schema::PrimitiveType_Activation, schema::PrimitiveType_Split, schema::PrimitiveType_PadFusion, | |||
| schema::PrimitiveType_StridedSlice, schema::PrimitiveType_Activation}; | |||
| // this pass goal is to minimize subgraphs generated | |||
| // by inserting nchw2nhwc or nhwc2nchw before or after the operator (e.g. concat, add, etc..) together with | |||
| // fusion pass. If transpose inserted are more than half of input output, we will insert remaining input | |||
| @@ -44,7 +46,7 @@ std::set<mindspore::schema::PrimitiveType> npu_insert_nodes = { | |||
| // so we won't insert nc2nh or nh2nc when op's in kernels and out kernels contains no nc2nh or nh2nc. | |||
| // This pass should be run after npu_transform_pass, which insert transpose for nchw-input-limited op like conv2d. | |||
| int GetInsertState(kernel::LiteKernel *kernel) { | |||
| int NPUInsertTransformPass::GetInsertState(kernel::LiteKernel *kernel) { | |||
| // filter out irrelevant kernel | |||
| if (npu_insert_nodes.find(kernel->Type()) == npu_insert_nodes.end()) { | |||
| return InsertNone; | |||
| @@ -52,15 +54,17 @@ int GetInsertState(kernel::LiteKernel *kernel) { | |||
| // current kernel is target kernel | |||
| // use out kernels to count how many out lines from current kernel | |||
| std::vector<Tensor *> in_tensors = NPUPassUtils::GetNonConstInputs(kernel); | |||
| size_t in_out_tensor_num = | |||
| kernel->in_tensors().size() + std::max(kernel->out_kernels().size(), static_cast<size_t>(1)); | |||
| in_tensors.size() + | |||
| std::max(std::max(kernel->out_kernels().size(), static_cast<size_t>(1)), kernel->out_tensors().size()); | |||
| size_t transpose_input_num = 0; | |||
| size_t transpose_output_num = 0; | |||
| bool need_pre_insert = false; | |||
| bool need_post_insert = false; | |||
| // count number of input tensor from nc2nh and output tensor to nh2nc | |||
| for (size_t i = 0; i < kernel->in_tensors().size(); ++i) { | |||
| auto in_kernel = NPUPassUtils::KernelInputFromKernel(kernel, i); | |||
| for (size_t i = 0; i < in_tensors.size(); ++i) { | |||
| auto in_kernel = NPUPassUtils::KernelInputFromKernel(kernel, in_tensors.at(i)); | |||
| if (NPUPassUtils::IsNchw2Nhwc(in_kernel)) { | |||
| transpose_input_num++; | |||
| } else { | |||
| @@ -81,21 +85,22 @@ int GetInsertState(kernel::LiteKernel *kernel) { | |||
| // won't insert any thing if num of transpose tensor is smaller than half of total input output. | |||
| // won't insert if total input output are all transpose tensor, the fusion pass will handle this. | |||
| size_t transpose_tensor_num = transpose_input_num + transpose_output_num; | |||
| if (transpose_tensor_num <= in_out_tensor_num / 2 || transpose_tensor_num == in_out_tensor_num) { | |||
| if (transpose_tensor_num == 0 || transpose_tensor_num * 2 < in_out_tensor_num || | |||
| transpose_tensor_num == in_out_tensor_num) { | |||
| return InsertNone; | |||
| } | |||
| InsertState ret; | |||
| if (need_pre_insert && !need_post_insert) { | |||
| return PreInsert; | |||
| } | |||
| if (need_pre_insert && need_post_insert) { | |||
| return BothInsert; | |||
| } | |||
| if (!need_pre_insert && need_post_insert) { | |||
| return PostInsert; | |||
| ret = PreInsert; | |||
| } else if (need_pre_insert && need_post_insert) { | |||
| ret = BothInsert; | |||
| } else if (!need_pre_insert && need_post_insert) { | |||
| ret = PostInsert; | |||
| } else { | |||
| ret = InsertNone; | |||
| } | |||
| return InsertNone; | |||
| return ret; | |||
| } | |||
| int NPUInsertTransformPass::InsertNode(kernel::LiteKernel *kernel, kernel::LiteKernel *post_kernel, | |||
| @@ -200,13 +205,20 @@ int NPUInsertTransformPass::InsertForOutputTensor(kernel::LiteKernel *kernel, ke | |||
| int NPUInsertTransformPass::InsertPreNodes(kernel::LiteKernel *kernel, | |||
| std::vector<kernel::LiteKernel *> *trans_kernels) { | |||
| int ret = RET_OK; | |||
| for (size_t i = 0; i < kernel->in_tensors().size(); ++i) { | |||
| auto pre_kernel = NPUPassUtils::KernelInputFromKernel(kernel, i); | |||
| auto in_tensors = NPUPassUtils::GetNonConstInputs(kernel); | |||
| for (auto tensor : in_tensors) { | |||
| auto pre_kernel = NPUPassUtils::KernelInputFromKernel(kernel, tensor); | |||
| if (NPUPassUtils::IsNchw2Nhwc(pre_kernel)) { | |||
| continue; | |||
| } | |||
| // if this tensor is input of graph, pre_kernel is nullptr. | |||
| ret = InsertForInputTensor(kernel, i, pre_kernel, trans_kernels); | |||
| auto it = find(kernel->in_tensors().begin(), kernel->in_tensors().end(), tensor); | |||
| if (it == kernel->in_tensors().end()) { | |||
| MS_LOG(ERROR) << "Find in tensor index error"; | |||
| return RET_ERROR; | |||
| } | |||
| size_t index = it - kernel->in_tensors().begin(); | |||
| ret = InsertForInputTensor(kernel, index, pre_kernel, trans_kernels); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Insert nhwc2nchw kernel and nchw2nhwc kernel before kernel " << kernel->name() << " failed."; | |||
| return ret; | |||
| @@ -249,59 +261,63 @@ int NPUInsertTransformPass::InsertPostNodes(kernel::LiteKernel *kernel, | |||
| int NPUInsertTransformPass::Run() { | |||
| std::vector<kernel::LiteKernel *> insert_kernels; | |||
| for (size_t i = 0; i < all_kernels_->size(); i++) { | |||
| auto kernel = (*all_kernels_)[i]; | |||
| if (kernel->desc().arch != kNPU) { | |||
| continue; | |||
| } | |||
| auto insert_state = GetInsertState(kernel); | |||
| insert_kernels.clear(); | |||
| // If the every output kernel is nhwc2nchw, insert | |||
| // modify loop index add post_kernels.size() to the next kernel in the origin vector | |||
| switch (insert_state) { | |||
| case PreInsert: { | |||
| auto ret = InsertPreNodes(kernel, &insert_kernels); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Insert nhwc2nchw kernel and nchw2nhwc kernel before kernel " << kernel->name() | |||
| << " failed."; | |||
| return RET_ERROR; | |||
| } | |||
| all_kernels_->insert(all_kernels_->begin() + i, insert_kernels.begin(), insert_kernels.end()); | |||
| i += insert_kernels.size(); | |||
| break; | |||
| for (int j = 0; j < 2; ++j) { | |||
| for (size_t i = 0; i < all_kernels_->size(); i++) { | |||
| auto kernel = (*all_kernels_)[i]; | |||
| if (kernel->desc().arch != kNPU) { | |||
| continue; | |||
| } | |||
| case PostInsert: { | |||
| auto ret = InsertPostNodes(kernel, &insert_kernels); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Insert nhwc2nchw kernel and nchw2nhwc kernel after kernel " << kernel->name() << " failed."; | |||
| return RET_ERROR; | |||
| auto insert_state = GetInsertState(kernel); | |||
| insert_kernels.clear(); | |||
| // If the every output kernel is nhwc2nchw, insert | |||
| // modify loop index add post_kernels.size() to the next kernel in the origin vector | |||
| switch (insert_state) { | |||
| case PreInsert: { | |||
| auto ret = InsertPreNodes(kernel, &insert_kernels); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Insert nhwc2nchw kernel and nchw2nhwc kernel before kernel " << kernel->name() | |||
| << " failed."; | |||
| return RET_ERROR; | |||
| } | |||
| all_kernels_->insert(all_kernels_->begin() + i, insert_kernels.begin(), insert_kernels.end()); | |||
| i += insert_kernels.size(); | |||
| break; | |||
| } | |||
| all_kernels_->insert(all_kernels_->begin() + i + 1, insert_kernels.begin(), insert_kernels.end()); | |||
| i += insert_kernels.size(); | |||
| break; | |||
| } | |||
| case BothInsert: { | |||
| auto ret = InsertPreNodes(kernel, &insert_kernels); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Insert nhwc2nchw kernel and nchw2nhwc kernel before kernel " << kernel->name() | |||
| << " failed."; | |||
| return RET_ERROR; | |||
| case PostInsert: { | |||
| auto ret = InsertPostNodes(kernel, &insert_kernels); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Insert nhwc2nchw kernel and nchw2nhwc kernel after kernel " << kernel->name() | |||
| << " failed."; | |||
| return RET_ERROR; | |||
| } | |||
| all_kernels_->insert(all_kernels_->begin() + i + 1, insert_kernels.begin(), insert_kernels.end()); | |||
| i += insert_kernels.size(); | |||
| break; | |||
| } | |||
| all_kernels_->insert(all_kernels_->begin() + i, insert_kernels.begin(), insert_kernels.end()); | |||
| i += insert_kernels.size(); | |||
| case BothInsert: { | |||
| auto ret = InsertPreNodes(kernel, &insert_kernels); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Insert nhwc2nchw kernel and nchw2nhwc kernel before kernel " << kernel->name() | |||
| << " failed."; | |||
| return RET_ERROR; | |||
| } | |||
| all_kernels_->insert(all_kernels_->begin() + i, insert_kernels.begin(), insert_kernels.end()); | |||
| i += insert_kernels.size(); | |||
| insert_kernels.clear(); | |||
| ret = InsertPostNodes(kernel, &insert_kernels); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Insert nhwc2nchw kernel and nchw2nhwc kernel after kernel " << kernel->name() << " failed."; | |||
| return RET_ERROR; | |||
| insert_kernels.clear(); | |||
| ret = InsertPostNodes(kernel, &insert_kernels); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Insert nhwc2nchw kernel and nchw2nhwc kernel after kernel " << kernel->name() | |||
| << " failed."; | |||
| return RET_ERROR; | |||
| } | |||
| all_kernels_->insert(all_kernels_->begin() + i + 1, insert_kernels.begin(), insert_kernels.end()); | |||
| i += insert_kernels.size(); | |||
| break; | |||
| } | |||
| all_kernels_->insert(all_kernels_->begin() + i + 1, insert_kernels.begin(), insert_kernels.end()); | |||
| i += insert_kernels.size(); | |||
| break; | |||
| default: | |||
| MS_LOG(DEBUG) << "Insert Nothing on kernel " << kernel->name(); | |||
| } | |||
| default: | |||
| MS_LOG(DEBUG) << "Insert Nothing on kernel " << kernel->name(); | |||
| } | |||
| } | |||
| return RET_OK; | |||
| @@ -34,6 +34,7 @@ class NPUInsertTransformPass : public NPUBasePass { | |||
| int Run() override; | |||
| private: | |||
| int GetInsertState(kernel::LiteKernel *kernel); | |||
| int InsertPreNodes(kernel::LiteKernel *kernel, std::vector<kernel::LiteKernel *> *trans_kernels); | |||
| int InsertPostNodes(kernel::LiteKernel *kernel, std::vector<kernel::LiteKernel *> *trans_kernels); | |||
| @@ -25,7 +25,10 @@ | |||
| namespace mindspore::lite { | |||
| using kernel::KERNEL_ARCH::kCPU; | |||
| using kernel::KERNEL_ARCH::kNPU; | |||
// Per-op indices of constant input tensors that the NPU passes must not treat
// as activation inputs when rewiring tensors:
//   Split: 1; PadFusion: 1; StridedSlice: 1, 2, 3 (begin / end / stride).
std::unordered_map<schema::PrimitiveType, std::set<int>> nodes2const_index{
  {schema::PrimitiveType_Split, {1}},
  {schema::PrimitiveType_PadFusion, {1}},
  {schema::PrimitiveType_StridedSlice, {1, 2, 3}}};
| kernel::LiteKernel *NPUPassUtils::CreateNchw2NhwcKernel(const std::vector<Tensor *> &in_tensors, | |||
| const std::vector<Tensor *> &out_tensors, | |||
| const InnerContext *ctx, const std::string &name) { | |||
| @@ -125,8 +128,8 @@ void NPUPassUtils::UpdateNC2NHTransNodePreKernel(kernel::LiteKernel *pre_kernel, | |||
| } | |||
| std::copy(trans_kernels.begin(), trans_kernels.end(), std::back_inserter(cur_out_kernels)); | |||
| pre_kernel->set_out_kernels(cur_out_kernels); | |||
| // For kernel before trans, the output tensor is used for output tensor of trans, so replace the output tensor with | |||
| // the input tensor of trans. | |||
| // For kernel before trans, the output tensor is used for output tensor of trans, so replace the output tensor | |||
| // with the input tensor of trans. | |||
| pre_kernel->set_out_tensors({trans_kernels.at(0)->in_tensors().at(0)}); | |||
| } | |||
| @@ -158,7 +161,7 @@ void NPUPassUtils::UpdateNC2NHTransNodePostKernel(kernel::LiteKernel *kernel, ke | |||
| Tensor *old_in_tensor = nullptr; | |||
| // find out which input tensor of post_kernel should be updated | |||
| for (size_t i = 0; i < post_in_tensors.size(); ++i) { | |||
| if (KernelInputFromKernel(post_kernel, i) == kernel) { | |||
| if (KernelInputFromKernel(post_kernel, post_in_tensors.at(i)) == kernel) { | |||
| old_in_tensor = post_in_tensors.at(i); | |||
| break; | |||
| } | |||
| @@ -219,17 +222,16 @@ bool NPUPassUtils::IsNchw2Nhwc(const kernel::LiteKernel *kernel) { | |||
| } | |||
| return false; | |||
| } | |||
| kernel::LiteKernel *NPUPassUtils::KernelInputFromKernel(const kernel::LiteKernel *kernel, size_t in_tensor_index) { | |||
| kernel::LiteKernel *NPUPassUtils::KernelInputFromKernel(const kernel::LiteKernel *kernel, Tensor *in_tensor) { | |||
| // given kernel and input tensor index, get which kernel output this tensor. | |||
| // If input tensor is graph input, return nullptr. | |||
| if (kernel == nullptr) { | |||
| return nullptr; | |||
| } | |||
| auto tensor = kernel->in_tensors().at(in_tensor_index); | |||
| auto in_kernels = kernel->in_kernels(); | |||
| auto output_contain = [tensor](const kernel::LiteKernel *kernel) { | |||
| auto output_contain = [in_tensor](const kernel::LiteKernel *kernel) { | |||
| auto out_tensors = kernel->out_tensors(); | |||
| return std::find(out_tensors.begin(), out_tensors.end(), tensor) != out_tensors.end(); | |||
| return std::find(out_tensors.begin(), out_tensors.end(), in_tensor) != out_tensors.end(); | |||
| }; | |||
| auto it = std::find_if(in_kernels.begin(), in_kernels.end(), output_contain); | |||
| if (it == in_kernels.end()) { | |||
| @@ -238,10 +240,57 @@ kernel::LiteKernel *NPUPassUtils::KernelInputFromKernel(const kernel::LiteKernel | |||
| return *it; | |||
| } | |||
| std::vector<Tensor *> NPUPassUtils::GetNonConstInputs(kernel::LiteKernel *kernel) { | |||
| if (kernel == nullptr) { | |||
| return std::vector<Tensor *>{}; | |||
| } | |||
| auto type = static_cast<schema::PrimitiveType>(kernel->op_parameter()->type_); | |||
| auto it = nodes2const_index.find(type); | |||
| if (it != nodes2const_index.end()) { | |||
| auto const_input_indices = it->second; | |||
| std::vector<Tensor *> non_const_in_tensors; | |||
| auto in_tensors = kernel->in_tensors(); | |||
| for (auto i = 0; i < in_tensors.size(); ++i) { | |||
| if (const_input_indices.find(i) == const_input_indices.end()) { | |||
| non_const_in_tensors.push_back(in_tensors[i]); | |||
| } | |||
| } | |||
| return non_const_in_tensors; | |||
| } | |||
| return kernel->in_tensors(); | |||
| } | |||
| bool NPUPassUtils::Scale4dCase(const kernel::LiteKernel *kernel) { | |||
| MS_ASSERT(kernel != nullptr && kernel->op_parameter() != nullptr); | |||
| auto scale_param = reinterpret_cast<ScaleParameter *>(kernel->op_parameter()); | |||
| auto in_tensor = kernel->in_tensors().at(1); | |||
| return in_tensor->shape().size() == 1 && (scale_param->axis_ == 3 || scale_param->axis_ == -1); | |||
| } | |||
| void NPUPassUtils::AssistDataNHWC2NCHW(int *data, size_t unit_size) { | |||
| MS_ASSERT(data != nullptr); | |||
| for (size_t i = 0; i < unit_size; ++i) { | |||
| int c = data[3 * unit_size + i]; | |||
| // n h w c | |||
| // n c h w | |||
| data[3 * unit_size + i] = data[2 * unit_size + i]; | |||
| data[2 * unit_size + i] = data[unit_size + i]; | |||
| data[unit_size + i] = c; | |||
| } | |||
| } | |||
| int NPUPassUtils::MaskDataNHWC2NCHW(int mask) { | |||
| int mask_vec[4]; | |||
| for (int i = 0; i < 4; ++i) { | |||
| mask_vec[i] = (uint32_t)(mask) & (1 << i); | |||
| } | |||
| AssistDataNHWC2NCHW(mask_vec, 1); | |||
| int ret = 0; | |||
| for (int i = 0; i < 4; ++i) { | |||
| if (mask_vec[i]) { | |||
| ret += 1 << i; | |||
| } | |||
| } | |||
| return ret; | |||
| } | |||
| } // namespace mindspore::lite | |||
| @@ -17,9 +17,12 @@ | |||
| #ifndef MINDSPORE_LITE_SRC_RUNTIME_AGENT_NPU_OPTIMIZER_NPU_PASS_UTILS_H_ | |||
| #define MINDSPORE_LITE_SRC_RUNTIME_AGENT_NPU_OPTIMIZER_NPU_PASS_UTILS_H_ | |||
| #include <vector> | |||
| #include <set> | |||
| #include <string> | |||
| #include <unordered_map> | |||
| #include "src/lite_kernel.h" | |||
| namespace mindspore::lite { | |||
| extern std::unordered_map<schema::PrimitiveType, std::set<int>> nodes2const_index; | |||
| class NPUPassUtils { | |||
| public: | |||
| static kernel::LiteKernel *CreateNchw2NhwcKernel(const std::vector<Tensor *> &in_tensors, | |||
| @@ -52,8 +55,11 @@ class NPUPassUtils { | |||
| static bool IsNhwc2Nchw(const kernel::LiteKernel *kernel); | |||
| static bool IsNchw2Nhwc(const kernel::LiteKernel *kernel); | |||
| static kernel::LiteKernel *KernelInputFromKernel(const kernel::LiteKernel *kernel, size_t in_tensor_index); | |||
| static kernel::LiteKernel *KernelInputFromKernel(const kernel::LiteKernel *kernel, Tensor *in_tensor); | |||
| static std::vector<Tensor *> GetNonConstInputs(kernel::LiteKernel *kernel); | |||
| static bool Scale4dCase(const kernel::LiteKernel *kernel); | |||
| static void AssistDataNHWC2NCHW(int *data, size_t unit_size); | |||
| static int MaskDataNHWC2NCHW(int mask); | |||
| }; | |||
| } // namespace mindspore::lite | |||
| #endif // MINDSPORE_LITE_SRC_RUNTIME_AGENT_NPU_OPTIMIZER_NPU_PASS_UTILS_H_ | |||
| @@ -14,7 +14,6 @@ | |||
| * limitations under the License. | |||
| */ | |||
| #include "src/runtime/agent/npu/optimizer/npu_transform_pass.h" | |||
| #include <set> | |||
| #include <vector> | |||
| #include "src/lite_kernel.h" | |||
| #include "src/runtime/agent/npu/npu_manager.h" | |||
| @@ -22,7 +21,7 @@ | |||
| namespace mindspore::lite { | |||
| using kernel::KERNEL_ARCH::kNPU; | |||
| static std::set<mindspore::schema::PrimitiveType> npu_trans_nodes = { | |||
| std::set<mindspore::schema::PrimitiveType> npu_trans_nodes = { | |||
| schema::PrimitiveType_Conv2DFusion, schema::PrimitiveType_Conv2dTransposeFusion, schema::PrimitiveType_Resize, | |||
| schema::PrimitiveType_MaxPoolFusion, schema::PrimitiveType_AvgPoolFusion, schema::PrimitiveType_ScaleFusion}; | |||
| @@ -16,11 +16,14 @@ | |||
| #ifndef MINDSPORE_LITE_SRC_RUNTIME_AGENT_NPU_OPTIMIZER_NPU_TRANSFORM_PASS_H_ | |||
| #define MINDSPORE_LITE_SRC_RUNTIME_AGENT_NPU_OPTIMIZER_NPU_TRANSFORM_PASS_H_ | |||
| #include <set> | |||
| #include <vector> | |||
| #include "src/lite_kernel.h" | |||
| #include "src/runtime/agent/npu/optimizer/npu_base_pass.h" | |||
| namespace mindspore::lite { | |||
| extern std::set<mindspore::schema::PrimitiveType> npu_trans_nodes; | |||
| class NPUTransformPass : public NPUBasePass { | |||
| public: | |||
| int Run() override; | |||
| @@ -31,7 +31,7 @@ int PadNPUKernel::IsSupport(const std::vector<lite::Tensor *> &inputs, const std | |||
| } | |||
| if (inputs.size() >= 2 && inputs[1]->data_c() != nullptr) { | |||
| for (int i = 0; i < inputs[1]->ElementsNum(); i++) { | |||
| paddings_.push_back(static_cast<int *>(inputs[1]->data_c())[i]); | |||
| param_->paddings_[i] = static_cast<int *>(inputs[1]->data_c())[i]; | |||
| } | |||
| } else { | |||
| MS_LOG(WARNING) << "NPU axis is attribute."; | |||
| @@ -50,7 +50,7 @@ int PadNPUKernel::SetNPUInputs(const std::vector<lite::Tensor *> &inputs, const | |||
| int size = static_cast<int>(param_->padding_length / 2); | |||
| ge::TensorDesc padding_tensor_desc(ge::Shape({size, 2}), ge::FORMAT_NCHW, ge::DT_INT32); | |||
| ge::TensorPtr padding_tensor = std::make_shared<hiai::Tensor>(padding_tensor_desc); | |||
| padding_tensor->SetData(reinterpret_cast<uint8_t *>(paddings_.data()), 2 * size * sizeof(int)); | |||
| padding_tensor->SetData(reinterpret_cast<uint8_t *>(param_->paddings_), 2 * size * sizeof(int)); | |||
| hiai_paddings_ = new hiai::op::Const(name_ + "paddings"); | |||
| hiai_paddings_->set_attr_value(padding_tensor); | |||
| @@ -39,7 +39,6 @@ class PadNPUKernel : public NPUKernel { | |||
| private: | |||
| hiai::op::PadV2 *op_ = nullptr; | |||
| PadParameter *param_; | |||
| std::vector<int> paddings_; | |||
| hiai::op::Const *hiai_paddings_ = nullptr; | |||
| hiai::op::Const *hiai_constant_ = nullptr; | |||
| }; | |||
| @@ -77,3 +77,5 @@ ml_video_edit_img_segment_adaptise_pb2tflite.tflite 0.5 2 | |||
| ml_video_edit_imitate_filter.onnx 103 | |||
| hdc_mobilenet_1w_class.onnx 20 | |||
| hdc_age_medium 504 | |||
| posenet_mobilenet_float_075_1_default_1.tflite 395 | |||
| nasnet_mobile.tflite 1 | |||