From: @yangruoqi713 Reviewed-by: @zhang_xue_tong,@zhanghaibo5 Signed-off-by: @zhang_xue_tong tags/v1.2.0-rc1
| @@ -38,10 +38,28 @@ int NPUExecutor::Prepare(const std::vector<kernel::LiteKernel *> &kernels) { | |||
| } | |||
| bool IsSameShapeTensor(Tensor *tensor, std::shared_ptr<hiai::AiTensor> npu_tensor) { | |||
| return tensor->Batch() == npu_tensor->GetTensorDimension().GetNumber() && | |||
| tensor->Channel() == npu_tensor->GetTensorDimension().GetChannel() && | |||
| tensor->Height() == npu_tensor->GetTensorDimension().GetHeight() && | |||
| tensor->Width() == npu_tensor->GetTensorDimension().GetWidth(); | |||
| if (tensor->shape().size() == 4) { | |||
| return tensor->Batch() == npu_tensor->GetTensorDimension().GetNumber() && | |||
| tensor->Channel() == npu_tensor->GetTensorDimension().GetChannel() && | |||
| tensor->Height() == npu_tensor->GetTensorDimension().GetHeight() && | |||
| tensor->Width() == npu_tensor->GetTensorDimension().GetWidth(); | |||
| } | |||
| if (tensor->shape().size() > 4) { | |||
| MS_LOG(ERROR) << "Npu doesn't support input tensor dims greater than 4"; | |||
| return false; | |||
| } | |||
| std::vector<int> npu_shape; | |||
| auto dim = tensor->shape().size(); | |||
| if (dim > 0) { | |||
| npu_shape.push_back(npu_tensor->GetTensorDimension().GetNumber()); | |||
| } | |||
| if (dim > 1) { | |||
| npu_shape.push_back(npu_tensor->GetTensorDimension().GetChannel()); | |||
| } | |||
| if (dim > 2) { | |||
| npu_shape.push_back(npu_tensor->GetTensorDimension().GetWidth()); | |||
| } | |||
| return npu_shape == tensor->shape(); | |||
| } | |||
| int NPUExecutor::Run(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors, | |||
| @@ -49,10 +67,11 @@ int NPUExecutor::Run(const std::vector<Tensor *> &in_tensors, const std::vector< | |||
| const std::vector<kernel::LiteKernel *> &kernels, Allocator *allocator, | |||
| const KernelCallBack &before, const KernelCallBack &after) { | |||
| hiai::AiContext context; | |||
| std::vector<bool> inputs_visited(in_tensors.size(), false); | |||
| for (int i = 0; i < npu_input_tensors_.size(); ++i) { | |||
| int index = 0; | |||
| for (; index < in_tensors.size(); index++) { | |||
| if (IsSameShapeTensor(in_tensors[index], npu_input_tensors_[i])) { | |||
| if (!inputs_visited[index] && IsSameShapeTensor(in_tensors[index], npu_input_tensors_[i])) { | |||
| void *data = in_tensors[index]->data_c(); | |||
| if (data == nullptr) { | |||
| MS_LOG(ERROR) << model_name_ << " Inputs data is nullptr"; | |||
| @@ -60,6 +79,7 @@ int NPUExecutor::Run(const std::vector<Tensor *> &in_tensors, const std::vector< | |||
| } | |||
| memcpy(npu_input_tensors_[i]->GetBuffer(), data, in_tensors[index]->Size()); | |||
| inputs_visited[index] = true; | |||
| in_tensors[index]->set_ref_count(in_tensors[index]->ref_count() - 1); | |||
| if (in_tensors[index]->ref_count() <= 0) { | |||
| in_tensors[index]->FreeData(); | |||
| @@ -85,33 +105,14 @@ int NPUExecutor::Run(const std::vector<Tensor *> &in_tensors, const std::vector< | |||
| return RET_ERROR; | |||
| } | |||
| // For the output kernel of the entire model, and the format is nchw, the output tensor needs to be nchw TO nhwc. | |||
| std::vector<Tensor *> trans_tensors; | |||
| for (auto kernel : out_kernels) { | |||
| if (kernel->out_kernels().empty() && npu_trans_nodes.find(kernel->Type()) != npu_trans_nodes.end()) { | |||
| for (int i = 0; i < kernel->out_tensors().size(); ++i) { | |||
| trans_tensors.push_back(kernel->out_tensors()[i]); | |||
| } | |||
| } | |||
| } | |||
| for (int i = 0; i < npu_output_tensors_.size(); ++i) { | |||
| void *data = out_tensors[i]->MutableData(); | |||
| if (data == nullptr) { | |||
| MS_LOG(ERROR) << "Malloc buffer failed."; | |||
| return RET_ERROR; | |||
| } | |||
| if (std::find(trans_tensors.begin(), trans_tensors.end(), out_tensors[i]) != trans_tensors.end()) { | |||
| // Change data&tensor shape nc->nh | |||
| PackNCHWToNHWCFp32(npu_output_tensors_[i]->GetBuffer(), data, | |||
| npu_output_tensors_[i]->GetTensorDimension().GetNumber(), | |||
| npu_output_tensors_[i]->GetTensorDimension().GetWidth() * | |||
| npu_output_tensors_[i]->GetTensorDimension().GetHeight(), | |||
| npu_output_tensors_[i]->GetTensorDimension().GetChannel()); | |||
| } else { | |||
| memcpy(data, npu_output_tensors_[i]->GetBuffer(), npu_output_tensors_[i]->GetSize()); | |||
| out_tensors[i]->ResetRefCount(); | |||
| } | |||
| memcpy(data, npu_output_tensors_[i]->GetBuffer(), npu_output_tensors_[i]->GetSize()); | |||
| out_tensors[i]->ResetRefCount(); | |||
| } | |||
| return RET_OK; | |||
| } | |||
| @@ -29,9 +29,8 @@ bool CheckFusion(kernel::LiteKernel *kernel) { | |||
| return false; | |||
| } | |||
| auto post_flag = | |||
| std::all_of(kernel->out_kernels().begin(), kernel->out_kernels().end(), [](const kernel::LiteKernel *out_kernel) { | |||
| return NPUPassUtils::IsNhwc2Nchw(out_kernel) && (!out_kernel->out_kernels().empty()); | |||
| }); | |||
| std::all_of(kernel->out_kernels().begin(), kernel->out_kernels().end(), | |||
| [](const kernel::LiteKernel *out_kernel) { return NPUPassUtils::IsNhwc2Nchw(out_kernel); }); | |||
| return post_flag; | |||
| } | |||
| @@ -41,15 +40,11 @@ bool CheckFormatFusion(kernel::LiteKernel *kernel) { | |||
| } | |||
| if (NPUPassUtils::IsNhwc2Nchw(kernel)) { | |||
| return std::all_of(kernel->out_kernels().begin(), kernel->out_kernels().end(), | |||
| [](const kernel::LiteKernel *kernel) { | |||
| return NPUPassUtils::IsNchw2Nhwc(kernel) && (!kernel->out_kernels().empty()); | |||
| }); | |||
| [](const kernel::LiteKernel *kernel) { return NPUPassUtils::IsNchw2Nhwc(kernel); }); | |||
| } | |||
| if (NPUPassUtils::IsNchw2Nhwc(kernel)) { | |||
| return std::all_of(kernel->out_kernels().begin(), kernel->out_kernels().end(), | |||
| [](const kernel::LiteKernel *kernel) { | |||
| return NPUPassUtils::IsNhwc2Nchw(kernel) && (!kernel->out_kernels().empty()); | |||
| }); | |||
| [](const kernel::LiteKernel *kernel) { return NPUPassUtils::IsNhwc2Nchw(kernel); }); | |||
| } | |||
| return false; | |||
| } | |||
| @@ -92,32 +87,32 @@ void NPUFusionPass::UpdatePreKernels(kernel::LiteKernel *cur_kernel) { | |||
| } | |||
| void NPUFusionPass::UpdatePostKernels(kernel::LiteKernel *cur_kernel) { | |||
| auto cur_out_kernels = cur_kernel->out_kernels(); | |||
| for (auto out_kernel : cur_kernel->out_kernels()) { | |||
| // graph out kernel | |||
| if (out_kernel->out_kernels().empty()) { | |||
| continue; | |||
| } | |||
| auto post_kernel = out_kernel->out_kernels()[0]; | |||
| auto post_in_kernels = post_kernel->in_kernels(); | |||
| for (size_t i = 0; i < post_in_kernels.size(); i++) { | |||
| if (post_in_kernels[i] == out_kernel) { | |||
| post_in_kernels[i] = cur_kernel; | |||
| break; | |||
| cur_out_kernels.erase(find(cur_out_kernels.begin(), cur_out_kernels.end(), out_kernel)); | |||
| } else { | |||
| auto post_kernel = out_kernel->out_kernels()[0]; | |||
| auto post_in_kernels = post_kernel->in_kernels(); | |||
| for (size_t i = 0; i < post_in_kernels.size(); i++) { | |||
| if (post_in_kernels[i] == out_kernel) { | |||
| post_in_kernels[i] = cur_kernel; | |||
| break; | |||
| } | |||
| } | |||
| } | |||
| post_kernel->set_in_kernels(post_in_kernels); | |||
| post_kernel->set_in_kernels(post_in_kernels); | |||
| auto cur_out_kernels = cur_kernel->out_kernels(); | |||
| for (size_t i = 0; i < cur_out_kernels.size(); i++) { | |||
| if (cur_out_kernels[i] == out_kernel) { | |||
| cur_out_kernels[i] = post_kernel; | |||
| break; | |||
| for (size_t i = 0; i < cur_out_kernels.size(); i++) { | |||
| if (cur_out_kernels[i] == out_kernel) { | |||
| cur_out_kernels[i] = post_kernel; | |||
| break; | |||
| } | |||
| } | |||
| } | |||
| cur_kernel->set_out_kernels(cur_out_kernels); | |||
| RemoveAndFreeKernel(out_kernel); | |||
| } | |||
| cur_kernel->set_out_kernels(cur_out_kernels); | |||
| } | |||
| void UpdatePreTensors(kernel::LiteKernel *cur_kernel) { | |||
| @@ -145,6 +140,9 @@ void UpdatePostTensors(kernel::LiteKernel *cur_kernel) { | |||
| auto tensor = cur_kernel->out_tensors()[0]; | |||
| for (auto out_kernel : cur_kernel->out_kernels()) { | |||
| auto out_tensor = out_kernel->out_tensors()[0]; | |||
| if (out_kernel->out_kernels().empty()) { | |||
| cur_kernel->set_out_tensors({out_kernel->out_tensors()[0]}); | |||
| } | |||
| for (auto post_kernel : out_kernel->out_kernels()) { | |||
| auto tensors_vec = post_kernel->in_tensors(); | |||
| for (int i = 0; i < tensors_vec.size(); i++) { | |||
| @@ -197,6 +195,10 @@ int NPUFusionPass::FormatFusion(kernel::LiteKernel *kernel) { | |||
| auto in_tensor = kernel->in_tensors()[0]; | |||
| std::vector<kernel::LiteKernel *> pre_insert_kernels; | |||
| for (const auto &trans_kernel : kernel->out_kernels()) { | |||
| if (trans_kernel->out_kernels().empty()) { | |||
| // kernel is a trans kernel, it's input kernel num and input tensor num must be 1 | |||
| kernel->in_kernels()[0]->set_out_tensors({trans_kernel->out_tensors()[0]}); | |||
| } | |||
| for (const auto &post_kernel : trans_kernel->out_kernels()) { | |||
| // update tensor | |||
| auto tensors_vec = post_kernel->in_tensors(); | |||
| @@ -218,8 +220,8 @@ int NPUFusionPass::FormatFusion(kernel::LiteKernel *kernel) { | |||
| } | |||
| post_kernel->set_in_kernels(post_in_kernels); | |||
| pre_insert_kernels.push_back(post_kernel); | |||
| RemoveAndFreeKernel(trans_kernel); | |||
| } | |||
| RemoveAndFreeKernel(trans_kernel); | |||
| } | |||
| pre_kernel->set_out_kernels(pre_insert_kernels); | |||
| RemoveAndFreeKernel(kernel); | |||
| @@ -42,7 +42,7 @@ int PadNPUKernel::SetNPUInputs(const std::vector<lite::Tensor *> &inputs, const | |||
| int size = static_cast<int>(pad_->GetPaddings().size() / 2); | |||
| ge::TensorDesc padding_tensor_desc(ge::Shape({size, 2}), ge::FORMAT_NCHW, ge::DT_INT32); | |||
| ge::TensorPtr padding_tensor = std::make_shared<hiai::Tensor>(padding_tensor_desc); | |||
| padding_tensor->SetData(reinterpret_cast<uint8_t *>(pad_->GetPaddings().data()), size * sizeof(int)); | |||
| padding_tensor->SetData(reinterpret_cast<uint8_t *>(pad_->GetPaddings().data()), 2 * size * sizeof(int)); | |||
| auto paddings = new hiai::op::Const(name_ + "paddings"); | |||
| paddings->set_attr_value(padding_tensor); | |||
| @@ -24,6 +24,10 @@ using mindspore::schema::PrimitiveType_SoftMax; | |||
| namespace mindspore::kernel { | |||
| int SoftmaxNPUKernel::IsSupport(const std::vector<lite::Tensor *> &inputs, const std::vector<lite::Tensor *> &outputs, | |||
| OpParameter *opParameter) { | |||
| if (inputs[0]->shape().size() > 4) { | |||
| MS_LOG(ERROR) << "Npu softmax only supports tensor'dim less than 4."; | |||
| return RET_ERROR; | |||
| } | |||
| return RET_OK; | |||
| } | |||
| @@ -67,3 +67,4 @@ PoseNet_dla_17_x512 | |||
| ml_location_scene_division | |||
| ml_tabel_recog | |||
| ml_text_division | |||
| 6c_seg_nomean_20200610 | |||
| @@ -1,4 +1,5 @@ | |||
| mobilenet_v2_1.0_224.tflite 2.5 | |||
| squeezenet.tflite 2.5 | |||
| inception_v3.tflite 1 | |||
| 6c_seg_nomean_20200610 1.5 | |||
| porseg_tmp.onnx 1 2 | |||