| @@ -0,0 +1,46 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "src/runtime/kernel/arm/fp16/common_fp16.h" | |||
| #include "src/runtime/kernel/arm/nnacl/fp16/cast_fp16.h" | |||
| namespace mindspore::kernel { | |||
| float16_t *ConvertInputFp32toFp16(lite::tensor::Tensor *input, const lite::Context *ctx) { | |||
| float16_t *fp16_data = nullptr; | |||
| auto data_type = input->data_type(); | |||
| if (data_type == kNumberTypeFloat32) { | |||
| auto ele_num = input->ElementsNum(); | |||
| fp16_data = reinterpret_cast<float16_t *>(ctx->allocator->Malloc(ele_num * sizeof(float16_t))); | |||
| auto ori_data = reinterpret_cast<float *>(input->Data()); | |||
| Float32ToFloat16(ori_data, fp16_data, ele_num); | |||
| } else { | |||
| fp16_data = reinterpret_cast<float16_t *>(input->Data()); | |||
| } | |||
| return fp16_data; | |||
| } | |||
| float16_t *MallocOutputFp16(lite::tensor::Tensor *output, const lite::Context *ctx) { | |||
| float16_t *fp16_data = nullptr; | |||
| auto data_type = output->data_type(); | |||
| if (data_type == kNumberTypeFloat32) { | |||
| auto ele_num = output->ElementsNum(); | |||
| fp16_data = reinterpret_cast<float16_t *>(ctx->allocator->Malloc(ele_num * sizeof(float16_t))); | |||
| } else { | |||
| fp16_data = reinterpret_cast<float16_t *>(output->Data()); | |||
| } | |||
| return fp16_data; | |||
| } | |||
| } // namespace mindspore::kernel | |||
| @@ -0,0 +1,28 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_COMMON_FP16_H_ | |||
| #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_COMMON_FP16_H_ | |||
| #include "src/lite_kernel.h" | |||
| namespace mindspore::kernel { | |||
| float16_t *ConvertInputFp32toFp16(lite::tensor::Tensor *input, const lite::Context *ctx); | |||
| float16_t *MallocOutputFp16(lite::tensor::Tensor *output, const lite::Context *ctx); | |||
| } // namespace mindspore::kernel | |||
| #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_COMMON_FP16_H_ | |||
| @@ -98,28 +98,6 @@ int Convolution1x1FP16CPUKernel::InitWeightBias() { | |||
| return RET_OK; | |||
| } | |||
| int Convolution1x1FP16CPUKernel::InitBuffer() { | |||
| /*=============================fp16_input_============================*/ | |||
| size_t fp16_input_size = conv_param_->input_channel_ * conv_param_->input_batch_ * conv_param_->input_h_ * | |||
| conv_param_->input_w_ * sizeof(float16_t); | |||
| fp16_input_ = reinterpret_cast<float16_t *>(malloc(fp16_input_size)); | |||
| if (fp16_input_ == nullptr) { | |||
| MS_LOG(ERROR) << "malloc fp16_input_ failed."; | |||
| return RET_ERROR; | |||
| } | |||
| memset(fp16_input_, 0, fp16_input_size); | |||
| /*=============================fp16_out_============================*/ | |||
| size_t fp16_output_size = conv_param_->output_channel_ * conv_param_->output_batch_ * conv_param_->output_h_ * | |||
| conv_param_->output_w_ * sizeof(float16_t); | |||
| fp16_out_ = reinterpret_cast<float16_t *>(malloc(fp16_output_size)); | |||
| if (fp16_out_ == nullptr) { | |||
| MS_LOG(ERROR) << "malloc fp16_out_ failed."; | |||
| return RET_ERROR; | |||
| } | |||
| return RET_OK; | |||
| } | |||
| int Convolution1x1FP16CPUKernel::Init() { | |||
| auto ret = ConvolutionBaseCPUKernel::Init(); | |||
| if (ret != RET_OK) { | |||
| @@ -136,11 +114,6 @@ int Convolution1x1FP16CPUKernel::Init() { | |||
| MS_LOG(ERROR) << "Init conv1x1 param failed."; | |||
| return ret; | |||
| } | |||
| ret = InitBuffer(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Init buffer failed."; | |||
| return ret; | |||
| } | |||
| ret = InitWeightBias(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Init weight bias failed."; | |||
| @@ -150,12 +123,6 @@ int Convolution1x1FP16CPUKernel::Init() { | |||
| } | |||
| int Convolution1x1FP16CPUKernel::ReSize() { | |||
| if (fp16_out_ != nullptr) { | |||
| free(fp16_out_); | |||
| } | |||
| if (fp16_input_ != nullptr) { | |||
| free(fp16_input_); | |||
| } | |||
| if (fp16_weight_ != nullptr) { | |||
| free(fp16_weight_); | |||
| } | |||
| @@ -181,12 +148,6 @@ int Convolution1x1FP16CPUKernel::ReSize() { | |||
| MS_LOG(ERROR) << "Init conv1x1 param failed."; | |||
| return ret; | |||
| } | |||
| ret = InitBuffer(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Init buffer failed."; | |||
| return ret; | |||
| } | |||
| return RET_OK; | |||
| } | |||
| @@ -253,6 +214,7 @@ int Convolution1x1FP16CPUKernel::Run() { | |||
| } | |||
| ConvolutionBaseFP16CPUKernel::IfCastOutput(); | |||
| ConvolutionBaseFP16CPUKernel::FreeTmpBuffer(); | |||
| return RET_OK; | |||
| } | |||
| } // namespace mindspore::kernel | |||
| @@ -35,15 +35,9 @@ class Convolution1x1FP16CPUKernel : public ConvolutionBaseFP16CPUKernel { | |||
| matmul_param_ = new MatMulParameter(); | |||
| } | |||
| ~Convolution1x1FP16CPUKernel() override { | |||
| if (fp16_input_ != nullptr) { | |||
| free(fp16_input_); | |||
| } | |||
| if (fp16_weight_ != nullptr) { | |||
| free(fp16_weight_); | |||
| } | |||
| if (fp16_out_ != nullptr) { | |||
| free(fp16_out_); | |||
| } | |||
| if (input_ptr_ != nullptr) { | |||
| free(input_ptr_); | |||
| } | |||
| @@ -57,7 +51,6 @@ class Convolution1x1FP16CPUKernel : public ConvolutionBaseFP16CPUKernel { | |||
| int ReSize() override; | |||
| int Run() override; | |||
| int RunImpl(int task_id); | |||
| int InitBuffer(); | |||
| int InitConv1x1Param(); | |||
| int InitMatmulParam(); | |||
| int InitWeightBias(); | |||
| @@ -132,16 +132,6 @@ int Convolution3x3FP16CPUKernel::InitTmpBuffer() { | |||
| } | |||
| memset(tmp_out_, 0, tmp_out_size); | |||
| /*=============================fp16_input_============================*/ | |||
| size_t fp16_input_size = conv_param_->input_channel_ * conv_param_->input_batch_ * conv_param_->input_h_ * | |||
| conv_param_->input_w_ * sizeof(float16_t); | |||
| fp16_input_ = reinterpret_cast<float16_t *>(malloc(fp16_input_size)); | |||
| if (fp16_input_ == nullptr) { | |||
| MS_LOG(ERROR) << "malloc fp16_input_ failed."; | |||
| return RET_ERROR; | |||
| } | |||
| memset(fp16_input_, 0, fp16_input_size); | |||
| /*=============================nhwc4_input_============================*/ | |||
| size_t nhwc4_input_size = | |||
| iC4 * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float16_t); | |||
| @@ -152,14 +142,6 @@ int Convolution3x3FP16CPUKernel::InitTmpBuffer() { | |||
| } | |||
| memset(nhwc4_input_, 0, nhwc4_input_size); | |||
| /*=============================fp16_out_============================*/ | |||
| size_t fp16_output_size = conv_param_->output_channel_ * conv_param_->output_batch_ * conv_param_->output_h_ * | |||
| conv_param_->output_w_ * sizeof(float16_t); | |||
| fp16_out_ = reinterpret_cast<float16_t *>(malloc(fp16_output_size)); | |||
| if (fp16_out_ == nullptr) { | |||
| MS_LOG(ERROR) << "malloc fp16_out_ failed."; | |||
| return RET_ERROR; | |||
| } | |||
| return RET_OK; | |||
| } | |||
| @@ -207,12 +189,6 @@ int Convolution3x3FP16CPUKernel::ReSize() { | |||
| if (tmp_out_ != nullptr) { | |||
| free(tmp_out_); | |||
| } | |||
| if (fp16_out_ != nullptr) { | |||
| free(fp16_out_); | |||
| } | |||
| if (fp16_input_ != nullptr) { | |||
| free(fp16_input_); | |||
| } | |||
| if (nhwc4_input_ != nullptr) { | |||
| free(nhwc4_input_); | |||
| } | |||
| @@ -304,6 +280,7 @@ int Convolution3x3FP16CPUKernel::Run() { | |||
| } | |||
| ConvolutionBaseFP16CPUKernel::IfCastOutput(); | |||
| ConvolutionBaseFP16CPUKernel::FreeTmpBuffer(); | |||
| return RET_OK; | |||
| } | |||
| } // namespace mindspore::kernel | |||
| @@ -31,15 +31,9 @@ class Convolution3x3FP16CPUKernel : public ConvolutionBaseFP16CPUKernel { | |||
| const lite::Primitive *primitive) | |||
| : ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive) {} | |||
| ~Convolution3x3FP16CPUKernel() override { | |||
| if (fp16_input_ != nullptr) { | |||
| free(fp16_input_); | |||
| } | |||
| if (fp16_weight_ != nullptr) { | |||
| free(fp16_weight_); | |||
| } | |||
| if (fp16_out_ != nullptr) { | |||
| free(fp16_out_); | |||
| } | |||
| if (transformed_filter_addr_ != nullptr) { | |||
| free(transformed_filter_addr_); | |||
| } | |||
| @@ -16,6 +16,7 @@ | |||
| #include "src/runtime/kernel/arm/fp16/convolution_base_fp16.h" | |||
| #include "src/runtime/kernel/arm/nnacl/fp16/cast_fp16.h" | |||
| #include "src/runtime/kernel/arm/fp16/common_fp16.h" | |||
| #include "schema/model_generated.h" | |||
| #include "src/kernel_factory.h" | |||
| #include "include/errorcode.h" | |||
| @@ -25,28 +26,17 @@ namespace mindspore::kernel { | |||
| int ConvolutionBaseFP16CPUKernel::GetExecuteTensor() { | |||
| // ===================input====================// | |||
| auto input_tensor = in_tensors_.at(kInputIndex); | |||
| auto input_data_type = input_tensor->data_type(); | |||
| MS_ASSERT(input_data_type == kNumberTypeFloat32 || input_data_type == kNumberTypeFloat16); | |||
| if (input_data_type == kNumberTypeFloat32) { | |||
| auto input_ele_num = input_tensor->ElementsNum(); | |||
| auto ori_input_data = reinterpret_cast<float *>(input_tensor->Data()); | |||
| Float32ToFloat16(ori_input_data, fp16_input_, input_ele_num); | |||
| execute_input_ = fp16_input_; | |||
| } else { | |||
| auto ori_input_data = reinterpret_cast<float16_t *>(input_tensor->Data()); | |||
| execute_input_ = ori_input_data; | |||
| } | |||
| in_data_type_ = input_tensor->data_type(); | |||
| MS_ASSERT(in_data_type_ == kNumberTypeFloat32 || in_data_type_ == kNumberTypeFloat16); | |||
| execute_input_ = ConvertInputFp32toFp16(input_tensor, context_); | |||
| // ==================output====================// | |||
| auto out_tensor = out_tensors_.at(kOutputIndex); | |||
| auto out_data_type = out_tensor->data_type(); | |||
| MS_ASSERT(out_data_type == kNumberTypeFloat32 || out_data_type == kNumberTypeFloat16); | |||
| out_data_type_ = out_data_type; | |||
| if (out_data_type == kNumberTypeFloat32) { | |||
| execute_output_ = fp16_out_; | |||
| } else { | |||
| auto out_ptr = reinterpret_cast<float16_t *>(out_tensor->Data()); | |||
| execute_output_ = out_ptr; | |||
| } | |||
| out_data_type_ = out_tensor->data_type(); | |||
| MS_ASSERT(out_data_type_ == kNumberTypeFloat32 || out_data_type_ == kNumberTypeFloat16); | |||
| execute_output_ = MallocOutputFp16(out_tensor, context_); | |||
| return RET_OK; | |||
| } | |||
| @@ -79,7 +69,16 @@ void ConvolutionBaseFP16CPUKernel::IfCastOutput() { | |||
| auto out_tensor = out_tensors_.at(kOutputIndex); | |||
| auto out_ele_num = out_tensor->ElementsNum(); | |||
| auto output_addr = reinterpret_cast<float *>(out_tensor->Data()); | |||
| Float16ToFloat32(fp16_out_, output_addr, out_ele_num); | |||
| Float16ToFloat32(execute_output_, output_addr, out_ele_num); | |||
| } | |||
| } | |||
| void ConvolutionBaseFP16CPUKernel::FreeTmpBuffer() { | |||
| if (in_data_type_ == kNumberTypeFloat32) { | |||
| context_->allocator->Free(execute_input_); | |||
| } | |||
| if (out_data_type_ == kNumberTypeFloat32) { | |||
| context_->allocator->Free(execute_output_); | |||
| } | |||
| } | |||
| @@ -39,14 +39,14 @@ class ConvolutionBaseFP16CPUKernel : public ConvolutionBaseCPUKernel { | |||
| virtual int GetExecuteTensor(); | |||
| virtual int GetExecuteFilter(); | |||
| virtual void IfCastOutput(); | |||
| void FreeTmpBuffer(); | |||
| protected: | |||
| float16_t *fp16_input_ = nullptr; | |||
| float16_t *fp16_weight_ = nullptr; | |||
| float16_t *fp16_out_ = nullptr; | |||
| float16_t *execute_input_; | |||
| float16_t *execute_input_; // ctx allocator malloc and free | |||
| float16_t *execute_weight_; | |||
| float16_t *execute_output_; | |||
| float16_t *execute_output_; // ctx allocator malloc and free | |||
| TypeId in_data_type_; | |||
| TypeId out_data_type_; | |||
| }; | |||
| } // namespace mindspore::kernel | |||
| @@ -173,22 +173,13 @@ int ConvolutionDepthwiseFp16CPUKernel::Run() { | |||
| return RET_ERROR; | |||
| } | |||
| auto input_tensor = in_tensors_.at(kInputIndex); | |||
| float16_t *input_addr; | |||
| if (input_tensor->data_type() == kNumberTypeFloat32) { | |||
| input_addr = | |||
| reinterpret_cast<float16_t *>(context_->allocator->Malloc(input_tensor->ElementsNum() * sizeof(float16_t))); | |||
| if (input_addr == nullptr) { | |||
| MS_LOG(ERROR) << "Malloc buffer failed."; | |||
| return RET_ERROR; | |||
| } | |||
| Float32ToFloat16(reinterpret_cast<float *>(input_tensor->Data()), input_addr, input_tensor->ElementsNum()); | |||
| } else { | |||
| input_addr = reinterpret_cast<float16_t *>(input_tensor->Data()); | |||
| ret = ConvolutionBaseFP16CPUKernel::GetExecuteTensor(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Get Execute tensor failed."; | |||
| return ret; | |||
| } | |||
| // pack input: to nhwc8 | |||
| PackNHWCToNHWC8Fp16(input_addr, packed_input_, conv_param_->input_batch_, | |||
| PackNHWCToNHWC8Fp16(execute_input_, packed_input_, conv_param_->input_batch_, | |||
| conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_); | |||
| ret = LiteBackendParallelLaunch(ConvDwFp16Run, this, conv_param_->thread_num_); | |||
| @@ -197,13 +188,11 @@ int ConvolutionDepthwiseFp16CPUKernel::Run() { | |||
| return RET_ERROR; | |||
| } | |||
| auto output_addr = reinterpret_cast<float16_t *>(out_tensors_.at(kOutputIndex)->Data()); | |||
| PackNHWC8ToNHWCFp16(packed_output_, output_addr, conv_param_->output_batch_, | |||
| PackNHWC8ToNHWCFp16(packed_output_, execute_output_, conv_param_->output_batch_, | |||
| conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_); | |||
| if (input_tensor->data_type() == kNumberTypeFloat32) { | |||
| context_->allocator->Free(input_addr); | |||
| } | |||
| ConvolutionBaseFP16CPUKernel::IfCastOutput(); | |||
| ConvolutionBaseFP16CPUKernel::FreeTmpBuffer(); | |||
| return RET_OK; | |||
| } | |||
| @@ -19,7 +19,7 @@ | |||
| #include <vector> | |||
| #include "src/lite_kernel.h" | |||
| #include "src/runtime/kernel/arm/base/convolution_base.h" | |||
| #include "src/runtime/kernel/arm/fp16/convolution_base_fp16.h" | |||
| #include "src/runtime/kernel/arm/nnacl/fp16/conv_depthwise_fp16.h" | |||
| #ifdef __cplusplus | |||
| @@ -34,12 +34,12 @@ void ConvDwC8Fp16(float16_t *output_data, const float16_t *input_data, const flo | |||
| namespace mindspore::kernel { | |||
| class ConvolutionDepthwiseFp16CPUKernel : public ConvolutionBaseCPUKernel { | |||
| class ConvolutionDepthwiseFp16CPUKernel : public ConvolutionBaseFP16CPUKernel { | |||
| public: | |||
| ConvolutionDepthwiseFp16CPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs, | |||
| const std::vector<lite::tensor::Tensor *> &outputs, const Context *ctx, | |||
| const lite::Primitive *primitive) | |||
| : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {} | |||
| : ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive) {} | |||
| ~ConvolutionDepthwiseFp16CPUKernel() override; | |||
| int Init() override; | |||
| @@ -103,15 +103,6 @@ int ConvolutionFP16CPUKernel::InitTmpBuffer() { | |||
| } | |||
| memset(packed_input_, 0, in_batch * packed_input_size * sizeof(float16_t)); | |||
| /*=============================fp16_input_============================*/ | |||
| size_t fp16_input_size = | |||
| in_channel * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float16_t); | |||
| fp16_input_ = reinterpret_cast<float16_t *>(malloc(fp16_input_size)); | |||
| if (fp16_input_ == nullptr) { | |||
| MS_LOG(ERROR) << "malloc fp16_input_ failed."; | |||
| return RET_ERROR; | |||
| } | |||
| /*=============================nhwc4_input_============================*/ | |||
| size_t nhwc4_input_size = channel_block * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ * | |||
| conv_param_->input_w_ * sizeof(float16_t); | |||
| @@ -129,14 +120,6 @@ int ConvolutionFP16CPUKernel::InitTmpBuffer() { | |||
| return RET_ERROR; | |||
| } | |||
| /*=============================fp16_out_============================*/ | |||
| size_t fp16_output_size = | |||
| out_channel * conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * sizeof(float16_t); | |||
| fp16_out_ = reinterpret_cast<float16_t *>(malloc(fp16_output_size)); | |||
| if (fp16_out_ == nullptr) { | |||
| MS_LOG(ERROR) << "malloc fp16_out_ failed."; | |||
| return RET_ERROR; | |||
| } | |||
| return RET_OK; | |||
| } | |||
| @@ -181,12 +164,6 @@ int ConvolutionFP16CPUKernel::ReSize() { | |||
| if (nhwc4_input_ != nullptr) { | |||
| free(nhwc4_input_); | |||
| } | |||
| if (fp16_input_ != nullptr) { | |||
| free(fp16_input_); | |||
| } | |||
| if (fp16_out_ != nullptr) { | |||
| free(fp16_out_); | |||
| } | |||
| auto ret = ConvolutionBaseCPUKernel::Init(); | |||
| if (ret != RET_OK) { | |||
| @@ -242,6 +219,7 @@ int ConvolutionFP16CPUKernel::Run() { | |||
| return RET_ERROR; | |||
| } | |||
| ConvolutionBaseFP16CPUKernel::FreeTmpBuffer(); | |||
| ConvolutionBaseFP16CPUKernel::IfCastOutput(); | |||
| return RET_OK; | |||
| } | |||
| @@ -30,15 +30,9 @@ class ConvolutionFP16CPUKernel : public ConvolutionBaseFP16CPUKernel { | |||
| const lite::Primitive *primitive) | |||
| : ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive) {} | |||
| ~ConvolutionFP16CPUKernel() override { | |||
| if (fp16_input_ != nullptr) { | |||
| free(fp16_input_); | |||
| } | |||
| if (fp16_weight_ != nullptr) { | |||
| free(fp16_weight_); | |||
| } | |||
| if (fp16_out_ != nullptr) { | |||
| free(fp16_out_); | |||
| } | |||
| if (packed_input_ != nullptr) { | |||
| free(packed_input_); | |||
| } | |||
| @@ -106,15 +106,6 @@ int ConvolutionSWFP16CPUKernel::InitTmpBuffer() { | |||
| int channel_block = UP_DIV(in_channel, C4NUM); | |||
| int oc4 = UP_DIV(out_channel, C4NUM); | |||
| /*=============================fp16_input_============================*/ | |||
| size_t fp16_input_size = | |||
| in_channel * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float16_t); | |||
| fp16_input_ = reinterpret_cast<float16_t *>(malloc(fp16_input_size)); | |||
| if (fp16_input_ == nullptr) { | |||
| MS_LOG(ERROR) << "malloc fp16_input_ failed."; | |||
| return RET_ERROR; | |||
| } | |||
| /*=============================nhwc4_input_============================*/ | |||
| size_t nhwc4_input_size = channel_block * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ * | |||
| conv_param_->input_w_ * sizeof(float16_t); | |||
| @@ -133,14 +124,6 @@ int ConvolutionSWFP16CPUKernel::InitTmpBuffer() { | |||
| return RET_ERROR; | |||
| } | |||
| /*=============================fp16_out_============================*/ | |||
| size_t fp16_output_size = | |||
| out_channel * conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * sizeof(float16_t); | |||
| fp16_out_ = reinterpret_cast<float16_t *>(malloc(fp16_output_size)); | |||
| if (fp16_out_ == nullptr) { | |||
| MS_LOG(ERROR) << "malloc fp16_out_ failed."; | |||
| return RET_ERROR; | |||
| } | |||
| return RET_OK; | |||
| } | |||
| @@ -186,12 +169,6 @@ int ConvolutionSWFP16CPUKernel::ReSize() { | |||
| if (nhwc4_input_ != nullptr) { | |||
| free(nhwc4_input_); | |||
| } | |||
| if (fp16_input_ != nullptr) { | |||
| free(fp16_input_); | |||
| } | |||
| if (fp16_out_ != nullptr) { | |||
| free(fp16_out_); | |||
| } | |||
| delete slidingWindow_param_; | |||
| auto ret = ConvolutionBaseCPUKernel::Init(); | |||
| @@ -258,6 +235,7 @@ int ConvolutionSWFP16CPUKernel::Run() { | |||
| conv_param_->output_channel_); | |||
| } | |||
| ConvolutionBaseFP16CPUKernel::IfCastOutput(); | |||
| ConvolutionBaseFP16CPUKernel::FreeTmpBuffer(); | |||
| return RET_OK; | |||
| } | |||
| } // namespace mindspore::kernel | |||
| @@ -29,15 +29,9 @@ class ConvolutionSWFP16CPUKernel : public ConvolutionBaseFP16CPUKernel { | |||
| const lite::Primitive *primitive) | |||
| : ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive) {} | |||
| ~ConvolutionSWFP16CPUKernel() override { | |||
| if (fp16_input_ != nullptr) { | |||
| free(fp16_input_); | |||
| } | |||
| if (fp16_weight_ != nullptr) { | |||
| free(fp16_weight_); | |||
| } | |||
| if (fp16_out_ != nullptr) { | |||
| free(fp16_out_); | |||
| } | |||
| if (packed_weight_ != nullptr) { | |||
| free(packed_weight_); | |||
| } | |||
| @@ -187,15 +187,6 @@ int ConvolutionWinogradFP16CPUKernel::InitTmpBuffer() { | |||
| int ic4 = UP_DIV(channel_in, C4NUM); | |||
| int oc8 = UP_DIV(channel_out, C8NUM); | |||
| /*=============================fp16_input_============================*/ | |||
| size_t fp16_input_size = conv_param_->input_channel_ * conv_param_->input_batch_ * conv_param_->input_h_ * | |||
| conv_param_->input_w_ * sizeof(float16_t); | |||
| fp16_input_ = reinterpret_cast<float16_t *>(malloc(fp16_input_size)); | |||
| if (fp16_input_ == nullptr) { | |||
| MS_LOG(ERROR) << "malloc fp16_input_ failed."; | |||
| return RET_ERROR; | |||
| } | |||
| /*=============================trans_input_============================*/ | |||
| size_t tile_buffer_size = thread_count_ * cal_num * input_unit_ * input_unit_ * ic4 * C4NUM * sizeof(float16_t); | |||
| trans_input_ = reinterpret_cast<float16_t *>(malloc(tile_buffer_size)); | |||
| @@ -222,14 +213,6 @@ int ConvolutionWinogradFP16CPUKernel::InitTmpBuffer() { | |||
| MS_LOG(ERROR) << "malloc tmp_out_data_ failed."; | |||
| return RET_ERROR; | |||
| } | |||
| /*=============================fp16_out_============================*/ | |||
| size_t fp16_output_size = conv_param_->output_channel_ * conv_param_->output_batch_ * conv_param_->output_h_ * | |||
| conv_param_->output_w_ * sizeof(float16_t); | |||
| fp16_out_ = reinterpret_cast<float16_t *>(malloc(fp16_output_size)); | |||
| if (fp16_out_ == nullptr) { | |||
| MS_LOG(ERROR) << "malloc fp16_out_ failed."; | |||
| return RET_ERROR; | |||
| } | |||
| /*=============================tmp_data_============================*/ | |||
| tmp_data_ = | |||
| @@ -327,12 +310,6 @@ int ConvolutionWinogradFP16CPUKernel::ReSize() { | |||
| if (nhwc4_input_ != nullptr) { | |||
| free(nhwc4_input_); | |||
| } | |||
| if (fp16_input_ != nullptr) { | |||
| free(fp16_input_); | |||
| } | |||
| if (fp16_out_ != nullptr) { | |||
| free(fp16_out_); | |||
| } | |||
| auto ret = ConvolutionBaseCPUKernel::Init(); | |||
| if (ret != RET_OK) { | |||
| @@ -412,6 +389,7 @@ int ConvolutionWinogradFP16CPUKernel::Run() { | |||
| // do nothing | |||
| } | |||
| ConvolutionBaseFP16CPUKernel::IfCastOutput(); | |||
| ConvolutionBaseFP16CPUKernel::FreeTmpBuffer(); | |||
| return RET_OK; | |||
| } | |||
| } // namespace mindspore::kernel | |||
| @@ -34,15 +34,9 @@ class ConvolutionWinogradFP16CPUKernel : public ConvolutionBaseFP16CPUKernel { | |||
| const lite::Primitive *primitive) | |||
| : ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive) {} | |||
| ~ConvolutionWinogradFP16CPUKernel() override { | |||
| if (fp16_input_ != nullptr) { | |||
| free(fp16_input_); | |||
| } | |||
| if (fp16_weight_ != nullptr) { | |||
| free(fp16_weight_); | |||
| } | |||
| if (fp16_out_ != nullptr) { | |||
| free(fp16_out_); | |||
| } | |||
| if (tmp_data_ != nullptr) { | |||
| free(tmp_data_); | |||
| } | |||
| @@ -185,11 +185,14 @@ int DeconvolutionDepthwiseFp16CPUKernel::Run() { | |||
| return RET_ERROR; | |||
| } | |||
| auto input_tensor = in_tensors_.at(kInputIndex); | |||
| auto input_addr = reinterpret_cast<float *>(input_tensor->Data()); | |||
| ret = ConvolutionBaseFP16CPUKernel::GetExecuteTensor(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Get Execute tensor failed."; | |||
| return ret; | |||
| } | |||
| // pack input: to nhwc8 | |||
| PackNHWCFp32ToNHWC8Fp16(input_addr, packed_input_, conv_param_->input_batch_, | |||
| conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_); | |||
| PackNHWCToNHWC8Fp16(execute_input_, packed_input_, conv_param_->input_batch_, | |||
| conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_); | |||
| ret = LiteBackendParallelLaunch(DeconvDwFp16Run, this, conv_param_->thread_num_); | |||
| if (ret != RET_OK) { | |||
| @@ -197,9 +200,10 @@ int DeconvolutionDepthwiseFp16CPUKernel::Run() { | |||
| return RET_ERROR; | |||
| } | |||
| auto output_addr = reinterpret_cast<float *>(out_tensors_.at(kOutputIndex)->Data()); | |||
| PackNHWC8Fp16ToNHWCFp32(packed_output_, output_addr, conv_param_->output_batch_, | |||
| conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_); | |||
| PackNHWC8ToNHWCFp16(packed_output_, execute_output_, conv_param_->output_batch_, | |||
| conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_); | |||
| ConvolutionBaseFP16CPUKernel::IfCastOutput(); | |||
| ConvolutionBaseFP16CPUKernel::FreeTmpBuffer(); | |||
| return RET_OK; | |||
| } | |||
| @@ -19,7 +19,7 @@ | |||
| #include <vector> | |||
| #include "src/lite_kernel.h" | |||
| #include "src/runtime/kernel/arm/base/convolution_base.h" | |||
| #include "src/runtime/kernel/arm/fp16/convolution_base_fp16.h" | |||
| #include "src/runtime/kernel/arm/nnacl/fp16/conv_depthwise_fp16.h" | |||
| #ifdef __cplusplus | |||
| @@ -34,12 +34,12 @@ void ComputeStrides(int *shape, int *strides, int ndim); | |||
| #endif | |||
| namespace mindspore::kernel { | |||
| class DeconvolutionDepthwiseFp16CPUKernel : public ConvolutionBaseCPUKernel { | |||
| class DeconvolutionDepthwiseFp16CPUKernel : public ConvolutionBaseFP16CPUKernel { | |||
| public: | |||
| DeconvolutionDepthwiseFp16CPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs, | |||
| const std::vector<lite::tensor::Tensor *> &outputs, const Context *ctx, | |||
| const lite::Primitive *primitive) | |||
| : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {} | |||
| : ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive) {} | |||
| ~DeconvolutionDepthwiseFp16CPUKernel() override; | |||
| int Init() override; | |||
| @@ -392,7 +392,7 @@ void PackNHWCToNHWC8Fp16(float16_t *src, float16_t *dst, int batch, int plane, i | |||
| for (int i = 0; i < plane; i++) { | |||
| float16_t *dst_plane = dst_batch + i * c8_channel; | |||
| float16_t *src_plane = src_batch + i * channel; | |||
| memcpy(dst_plane, src_batch, channel * sizeof(float16_t)); | |||
| memcpy(dst_plane, src_plane, channel * sizeof(float16_t)); | |||
| } | |||
| } | |||
| } | |||
| @@ -405,7 +405,7 @@ void PackNHWC8ToNHWCFp16(float16_t *src, float16_t *dst, int batch, int plane, i | |||
| for (int i = 0; i < plane; i++) { | |||
| float16_t *src_plane = src_batch + i * c8_channel; | |||
| float16_t *dst_plane = dst_batch + i * channel; | |||
| memcpy(dst_plane, src_batch, channel * sizeof(float16_t)); | |||
| memcpy(dst_plane, src_plane, channel * sizeof(float16_t)); | |||
| } | |||
| } | |||
| } | |||