@@ -0,0 +1,46 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "src/runtime/kernel/arm/fp16/common_fp16.h"
+#include "src/runtime/kernel/arm/nnacl/fp16/cast_fp16.h"
+
+namespace mindspore::kernel {
+float16_t *ConvertInputFp32toFp16(lite::tensor::Tensor *input, const lite::Context *ctx) {
+  float16_t *fp16_data = nullptr;
+  auto data_type = input->data_type();
+  if (data_type == kNumberTypeFloat32) {
+    auto ele_num = input->ElementsNum();
+    fp16_data = reinterpret_cast<float16_t *>(ctx->allocator->Malloc(ele_num * sizeof(float16_t)));
+    auto ori_data = reinterpret_cast<float *>(input->Data());
+    Float32ToFloat16(ori_data, fp16_data, ele_num);
+  } else {
+    fp16_data = reinterpret_cast<float16_t *>(input->Data());
+  }
+  return fp16_data;
+}
+
+float16_t *MallocOutputFp16(lite::tensor::Tensor *output, const lite::Context *ctx) {
+  float16_t *fp16_data = nullptr;
+  auto data_type = output->data_type();
+  if (data_type == kNumberTypeFloat32) {
+    auto ele_num = output->ElementsNum();
+    fp16_data = reinterpret_cast<float16_t *>(ctx->allocator->Malloc(ele_num * sizeof(float16_t)));
+  } else {
+    fp16_data = reinterpret_cast<float16_t *>(output->Data());
+  }
+  return fp16_data;
+}
+}  // namespace mindspore::kernel
@@ -0,0 +1,28 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_COMMON_FP16_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_COMMON_FP16_H_
+
+#include "src/lite_kernel.h"
+
+namespace mindspore::kernel {
+float16_t *ConvertInputFp32toFp16(lite::tensor::Tensor *input, const lite::Context *ctx);
+float16_t *MallocOutputFp16(lite::tensor::Tensor *output, const lite::Context *ctx);
+}  // namespace mindspore::kernel
+
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_COMMON_FP16_H_
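
The two helpers above centralize the fp32/fp16 staging that each FP16 kernel previously open-coded: ConvertInputFp32toFp16 either casts an fp32 input into a ctx-allocator buffer or aliases the tensor's own fp16 data, and MallocOutputFp16 reserves an fp16 staging buffer only when the output tensor is fp32. A minimal caller sketch, not part of the patch (DoComputeFp16 and the tensor indices are illustrative assumptions):

    // Hedged sketch: stage tensors, compute in fp16, cast back, free allocator-owned buffers.
    float16_t *in_ptr = ConvertInputFp32toFp16(in_tensors_.at(0), context_);
    float16_t *out_ptr = MallocOutputFp16(out_tensors_.at(0), context_);
    if (in_ptr == nullptr || out_ptr == nullptr) {  // the helpers do not null-check Malloc themselves
      MS_LOG(ERROR) << "malloc fp16 buffer failed.";
      return RET_ERROR;
    }
    DoComputeFp16(in_ptr, out_ptr);  // hypothetical fp16 kernel body
    if (out_tensors_.at(0)->data_type() == kNumberTypeFloat32) {
      // cast the fp16 staging buffer back into the fp32 output tensor
      Float16ToFloat32(out_ptr, reinterpret_cast<float *>(out_tensors_.at(0)->Data()),
                       out_tensors_.at(0)->ElementsNum());
      context_->allocator->Free(out_ptr);  // free only the allocator-owned staging buffer
    }
    if (in_tensors_.at(0)->data_type() == kNumberTypeFloat32) {
      context_->allocator->Free(in_ptr);
    }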
@@ -98,28 +98,6 @@ int Convolution1x1FP16CPUKernel::InitWeightBias() {
   return RET_OK;
 }
 
-int Convolution1x1FP16CPUKernel::InitBuffer() {
-  /*=============================fp16_input_============================*/
-  size_t fp16_input_size = conv_param_->input_channel_ * conv_param_->input_batch_ * conv_param_->input_h_ *
-                           conv_param_->input_w_ * sizeof(float16_t);
-  fp16_input_ = reinterpret_cast<float16_t *>(malloc(fp16_input_size));
-  if (fp16_input_ == nullptr) {
-    MS_LOG(ERROR) << "malloc fp16_input_ failed.";
-    return RET_ERROR;
-  }
-  memset(fp16_input_, 0, fp16_input_size);
-
-  /*=============================fp16_out_============================*/
-  size_t fp16_output_size = conv_param_->output_channel_ * conv_param_->output_batch_ * conv_param_->output_h_ *
-                            conv_param_->output_w_ * sizeof(float16_t);
-  fp16_out_ = reinterpret_cast<float16_t *>(malloc(fp16_output_size));
-  if (fp16_out_ == nullptr) {
-    MS_LOG(ERROR) << "malloc fp16_out_ failed.";
-    return RET_ERROR;
-  }
-  return RET_OK;
-}
-
 int Convolution1x1FP16CPUKernel::Init() {
   auto ret = ConvolutionBaseCPUKernel::Init();
   if (ret != RET_OK) {
@@ -136,11 +114,6 @@ int Convolution1x1FP16CPUKernel::Init() {
     MS_LOG(ERROR) << "Init conv1x1 param failed.";
     return ret;
   }
-  ret = InitBuffer();
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Init buffer failed.";
-    return ret;
-  }
   ret = InitWeightBias();
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "Init weight bias failed.";
@@ -150,12 +123,6 @@
   }
 }
 
 int Convolution1x1FP16CPUKernel::ReSize() {
-  if (fp16_out_ != nullptr) {
-    free(fp16_out_);
-  }
-  if (fp16_input_ != nullptr) {
-    free(fp16_input_);
-  }
   if (fp16_weight_ != nullptr) {
     free(fp16_weight_);
   }
@@ -181,12 +148,6 @@ int Convolution1x1FP16CPUKernel::ReSize() {
     MS_LOG(ERROR) << "Init conv1x1 param failed.";
     return ret;
   }
-  ret = InitBuffer();
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Init buffer failed.";
-    return ret;
-  }
   return RET_OK;
 }
@@ -253,6 +214,7 @@ int Convolution1x1FP16CPUKernel::Run() {
   }
   ConvolutionBaseFP16CPUKernel::IfCastOutput();
+  ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
   return RET_OK;
 }
 }  // namespace mindspore::kernel
@@ -35,15 +35,9 @@ class Convolution1x1FP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
     matmul_param_ = new MatMulParameter();
   }
   ~Convolution1x1FP16CPUKernel() override {
-    if (fp16_input_ != nullptr) {
-      free(fp16_input_);
-    }
     if (fp16_weight_ != nullptr) {
       free(fp16_weight_);
     }
-    if (fp16_out_ != nullptr) {
-      free(fp16_out_);
-    }
     if (input_ptr_ != nullptr) {
       free(input_ptr_);
     }
@@ -57,7 +51,6 @@ class Convolution1x1FP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
   int ReSize() override;
   int Run() override;
   int RunImpl(int task_id);
-  int InitBuffer();
   int InitConv1x1Param();
   int InitMatmulParam();
   int InitWeightBias();
@@ -132,16 +132,6 @@ int Convolution3x3FP16CPUKernel::InitTmpBuffer() {
   }
   memset(tmp_out_, 0, tmp_out_size);
 
-  /*=============================fp16_input_============================*/
-  size_t fp16_input_size = conv_param_->input_channel_ * conv_param_->input_batch_ * conv_param_->input_h_ *
-                           conv_param_->input_w_ * sizeof(float16_t);
-  fp16_input_ = reinterpret_cast<float16_t *>(malloc(fp16_input_size));
-  if (fp16_input_ == nullptr) {
-    MS_LOG(ERROR) << "malloc fp16_input_ failed.";
-    return RET_ERROR;
-  }
-  memset(fp16_input_, 0, fp16_input_size);
-
   /*=============================nhwc4_input_============================*/
   size_t nhwc4_input_size =
     iC4 * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float16_t);
@@ -152,14 +142,6 @@ int Convolution3x3FP16CPUKernel::InitTmpBuffer() {
   }
   memset(nhwc4_input_, 0, nhwc4_input_size);
 
-  /*=============================fp16_out_============================*/
-  size_t fp16_output_size = conv_param_->output_channel_ * conv_param_->output_batch_ * conv_param_->output_h_ *
-                            conv_param_->output_w_ * sizeof(float16_t);
-  fp16_out_ = reinterpret_cast<float16_t *>(malloc(fp16_output_size));
-  if (fp16_out_ == nullptr) {
-    MS_LOG(ERROR) << "malloc fp16_out_ failed.";
-    return RET_ERROR;
-  }
 
   return RET_OK;
 }
@@ -207,12 +189,6 @@ int Convolution3x3FP16CPUKernel::ReSize() {
   if (tmp_out_ != nullptr) {
     free(tmp_out_);
   }
-  if (fp16_out_ != nullptr) {
-    free(fp16_out_);
-  }
-  if (fp16_input_ != nullptr) {
-    free(fp16_input_);
-  }
   if (nhwc4_input_ != nullptr) {
     free(nhwc4_input_);
   }
@@ -304,6 +280,7 @@ int Convolution3x3FP16CPUKernel::Run() {
   }
   ConvolutionBaseFP16CPUKernel::IfCastOutput();
+  ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
   return RET_OK;
 }
 }  // namespace mindspore::kernel
@@ -31,15 +31,9 @@ class Convolution3x3FP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
                              const lite::Primitive *primitive)
       : ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive) {}
   ~Convolution3x3FP16CPUKernel() override {
-    if (fp16_input_ != nullptr) {
-      free(fp16_input_);
-    }
     if (fp16_weight_ != nullptr) {
       free(fp16_weight_);
     }
-    if (fp16_out_ != nullptr) {
-      free(fp16_out_);
-    }
     if (transformed_filter_addr_ != nullptr) {
       free(transformed_filter_addr_);
     }
@@ -16,6 +16,7 @@
 
 #include "src/runtime/kernel/arm/fp16/convolution_base_fp16.h"
 #include "src/runtime/kernel/arm/nnacl/fp16/cast_fp16.h"
+#include "src/runtime/kernel/arm/fp16/common_fp16.h"
 #include "schema/model_generated.h"
 #include "src/kernel_factory.h"
 #include "include/errorcode.h"
@@ -25,28 +26,17 @@ namespace mindspore::kernel {
 int ConvolutionBaseFP16CPUKernel::GetExecuteTensor() {
   // ===================input====================//
   auto input_tensor = in_tensors_.at(kInputIndex);
-  auto input_data_type = input_tensor->data_type();
-  MS_ASSERT(input_data_type == kNumberTypeFloat32 || input_data_type == kNumberTypeFloat16);
-  if (input_data_type == kNumberTypeFloat32) {
-    auto input_ele_num = input_tensor->ElementsNum();
-    auto ori_input_data = reinterpret_cast<float *>(input_tensor->Data());
-    Float32ToFloat16(ori_input_data, fp16_input_, input_ele_num);
-    execute_input_ = fp16_input_;
-  } else {
-    auto ori_input_data = reinterpret_cast<float16_t *>(input_tensor->Data());
-    execute_input_ = ori_input_data;
-  }
+  in_data_type_ = input_tensor->data_type();
+  MS_ASSERT(in_data_type_ == kNumberTypeFloat32 || in_data_type_ == kNumberTypeFloat16);
+  execute_input_ = ConvertInputFp32toFp16(input_tensor, context_);
 
   // ==================output====================//
   auto out_tensor = out_tensors_.at(kOutputIndex);
-  auto out_data_type = out_tensor->data_type();
-  MS_ASSERT(out_data_type == kNumberTypeFloat32 || out_data_type == kNumberTypeFloat16);
-  out_data_type_ = out_data_type;
-  if (out_data_type == kNumberTypeFloat32) {
-    execute_output_ = fp16_out_;
-  } else {
-    auto out_ptr = reinterpret_cast<float16_t *>(out_tensor->Data());
-    execute_output_ = out_ptr;
-  }
+  out_data_type_ = out_tensor->data_type();
+  MS_ASSERT(out_data_type_ == kNumberTypeFloat32 || out_data_type_ == kNumberTypeFloat16);
+  execute_output_ = MallocOutputFp16(out_tensor, context_);
   return RET_OK;
 }
@@ -79,7 +69,16 @@ void ConvolutionBaseFP16CPUKernel::IfCastOutput() {
     auto out_tensor = out_tensors_.at(kOutputIndex);
     auto out_ele_num = out_tensor->ElementsNum();
     auto output_addr = reinterpret_cast<float *>(out_tensor->Data());
-    Float16ToFloat32(fp16_out_, output_addr, out_ele_num);
+    Float16ToFloat32(execute_output_, output_addr, out_ele_num);
+  }
+}
+
+void ConvolutionBaseFP16CPUKernel::FreeTmpBuffer() {
+  if (in_data_type_ == kNumberTypeFloat32) {
+    context_->allocator->Free(execute_input_);
+  }
+  if (out_data_type_ == kNumberTypeFloat32) {
+    context_->allocator->Free(execute_output_);
   }
 }
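
Taken together, GetExecuteTensor, IfCastOutput, and FreeTmpBuffer give every derived kernel one staging lifecycle. A hedged sketch of the Run() shape the kernels below now follow (SomeFp16ConvKernel and DoConvFp16 are placeholders, not the patch's code):

    int SomeFp16ConvKernel::Run() {
      auto ret = ConvolutionBaseFP16CPUKernel::GetExecuteTensor();  // fills execute_input_/execute_output_
      if (ret != RET_OK) {
        MS_LOG(ERROR) << "Get Execute tensor failed.";
        return ret;
      }
      ret = DoConvFp16(execute_input_, execute_output_);  // placeholder for the kernel's fp16 math
      if (ret != RET_OK) {
        return ret;
      }
      ConvolutionBaseFP16CPUKernel::IfCastOutput();   // fp16 staging -> fp32 output tensor, if needed
      ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();  // after IfCastOutput: this frees execute_output_
      return RET_OK;
    }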
@@ -39,14 +39,14 @@ class ConvolutionBaseFP16CPUKernel : public ConvolutionBaseCPUKernel {
   virtual int GetExecuteTensor();
   virtual int GetExecuteFilter();
   virtual void IfCastOutput();
+  void FreeTmpBuffer();
 
  protected:
-  float16_t *fp16_input_ = nullptr;
   float16_t *fp16_weight_ = nullptr;
-  float16_t *fp16_out_ = nullptr;
-  float16_t *execute_input_;
+  float16_t *execute_input_;  // ctx allocator malloc and free
   float16_t *execute_weight_;
-  float16_t *execute_output_;
+  float16_t *execute_output_;  // ctx allocator malloc and free
+  TypeId in_data_type_;
   TypeId out_data_type_;
 };
 }  // namespace mindspore::kernel
@@ -173,22 +173,13 @@ int ConvolutionDepthwiseFp16CPUKernel::Run() {
     return RET_ERROR;
   }
 
-  auto input_tensor = in_tensors_.at(kInputIndex);
-  float16_t *input_addr;
-  if (input_tensor->data_type() == kNumberTypeFloat32) {
-    input_addr =
-      reinterpret_cast<float16_t *>(context_->allocator->Malloc(input_tensor->ElementsNum() * sizeof(float16_t)));
-    if (input_addr == nullptr) {
-      MS_LOG(ERROR) << "Malloc buffer failed.";
-      return RET_ERROR;
-    }
-    Float32ToFloat16(reinterpret_cast<float *>(input_tensor->Data()), input_addr, input_tensor->ElementsNum());
-  } else {
-    input_addr = reinterpret_cast<float16_t *>(input_tensor->Data());
+  ret = ConvolutionBaseFP16CPUKernel::GetExecuteTensor();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Get Execute tensor failed.";
+    return ret;
   }
 
   // pack input: to nhwc8
-  PackNHWCToNHWC8Fp16(input_addr, packed_input_, conv_param_->input_batch_,
+  PackNHWCToNHWC8Fp16(execute_input_, packed_input_, conv_param_->input_batch_,
                       conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_);
 
   ret = LiteBackendParallelLaunch(ConvDwFp16Run, this, conv_param_->thread_num_);
@@ -197,13 +188,11 @@ int ConvolutionDepthwiseFp16CPUKernel::Run() {
     return RET_ERROR;
   }
 
-  auto output_addr = reinterpret_cast<float16_t *>(out_tensors_.at(kOutputIndex)->Data());
-  PackNHWC8ToNHWCFp16(packed_output_, output_addr, conv_param_->output_batch_,
+  PackNHWC8ToNHWCFp16(packed_output_, execute_output_, conv_param_->output_batch_,
                       conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_);
 
-  if (input_tensor->data_type() == kNumberTypeFloat32) {
-    context_->allocator->Free(input_addr);
-  }
+  ConvolutionBaseFP16CPUKernel::IfCastOutput();
+  ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
   return RET_OK;
 }
@@ -19,7 +19,7 @@
 #include <vector>
 #include "src/lite_kernel.h"
-#include "src/runtime/kernel/arm/base/convolution_base.h"
+#include "src/runtime/kernel/arm/fp16/convolution_base_fp16.h"
 #include "src/runtime/kernel/arm/nnacl/fp16/conv_depthwise_fp16.h"
 
 #ifdef __cplusplus
@@ -34,12 +34,12 @@ void ConvDwC8Fp16(float16_t *output_data, const float16_t *input_data, const flo
 #endif
 
 namespace mindspore::kernel {
-class ConvolutionDepthwiseFp16CPUKernel : public ConvolutionBaseCPUKernel {
+class ConvolutionDepthwiseFp16CPUKernel : public ConvolutionBaseFP16CPUKernel {
  public:
   ConvolutionDepthwiseFp16CPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
                                     const std::vector<lite::tensor::Tensor *> &outputs, const Context *ctx,
                                     const lite::Primitive *primitive)
-      : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {}
+      : ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive) {}
   ~ConvolutionDepthwiseFp16CPUKernel() override;
 
   int Init() override;
@@ -103,15 +103,6 @@ int ConvolutionFP16CPUKernel::InitTmpBuffer() {
   }
   memset(packed_input_, 0, in_batch * packed_input_size * sizeof(float16_t));
 
-  /*=============================fp16_input_============================*/
-  size_t fp16_input_size =
-    in_channel * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float16_t);
-  fp16_input_ = reinterpret_cast<float16_t *>(malloc(fp16_input_size));
-  if (fp16_input_ == nullptr) {
-    MS_LOG(ERROR) << "malloc fp16_input_ failed.";
-    return RET_ERROR;
-  }
-
   /*=============================nhwc4_input_============================*/
   size_t nhwc4_input_size = channel_block * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ *
                             conv_param_->input_w_ * sizeof(float16_t);
@@ -129,14 +120,6 @@ int ConvolutionFP16CPUKernel::InitTmpBuffer() {
     return RET_ERROR;
   }
 
-  /*=============================fp16_out_============================*/
-  size_t fp16_output_size =
-    out_channel * conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * sizeof(float16_t);
-  fp16_out_ = reinterpret_cast<float16_t *>(malloc(fp16_output_size));
-  if (fp16_out_ == nullptr) {
-    MS_LOG(ERROR) << "malloc fp16_out_ failed.";
-    return RET_ERROR;
-  }
   return RET_OK;
 }
@@ -181,12 +164,6 @@ int ConvolutionFP16CPUKernel::ReSize() {
   if (nhwc4_input_ != nullptr) {
     free(nhwc4_input_);
   }
-  if (fp16_input_ != nullptr) {
-    free(fp16_input_);
-  }
-  if (fp16_out_ != nullptr) {
-    free(fp16_out_);
-  }
 
   auto ret = ConvolutionBaseCPUKernel::Init();
   if (ret != RET_OK) {
@@ -242,6 +219,7 @@ int ConvolutionFP16CPUKernel::Run() {
     return RET_ERROR;
   }
   ConvolutionBaseFP16CPUKernel::IfCastOutput();
+  ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();  // must follow IfCastOutput, which still reads execute_output_
   return RET_OK;
 }
@@ -30,15 +30,9 @@ class ConvolutionFP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
                            const lite::Primitive *primitive)
       : ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive) {}
   ~ConvolutionFP16CPUKernel() override {
-    if (fp16_input_ != nullptr) {
-      free(fp16_input_);
-    }
     if (fp16_weight_ != nullptr) {
       free(fp16_weight_);
     }
-    if (fp16_out_ != nullptr) {
-      free(fp16_out_);
-    }
     if (packed_input_ != nullptr) {
       free(packed_input_);
     }
@@ -106,15 +106,6 @@ int ConvolutionSWFP16CPUKernel::InitTmpBuffer() {
   int channel_block = UP_DIV(in_channel, C4NUM);
   int oc4 = UP_DIV(out_channel, C4NUM);
 
-  /*=============================fp16_input_============================*/
-  size_t fp16_input_size =
-    in_channel * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float16_t);
-  fp16_input_ = reinterpret_cast<float16_t *>(malloc(fp16_input_size));
-  if (fp16_input_ == nullptr) {
-    MS_LOG(ERROR) << "malloc fp16_input_ failed.";
-    return RET_ERROR;
-  }
-
   /*=============================nhwc4_input_============================*/
   size_t nhwc4_input_size = channel_block * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ *
                             conv_param_->input_w_ * sizeof(float16_t);
@@ -133,14 +124,6 @@ int ConvolutionSWFP16CPUKernel::InitTmpBuffer() {
     return RET_ERROR;
   }
 
-  /*=============================fp16_out_============================*/
-  size_t fp16_output_size =
-    out_channel * conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * sizeof(float16_t);
-  fp16_out_ = reinterpret_cast<float16_t *>(malloc(fp16_output_size));
-  if (fp16_out_ == nullptr) {
-    MS_LOG(ERROR) << "malloc fp16_out_ failed.";
-    return RET_ERROR;
-  }
   return RET_OK;
 }
@@ -186,12 +169,6 @@ int ConvolutionSWFP16CPUKernel::ReSize() {
   if (nhwc4_input_ != nullptr) {
     free(nhwc4_input_);
   }
-  if (fp16_input_ != nullptr) {
-    free(fp16_input_);
-  }
-  if (fp16_out_ != nullptr) {
-    free(fp16_out_);
-  }
   delete slidingWindow_param_;
 
   auto ret = ConvolutionBaseCPUKernel::Init();
@@ -258,6 +235,7 @@ int ConvolutionSWFP16CPUKernel::Run() {
                          conv_param_->output_channel_);
   }
   ConvolutionBaseFP16CPUKernel::IfCastOutput();
+  ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
   return RET_OK;
 }
 }  // namespace mindspore::kernel
@@ -29,15 +29,9 @@ class ConvolutionSWFP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
                            const lite::Primitive *primitive)
       : ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive) {}
   ~ConvolutionSWFP16CPUKernel() override {
-    if (fp16_input_ != nullptr) {
-      free(fp16_input_);
-    }
     if (fp16_weight_ != nullptr) {
       free(fp16_weight_);
     }
-    if (fp16_out_ != nullptr) {
-      free(fp16_out_);
-    }
     if (packed_weight_ != nullptr) {
       free(packed_weight_);
     }
@@ -187,15 +187,6 @@ int ConvolutionWinogradFP16CPUKernel::InitTmpBuffer() {
   int ic4 = UP_DIV(channel_in, C4NUM);
   int oc8 = UP_DIV(channel_out, C8NUM);
 
-  /*=============================fp16_input_============================*/
-  size_t fp16_input_size = conv_param_->input_channel_ * conv_param_->input_batch_ * conv_param_->input_h_ *
-                           conv_param_->input_w_ * sizeof(float16_t);
-  fp16_input_ = reinterpret_cast<float16_t *>(malloc(fp16_input_size));
-  if (fp16_input_ == nullptr) {
-    MS_LOG(ERROR) << "malloc fp16_input_ failed.";
-    return RET_ERROR;
-  }
-
   /*=============================trans_input_============================*/
   size_t tile_buffer_size = thread_count_ * cal_num * input_unit_ * input_unit_ * ic4 * C4NUM * sizeof(float16_t);
   trans_input_ = reinterpret_cast<float16_t *>(malloc(tile_buffer_size));
@@ -222,14 +213,6 @@ int ConvolutionWinogradFP16CPUKernel::InitTmpBuffer() {
     MS_LOG(ERROR) << "malloc tmp_out_data_ failed.";
     return RET_ERROR;
   }
-  /*=============================fp16_out_============================*/
-  size_t fp16_output_size = conv_param_->output_channel_ * conv_param_->output_batch_ * conv_param_->output_h_ *
-                            conv_param_->output_w_ * sizeof(float16_t);
-  fp16_out_ = reinterpret_cast<float16_t *>(malloc(fp16_output_size));
-  if (fp16_out_ == nullptr) {
-    MS_LOG(ERROR) << "malloc fp16_out_ failed.";
-    return RET_ERROR;
-  }
 
   /*=============================tmp_data_============================*/
   tmp_data_ =
@@ -327,12 +310,6 @@ int ConvolutionWinogradFP16CPUKernel::ReSize() {
   if (nhwc4_input_ != nullptr) {
     free(nhwc4_input_);
   }
-  if (fp16_input_ != nullptr) {
-    free(fp16_input_);
-  }
-  if (fp16_out_ != nullptr) {
-    free(fp16_out_);
-  }
 
   auto ret = ConvolutionBaseCPUKernel::Init();
   if (ret != RET_OK) {
@@ -412,6 +389,7 @@ int ConvolutionWinogradFP16CPUKernel::Run() {
     // do nothing
   }
   ConvolutionBaseFP16CPUKernel::IfCastOutput();
+  ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
   return RET_OK;
 }
 }  // namespace mindspore::kernel
@@ -34,15 +34,9 @@ class ConvolutionWinogradFP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
                                  const lite::Primitive *primitive)
       : ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive) {}
   ~ConvolutionWinogradFP16CPUKernel() override {
-    if (fp16_input_ != nullptr) {
-      free(fp16_input_);
-    }
     if (fp16_weight_ != nullptr) {
       free(fp16_weight_);
     }
-    if (fp16_out_ != nullptr) {
-      free(fp16_out_);
-    }
     if (tmp_data_ != nullptr) {
       free(tmp_data_);
     }
@@ -185,11 +185,14 @@ int DeconvolutionDepthwiseFp16CPUKernel::Run() {
     return RET_ERROR;
   }
 
-  auto input_tensor = in_tensors_.at(kInputIndex);
-  auto input_addr = reinterpret_cast<float *>(input_tensor->Data());
+  ret = ConvolutionBaseFP16CPUKernel::GetExecuteTensor();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Get Execute tensor failed.";
+    return ret;
+  }
 
   // pack input: to nhwc8
-  PackNHWCFp32ToNHWC8Fp16(input_addr, packed_input_, conv_param_->input_batch_,
-                          conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_);
+  PackNHWCToNHWC8Fp16(execute_input_, packed_input_, conv_param_->input_batch_,
+                      conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_);
 
   ret = LiteBackendParallelLaunch(DeconvDwFp16Run, this, conv_param_->thread_num_);
   if (ret != RET_OK) {
@@ -197,9 +200,10 @@ int DeconvolutionDepthwiseFp16CPUKernel::Run() {
     return RET_ERROR;
   }
 
-  auto output_addr = reinterpret_cast<float *>(out_tensors_.at(kOutputIndex)->Data());
-  PackNHWC8Fp16ToNHWCFp32(packed_output_, output_addr, conv_param_->output_batch_,
-                          conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_);
+  PackNHWC8ToNHWCFp16(packed_output_, execute_output_, conv_param_->output_batch_,
+                      conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_);
+  ConvolutionBaseFP16CPUKernel::IfCastOutput();
+  ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
   return RET_OK;
 }
@@ -19,7 +19,7 @@
 #include <vector>
 #include "src/lite_kernel.h"
-#include "src/runtime/kernel/arm/base/convolution_base.h"
+#include "src/runtime/kernel/arm/fp16/convolution_base_fp16.h"
 #include "src/runtime/kernel/arm/nnacl/fp16/conv_depthwise_fp16.h"
 
 #ifdef __cplusplus
@@ -34,12 +34,12 @@ void ComputeStrides(int *shape, int *strides, int ndim);
 #endif
 
 namespace mindspore::kernel {
-class DeconvolutionDepthwiseFp16CPUKernel : public ConvolutionBaseCPUKernel {
+class DeconvolutionDepthwiseFp16CPUKernel : public ConvolutionBaseFP16CPUKernel {
  public:
   DeconvolutionDepthwiseFp16CPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
                                       const std::vector<lite::tensor::Tensor *> &outputs, const Context *ctx,
                                       const lite::Primitive *primitive)
-      : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {}
+      : ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive) {}
   ~DeconvolutionDepthwiseFp16CPUKernel() override;
 
   int Init() override;
@@ -392,7 +392,7 @@ void PackNHWCToNHWC8Fp16(float16_t *src, float16_t *dst, int batch, int plane, i
     for (int i = 0; i < plane; i++) {
       float16_t *dst_plane = dst_batch + i * c8_channel;
       float16_t *src_plane = src_batch + i * channel;
-      memcpy(dst_plane, src_batch, channel * sizeof(float16_t));
+      memcpy(dst_plane, src_plane, channel * sizeof(float16_t));
     }
   }
 }
@@ -405,7 +405,7 @@ void PackNHWC8ToNHWCFp16(float16_t *src, float16_t *dst, int batch, int plane, i
     for (int i = 0; i < plane; i++) {
       float16_t *src_plane = src_batch + i * c8_channel;
       float16_t *dst_plane = dst_batch + i * channel;
-      memcpy(dst_plane, src_batch, channel * sizeof(float16_t));
+      memcpy(dst_plane, src_plane, channel * sizeof(float16_t));
     }
   }
 }
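
The two pack fixes above are one-argument changes with outsized effect: with src_batch as the copy source, every plane in a batch received the bytes of plane 0, corrupting any tensor with more than one spatial position. A self-contained illustration of the corrected indexing (plain float stands in for float16_t so the snippet compiles anywhere; the sizes are made up for the demonstration):

    #include <cassert>
    #include <cstring>

    int main() {
      const int plane = 2, channel = 3, c8_channel = 8;  // channel padded up to a multiple of C8NUM = 8
      float src[plane * channel] = {1, 2, 3, 4, 5, 6};   // two NHWC planes: {1,2,3} and {4,5,6}
      float dst[plane * c8_channel] = {};
      for (int i = 0; i < plane; i++) {
        // fixed form: plane i copies from its own source row (src_plane),
        // not from the start of the batch (src_batch)
        std::memcpy(dst + i * c8_channel, src + i * channel, channel * sizeof(float));
      }
      assert(dst[c8_channel] == 4);  // plane 1 keeps {4,5,6}; it would be 1 with the pre-fix argument
      return 0;
    }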