| @@ -172,7 +172,8 @@ union PrimitiveType { | |||
| TupleGetItem, | |||
| Div, | |||
| Where, | |||
| OneHot | |||
| OneHot, | |||
| Lstm | |||
| } | |||
| enum QuantType: int { | |||
| @@ -718,3 +718,7 @@ table Where{ | |||
| table OneHot { | |||
| axis: int; | |||
| } | |||
| table Lstm{ | |||
| bidirection: bool = false; | |||
| } | |||
| @@ -25,6 +25,8 @@ | |||
| #ifdef ENABLE_FP16 | |||
| #include "src/runtime/kernel/arm/fp16/convolution_fp16.h" | |||
| #include "src/runtime/kernel/arm/fp16/convolution_3x3_fp16.h" | |||
| #include "src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.h" | |||
| #include "src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.h" | |||
| #endif | |||
| #include "src/runtime/kernel/arm/int8/deconvolution_int8.h" | |||
| #include "src/runtime/kernel/arm/int8/convolution_int8.h" | |||
| @@ -347,6 +349,19 @@ kernel::LiteKernel *CpuConvDwFp32KernelCreator(const std::vector<lite::tensor::T | |||
| return kernel; | |||
| } | |||
| #ifdef ENABLE_FP16 | |||
| kernel::LiteKernel *CpuConvDwFp16KernelCreator(const std::vector<lite::tensor::Tensor *> &inputs, | |||
| const std::vector<lite::tensor::Tensor *> &outputs, | |||
| OpParameter *opParameter, const Context *ctx) { | |||
| auto kernel = new (std::nothrow) ConvolutionDepthwiseFp16CPUKernel(opParameter, inputs, outputs, ctx); | |||
| if (kernel == nullptr) { | |||
| MS_LOG(ERROR) << "kernel is nullptr."; | |||
| return nullptr; | |||
| } | |||
| return kernel; | |||
| } | |||
| #endif | |||
| kernel::LiteKernel *CpuConvDwInt8KernelCreator(const std::vector<lite::tensor::Tensor *> &inputs, | |||
| const std::vector<lite::tensor::Tensor *> &outputs, | |||
| OpParameter *opParameter, const Context *ctx) { | |||
| @@ -372,12 +387,12 @@ kernel::LiteKernel *CpuConvDwKernelCreator(const std::vector<lite::tensor::Tenso | |||
| break; | |||
| case kNumberTypeUInt8: | |||
| break; | |||
| #ifdef ENABLE_FP16 | |||
| case kNumberTypeFloat16: | |||
| break; | |||
| #endif | |||
| case kNumberTypeFloat32: | |||
| #ifdef ENABLE_FP16 | |||
| kernel = CpuConvDwFp16KernelCreator(inputs, outputs, opParameter, ctx); | |||
| #else | |||
| kernel = CpuConvDwFp32KernelCreator(inputs, outputs, opParameter, ctx); | |||
| #endif | |||
| break; | |||
| default: | |||
| break; | |||
| @@ -407,6 +422,19 @@ kernel::LiteKernel *CpuDeconvDwFp32KernelCreator(const std::vector<lite::tensor: | |||
| return kernel; | |||
| } | |||
| #ifdef ENABLE_FP16 | |||
| kernel::LiteKernel *CpuDeconvDwFp16KernelCreator(const std::vector<lite::tensor::Tensor *> &inputs, | |||
| const std::vector<lite::tensor::Tensor *> &outputs, | |||
| OpParameter *opParameter, const lite::Context *ctx) { | |||
| auto kernel = new (std::nothrow) DeconvolutionDepthwiseFp16CPUKernel(opParameter, inputs, outputs, ctx); | |||
| if (kernel == nullptr) { | |||
| MS_LOG(ERROR) << "kernel is nullptr."; | |||
| return nullptr; | |||
| } | |||
| return kernel; | |||
| } | |||
| #endif | |||
| kernel::LiteKernel *CpuDeconvDwInt8KernelCreator(const std::vector<lite::tensor::Tensor *> &inputs, | |||
| const std::vector<lite::tensor::Tensor *> &outputs, | |||
| OpParameter *opParameter, const lite::Context *ctx) { | |||
| @@ -432,7 +460,11 @@ kernel::LiteKernel *CpuDeconvDwKernelCreator(const std::vector<lite::tensor::Ten | |||
| kernel = CpuDeconvDwInt8KernelCreator(inputs, outputs, opParameter, ctx); | |||
| break; | |||
| case kNumberTypeFloat32: | |||
| #ifdef ENABLE_FP16 | |||
| kernel = CpuDeconvDwFp16KernelCreator(inputs, outputs, opParameter, ctx); | |||
| #else | |||
| kernel = CpuDeconvDwFp32KernelCreator(inputs, outputs, opParameter, ctx); | |||
| #endif | |||
| break; | |||
| default: | |||
| break; | |||
| @@ -0,0 +1,164 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.h" | |||
| #include "schema/model_generated.h" | |||
| #include "src/kernel_registry.h" | |||
| #include "include/errorcode.h" | |||
| #include "src/runtime/runtime_api.h" | |||
| using mindspore::kernel::KERNEL_ARCH::kCPU; | |||
| using mindspore::lite::KernelRegistrar; | |||
| using mindspore::lite::RET_ERROR; | |||
| using mindspore::lite::RET_OK; | |||
| using mindspore::schema::PrimitiveType_DepthwiseConv2D; | |||
| namespace mindspore::kernel { | |||
int ConvolutionDepthwiseFp16CPUKernel::InitBuffer() {
  // Allocate the fp16 buffer holding the input repacked to NHWC8: channels are
  // padded up to a multiple of C8NUM so NEON can process 8 lanes per iteration.
  int C8 = UP_DIV(conv_param_->input_channel_, C8NUM);
  int pack_input_size = conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * C8NUM * C8;
  packed_input_ = reinterpret_cast<float16_t *>(malloc(pack_input_size * sizeof(float16_t)));
  if (packed_input_ == nullptr) {
    MS_LOG(ERROR) << "Malloc buffer failed.";
    return RET_ERROR;
  }
  memset(packed_input_, 0, pack_input_size * sizeof(float16_t));
  // Packed output buffer; zero-filled because the compute kernel accumulates
  // (FMA) into it rather than overwriting.
  // NOTE(review): sized with the input-channel C8 count — consistent for depthwise
  // where input_channel_ == output_channel_ (enforced in Run()), but worth confirming.
  int pack_output_size = conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * C8NUM * C8;
  packed_output_ = reinterpret_cast<float16_t *>(malloc(pack_output_size * sizeof(float16_t)));
  if (packed_output_ == nullptr) {
    MS_LOG(ERROR) << "Malloc buffer failed.";
    return RET_ERROR;
  }
  memset(packed_output_, 0, pack_output_size * sizeof(float16_t));
  return RET_OK;
}
int ConvolutionDepthwiseFp16CPUKernel::InitWeightBias() {
  // Convert fp32 weights (layout o,h,w,i with o == group, i == 1) into the
  // C8-blocked fp16 layout consumed by ConvDwC8Fp16, and prepare an fp16 bias.
  int OC8 = UP_DIV(conv_param_->output_channel_, C8NUM);
  auto weight_tensor = inputs_[kWeightIndex];
  auto origin_weight = reinterpret_cast<float *>(weight_tensor->Data());
  int pack_weight_size = C8NUM * OC8 * conv_param_->kernel_h_ * conv_param_->kernel_w_;
  packed_weight_ = reinterpret_cast<float16_t *>(malloc(pack_weight_size * sizeof(float16_t)));
  if (packed_weight_ == nullptr) {
    MS_LOG(ERROR) << "Malloc buffer failed.";
    return RET_ERROR;
  }
  memset(packed_weight_, 0, pack_weight_size * sizeof(float16_t));
  PackNCHWFp32ToNC8HW8Fp16(origin_weight, packed_weight_, 1, conv_param_->kernel_h_ * conv_param_->kernel_w_,
                           conv_param_->output_channel_);
  // Bias buffer is padded to OC8 * C8NUM and zero-filled, so padded channels
  // (and the no-bias case) contribute nothing.
  bias_data_ = reinterpret_cast<float16_t *>(malloc(C8NUM * OC8 * sizeof(float16_t)));
  if (bias_data_ == nullptr) {
    MS_LOG(ERROR) << "Malloc buffer failed.";
    return RET_ERROR;
  }
  memset(bias_data_, 0, C8NUM * OC8 * sizeof(float16_t));
  auto bias_fp16 = reinterpret_cast<float16_t *>(bias_data_);
  // A third input tensor, when present, is the per-channel fp32 bias.
  if (inputs_.size() == kInputSize2) {
    auto ori_bias = reinterpret_cast<float *>(inputs_.at(kBiasIndex)->Data());
    for (int i = 0; i < conv_param_->output_channel_; i++) {
      bias_fp16[i] = (float16_t)ori_bias[i];
    }
  }
  // Never launch more threads than there are C8 channel blocks to process.
  conv_param_->thread_num_ = MSMIN(thread_count_, OC8);
  return RET_OK;
}
int ConvolutionDepthwiseFp16CPUKernel::Init() {
  // Base-class initialization first.
  ConvolutionBaseCPUKernel::Init();
  // Allocate and fill the sliding-window parameters (8-channel blocks).
  // NOTE(review): plain `new` here throws on OOM instead of returning nullptr;
  // the kernel creators use `new (std::nothrow)` — confirm which is intended.
  sliding_ = new SlidingWindowParam;
  InitSlidingParam(sliding_, conv_param_, C8NUM);
  auto ret = InitWeightBias();
  if (ret != 0) {
    MS_LOG(ERROR) << "Convolution depthwise fp16 InitWeightBias failed.";
    return RET_ERROR;
  }
  ret = InitBuffer();
  if (ret != 0) {
    MS_LOG(ERROR) << "Convolution depthwise fp16 InitBuffer failed.";
    return RET_ERROR;
  }
  return RET_OK;
}
| int ConvolutionDepthwiseFp16CPUKernel::ReSize() { | |||
| free(packed_input_); | |||
| free(packed_output_); | |||
| ConvolutionBaseCPUKernel::Init(); | |||
| InitSlidingParam(sliding_, conv_param_, C8NUM); | |||
| auto ret = InitBuffer(); | |||
| if (ret != 0) { | |||
| MS_LOG(ERROR) << "Convolution depthwise fp16 InitBuffer failed."; | |||
| return RET_ERROR; | |||
| } | |||
| return RET_OK; | |||
| } | |||
// Computes this worker's share of the depthwise convolution on the packed buffers.
int ConvolutionDepthwiseFp16CPUKernel::Execute(int task_id) {
  ConvDwC8Fp16(packed_output_, packed_input_, packed_weight_, reinterpret_cast<float16_t *>(bias_data_), conv_param_,
               sliding_, task_id);
  return RET_OK;
}
// Thread entry point for LiteBackendParallelLaunch: recovers the kernel from
// the opaque cdata pointer and forwards this worker's task id to Execute().
int ConvDwFp16Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) {
  auto conv_dw_fp16 = reinterpret_cast<ConvolutionDepthwiseFp16CPUKernel *>(cdata);
  auto ret = conv_dw_fp16->Execute(task_id);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "ConvolutionDepthwiseFp16Run error task_id[" << task_id << "] error_code[" << ret << "]";
    return RET_ERROR;
  }
  return RET_OK;
}
int ConvolutionDepthwiseFp16CPUKernel::Run() {
  // Depthwise convolution applies one filter per channel, so the channel
  // counts must match.
  if (conv_param_->input_channel_ != conv_param_->output_channel_) {
    MS_LOG(ERROR) << "Only support input channel equals output channel.";
    return RET_ERROR;
  }
  auto input_tensor = inputs_.at(kInputIndex);
  auto input_addr = reinterpret_cast<float *>(input_tensor->Data());
  // Pack input: fp32 NHWC -> fp16 NHWC8 (channel padded to a multiple of 8).
  PackNHWCFp32ToNHWC8Fp16(input_addr, packed_input_, conv_param_->input_batch_,
                          conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_);
  // Parallel compute over C8 channel blocks (see ConvDwFp16Run / Execute).
  auto ret = LiteBackendParallelLaunch(ConvDwFp16Run, this, conv_param_->thread_num_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "ConvDwFp16Run error: error_code[" << ret << "]";
    return RET_ERROR;
  }
  // Unpack result: fp16 NHWC8 -> fp32 NHWC into the output tensor.
  auto output_addr = reinterpret_cast<float *>(outputs_.at(kOutputIndex)->Data());
  PackNHWC8Fp16ToNHWCFp32(packed_output_, output_addr, conv_param_->output_batch_,
                          conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_);
  return RET_OK;
}
| } // namespace mindspore::kernel | |||
| @@ -0,0 +1,54 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_CONVOLUTION_DEPTHWISE_FP16_H_ | |||
| #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_CONVOLUTION_DEPTHWISE_FP16_H_ | |||
| #include <vector> | |||
| #include "src/lite_kernel.h" | |||
| #include "src/runtime/kernel/arm/base/convolution_base.h" | |||
| #include "src/runtime/kernel/arm/opclib/fp16/conv_depthwise_fp16.h" | |||
| namespace mindspore::kernel { | |||
| class ConvolutionDepthwiseFp16CPUKernel : public ConvolutionBaseCPUKernel { | |||
| public: | |||
| ConvolutionDepthwiseFp16CPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs, | |||
| const std::vector<lite::tensor::Tensor *> &outputs, const Context *ctx) | |||
| : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx) {} | |||
| ~ConvolutionDepthwiseFp16CPUKernel() override { | |||
| delete sliding_; | |||
| free(packed_weight_); | |||
| free(packed_input_); | |||
| free(packed_output_); | |||
| } | |||
| int Init() override; | |||
| int ReSize() override; | |||
| int Run() override; | |||
| int InitBuffer(); | |||
| int InitWeightBias(); | |||
| int Execute(int task_id); | |||
| private: | |||
| SlidingWindowParam *sliding_; | |||
| float16_t *packed_weight_; | |||
| float16_t *packed_input_; | |||
| float16_t *packed_output_; | |||
| }; | |||
| } // namespace mindspore::kernel | |||
| #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_CONVOLUTION_DEPTHWISE_FP16_H_ | |||
| @@ -0,0 +1,174 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.h" | |||
| #include "schema/model_generated.h" | |||
| #include "src/kernel_registry.h" | |||
| #include "include/errorcode.h" | |||
| #include "src/runtime/runtime_api.h" | |||
| using mindspore::kernel::KERNEL_ARCH::kCPU; | |||
| using mindspore::lite::KernelRegistrar; | |||
| using mindspore::lite::RET_ERROR; | |||
| using mindspore::lite::RET_OK; | |||
| using mindspore::schema::PrimitiveType_DepthwiseConv2D; | |||
| namespace mindspore::kernel { | |||
| int DeconvolutionDepthwiseFp16CPUKernel::InitSlideParam() { | |||
| conv_param_->input_batch_ = outputs_.front()->shape().at(kNHWC_N); | |||
| conv_param_->input_h_ = outputs_.front()->shape().at(kNHWC_H); | |||
| conv_param_->input_w_ = outputs_.front()->shape().at(kNHWC_W); | |||
| conv_param_->input_channel_ = outputs_.front()->shape().at(kNHWC_C); | |||
| conv_param_->output_batch_ = inputs_.front()->shape().at(kNHWC_N); | |||
| conv_param_->output_h_ = inputs_.front()->shape().at(kNHWC_H); | |||
| conv_param_->output_w_ = inputs_.front()->shape().at(kNHWC_W); | |||
| conv_param_->output_channel_ = inputs_.front()->shape().at(kNHWC_C); | |||
| // init sliding_ window param | |||
| InitSlidingParam(sliding_, conv_param_, C8NUM); | |||
| return RET_OK; | |||
| } | |||
int DeconvolutionDepthwiseFp16CPUKernel::InitBuffer() {
  // Allocate zero-filled fp16 buffers for the C8-packed input and output.
  // Zeroing matters: the sliding-window kernels accumulate (FMA) into them.
  // Note: input_*/output_* fields were swapped by InitSlideParam() for deconv.
  int C8 = UP_DIV(conv_param_->input_channel_, C8NUM);
  int pack_input_size = conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * C8NUM * C8;
  packed_input_ = reinterpret_cast<float16_t *>(malloc(pack_input_size * sizeof(float16_t)));
  if (packed_input_ == nullptr) {
    MS_LOG(ERROR) << "Malloc buffer failed.";
    return RET_ERROR;
  }
  memset(packed_input_, 0, pack_input_size * sizeof(float16_t));
  int pack_output_size = conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * C8NUM * C8;
  packed_output_ = reinterpret_cast<float16_t *>(malloc(pack_output_size * sizeof(float16_t)));
  if (packed_output_ == nullptr) {
    MS_LOG(ERROR) << "Malloc buffer failed.";
    return RET_ERROR;
  }
  memset(packed_output_, 0, pack_output_size * sizeof(float16_t));
  return RET_OK;
}
| int DeconvolutionDepthwiseFp16CPUKernel::InitWeightBias() { | |||
| // init weight: o, h, w, i; o == group, i == 1 | |||
| int OC8 = UP_DIV(conv_param_->output_channel_, C8NUM); | |||
| auto weight_tensor = inputs_[kWeightIndex]; | |||
| auto origin_weight = reinterpret_cast<float *>(weight_tensor->Data()); | |||
| int pack_weight_size = C8NUM * OC8 * conv_param_->kernel_h_ * conv_param_->kernel_w_; | |||
| packed_weight_ = reinterpret_cast<float16_t *>(malloc(pack_weight_size * sizeof(float16_t))); | |||
| if (packed_weight_ == nullptr) { | |||
| MS_LOG(ERROR) << "Malloc buffer failed."; | |||
| return RET_ERROR; | |||
| } | |||
| memset(packed_weight_, 0, pack_weight_size * sizeof(float16_t)); | |||
| PackNCHWFp32ToNC8HW8Fp16(origin_weight, packed_weight_, 1, conv_param_->kernel_h_ * conv_param_->kernel_w_, | |||
| conv_param_->output_channel_); | |||
| // init bias | |||
| bias_data_ = reinterpret_cast<float16_t *>(malloc(C8NUM * OC8 * sizeof(float16_t))); | |||
| if (bias_data_ == nullptr) { | |||
| MS_LOG(ERROR) << "Malloc buffer failed."; | |||
| return RET_ERROR; | |||
| } | |||
| memset(bias_data_, 0, C8NUM * OC8 * sizeof(float16_t)); | |||
| if (inputs_.size() == kInputSize2) { | |||
| auto ori_bias = reinterpret_cast<float *>(inputs_.at(kBiasIndex)->Data()); | |||
| for (int i = 0; i < conv_param_->output_channel_; i++) { | |||
| reinterpret_cast<float *>(bias_data_)[i] = (float16_t)ori_bias[i]; | |||
| } | |||
| } | |||
| conv_param_->thread_num_ = MSMIN(thread_count_, OC8); | |||
| return RET_OK; | |||
| } | |||
int DeconvolutionDepthwiseFp16CPUKernel::Init() {
  // The sliding params object must exist before InitSlideParam() fills it in.
  // NOTE(review): plain `new` throws on OOM instead of returning nullptr; the
  // kernel creators use `new (std::nothrow)` — confirm which is intended.
  sliding_ = new SlidingWindowParam;
  InitSlideParam();
  // Base-class initialization.
  ConvolutionBaseCPUKernel::Init();
  auto ret = InitWeightBias();
  if (ret != 0) {
    MS_LOG(ERROR) << "Deconvolution depthwise fp16 InitWeightBias failed.";
    return RET_ERROR;
  }
  ret = InitBuffer();
  if (ret != 0) {
    MS_LOG(ERROR) << "Deconvolution depthwise fp16 InitBuffer failed.";
    return RET_ERROR;
  }
  return RET_OK;
}
| int DeconvolutionDepthwiseFp16CPUKernel::ReSize() { | |||
| free(packed_input_); | |||
| free(packed_output_); | |||
| InitSlideParam(); | |||
| ConvolutionBaseCPUKernel::Init(); | |||
| auto ret = InitBuffer(); | |||
| if (ret != 0) { | |||
| MS_LOG(ERROR) << "Convolution depthwise fp16 InitBuffer failed."; | |||
| return RET_ERROR; | |||
| } | |||
| return RET_OK; | |||
| } | |||
// Computes this worker's share of the depthwise deconvolution on the packed buffers.
int DeconvolutionDepthwiseFp16CPUKernel::Execute(int task_id) {
  DeconvDwC8Fp16(packed_output_, packed_input_, packed_weight_, reinterpret_cast<float16_t *>(bias_data_), conv_param_,
                 sliding_, task_id);
  return RET_OK;
}
// Thread entry point for LiteBackendParallelLaunch: recovers the kernel from
// the opaque cdata pointer and forwards this worker's task id to Execute().
int DeconvDwFp16Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) {
  auto deconv_dw_fp16 = reinterpret_cast<DeconvolutionDepthwiseFp16CPUKernel *>(cdata);
  auto ret = deconv_dw_fp16->Execute(task_id);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "DeconvolutionDepthwiseFp16Run error task_id[" << task_id << "] error_code[" << ret << "]";
    return RET_ERROR;
  }
  return RET_OK;
}
int DeconvolutionDepthwiseFp16CPUKernel::Run() {
  // Depthwise requires matching channel counts (one filter per channel).
  if (conv_param_->input_channel_ != conv_param_->output_channel_) {
    MS_LOG(ERROR) << "Only support input channel equals output channel.";
    return RET_ERROR;
  }
  auto input_tensor = inputs_.at(kInputIndex);
  auto input_addr = reinterpret_cast<float *>(input_tensor->Data());
  // Pack input: fp32 NHWC -> fp16 NHWC8.
  // NOTE(review): after InitSlideParam() the conv_param_ "input_*" fields hold
  // the OUTPUT tensor's dimensions (swapped for deconv), yet they are used here
  // to pack the real input tensor — verify these are the intended dimensions.
  PackNHWCFp32ToNHWC8Fp16(input_addr, packed_input_, conv_param_->input_batch_,
                          conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_);
  auto ret = LiteBackendParallelLaunch(DeconvDwFp16Run, this, conv_param_->thread_num_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "DeconvDwFp16Run error: error_code[" << ret << "]";
    return RET_ERROR;
  }
  // Unpack result: fp16 NHWC8 -> fp32 NHWC into the output tensor.
  auto output_addr = reinterpret_cast<float *>(outputs_.at(kOutputIndex)->Data());
  PackNHWC8Fp16ToNHWCFp32(packed_output_, output_addr, conv_param_->output_batch_,
                          conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_);
  return RET_OK;
}
| } // namespace mindspore::kernel | |||
| @@ -0,0 +1,58 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_DECONVOLUTION_DEPTHWISE_FP16_H_ | |||
| #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_DECONVOLUTION_DEPTHWISE_FP16_H_ | |||
| #include <vector> | |||
| #include "src/lite_kernel.h" | |||
| #include "src/runtime/kernel/arm/base/convolution_base.h" | |||
| #include "src/runtime/kernel/arm/opclib/fp16/conv_depthwise_fp16.h" | |||
| namespace mindspore::kernel { | |||
| class DeconvolutionDepthwiseFp16CPUKernel : public ConvolutionBaseCPUKernel { | |||
| public: | |||
| DeconvolutionDepthwiseFp16CPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs, | |||
| const std::vector<lite::tensor::Tensor *> &outputs, const Context *ctx) | |||
| : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx) {} | |||
| ~DeconvolutionDepthwiseFp16CPUKernel() override { | |||
| delete sliding_; | |||
| free(packed_weight_); | |||
| if (need_align_) { | |||
| free(packed_input_); | |||
| free(packed_output_); | |||
| } | |||
| }; | |||
| int Init() override; | |||
| int ReSize() override; | |||
| int Run() override; | |||
| int InitBuffer(); | |||
| int InitWeightBias(); | |||
| int InitSlideParam(); | |||
| int Execute(int task_id); | |||
| private: | |||
| SlidingWindowParam *sliding_; | |||
| float16_t *packed_weight_; | |||
| float16_t *packed_input_; | |||
| float16_t *packed_output_; | |||
| bool need_align_ = false; | |||
| }; | |||
| } // namespace mindspore::kernel | |||
| #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_DECONVOLUTION_DEPTHWISE_FP16_H_ | |||
| @@ -32,8 +32,8 @@ int ConvolutionDepthwiseCPUKernel::Init() { | |||
| ConvolutionBaseCPUKernel::Init(); | |||
| // init sliding window param | |||
| sliding = new SlidingWindowParam; | |||
| InitSlidingParam(sliding, conv_param_, C4NUM); | |||
| sliding_ = new SlidingWindowParam; | |||
| InitSlidingParam(sliding_, conv_param_, C4NUM); | |||
| // pack input function: convert_func_ | |||
| auto input_tensor = inputs_[kInputIndex]; | |||
| @@ -97,7 +97,7 @@ int ConvolutionDepthwiseCPUKernel::ReSize() { | |||
| int ConvolutionDepthwiseCPUKernel::Execute(int task_id) { | |||
| ConvDwC4Fp32(packed_output_, packed_input_, packed_weight_, reinterpret_cast<float *>(bias_data_), conv_param_, | |||
| sliding, task_id); | |||
| sliding_, task_id); | |||
| return RET_OK; | |||
| } | |||
| @@ -29,7 +29,7 @@ class ConvolutionDepthwiseCPUKernel : public ConvolutionBaseCPUKernel { | |||
| const std::vector<lite::tensor::Tensor *> &outputs, const Context *ctx) | |||
| : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx) {} | |||
| ~ConvolutionDepthwiseCPUKernel() override { | |||
| delete sliding; | |||
| delete sliding_; | |||
| free(packed_weight_); | |||
| if (convert_func_ != nullptr) { | |||
| free(packed_input_); | |||
| @@ -46,7 +46,7 @@ class ConvolutionDepthwiseCPUKernel : public ConvolutionBaseCPUKernel { | |||
| int Execute(int task_id); | |||
| private: | |||
| SlidingWindowParam *sliding; | |||
| SlidingWindowParam *sliding_; | |||
| float *packed_weight_; | |||
| float *packed_input_; | |||
| float *packed_output_; | |||
| @@ -38,8 +38,8 @@ int DeconvolutionDepthwiseCPUKernel::InitSlideParam() { | |||
| conv_param_->output_channel_ = inputs_.front()->shape().at(kNHWC_C); | |||
| // init sliding window param | |||
| sliding = new SlidingWindowParam; | |||
| InitSlidingParam(sliding, conv_param_, C4NUM); | |||
| sliding_ = new SlidingWindowParam; | |||
| InitSlidingParam(sliding_, conv_param_, C4NUM); | |||
| return RET_OK; | |||
| } | |||
| @@ -110,7 +110,7 @@ int DeconvolutionDepthwiseCPUKernel::ReSize() { | |||
| int DeconvolutionDepthwiseCPUKernel::DoExcute(int task_id) { | |||
| DeconvDwC4Fp32(packed_output_, packed_input_, packed_weight_, reinterpret_cast<float *>(bias_data_), conv_param_, | |||
| sliding, task_id); | |||
| sliding_, task_id); | |||
| return RET_OK; | |||
| } | |||
| @@ -29,7 +29,7 @@ class DeconvolutionDepthwiseCPUKernel : public ConvolutionBaseCPUKernel { | |||
| const std::vector<lite::tensor::Tensor *> &outputs, const Context *ctx) | |||
| : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx) {} | |||
| ~DeconvolutionDepthwiseCPUKernel() override { | |||
| delete sliding; | |||
| delete sliding_; | |||
| free(packed_weight_); | |||
| free(packed_input_); | |||
| free(packed_output_); | |||
| @@ -43,7 +43,7 @@ class DeconvolutionDepthwiseCPUKernel : public ConvolutionBaseCPUKernel { | |||
| int DoExcute(int task_id); | |||
| private: | |||
| SlidingWindowParam *sliding; | |||
| SlidingWindowParam *sliding_; | |||
| float *packed_weight_; | |||
| float *packed_input_; | |||
| float *packed_output_; | |||
| @@ -0,0 +1,302 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "src/runtime/kernel/arm/opclib/fp16/conv_depthwise_fp16.h" | |||
| #ifdef ENABLE_FP16 | |||
| #include <arm_neon.h> | |||
| /*conv depthwise fp16 begin*/ | |||
// Computes one border output pixel over 8 channels: multiply-accumulates the
// valid (clipped) portion of the kernel window, then applies bias + activation.
// Accumulates in place via FMA, so it assumes dst starts at zero (the callers'
// packed output buffer is memset before compute).
void DepthwiseBorderPixelFp16(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias,
                              int height, int width, int in_kh_step, int in_kw_step, int kernel_w, bool is_relu,
                              bool is_relu6) {
  const float16_t *src_kh = src;
  const float16_t *weight_kh = weight;
  for (int kh = 0; kh < height; kh++) {
    const float16_t *src_kw = src_kh;
    const float16_t *weight_kw = weight_kh;
    for (int kw = 0; kw < width; kw++) {
      float16x8_t src_8 = vld1q_f16(src_kw);
      float16x8_t weight_8 = vld1q_f16(weight_kw);
      float16x8_t dst_8 = vld1q_f16(dst);
      dst_8 = vfmaq_f16(dst_8, src_8, weight_8);  // dst += src * weight, 8 lanes
      vst1q_f16(dst, dst_8);
      src_kw += in_kw_step;
      weight_kw += C8NUM;  // weights are C8-blocked: 8 channels per tap
    }  // kernel_w loop
    src_kh += in_kh_step;
    weight_kh += kernel_w * C8NUM;
  }  // kernel_h loop
  // Post-process the 8 channels: add bias, then clamp per activation type.
  for (int c = 0; c < C8NUM; c++) {
    dst[c] += bias[c];
    dst[c] = (is_relu) ? (MSMAX(0, dst[c])) : (dst[c]);
    dst[c] = (is_relu6) ? (MSMIN(6, MSMAX(0, dst[c]))) : (dst[c]);
  }
}
// Computes a rectangular border region [top, bottom) x [left, right) of the
// output, where the kernel window may extend past the input edges. For each
// pixel it clips the window to the valid input range (start/end kh, kw) and
// delegates the clipped accumulation to DepthwiseBorderPixelFp16.
void DepthwiseBorderFp16(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias, int top,
                         int bottom, int left, int right, const ConvParameter *conv_param,
                         const SlidingWindowParam *sliding) {
  float16_t *dst_h = dst + top * sliding->out_h_step_;
  for (int oh = top; oh < bottom; oh++) {
    // Topmost input row touched by this output row's window (may be negative).
    int ih = oh * conv_param->stride_h_ - conv_param->pad_h_;
    int start_kh = MSMAX(0, UP_DIV(-ih, conv_param->dilation_h_));
    int end_kh = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->input_h_ - ih, conv_param->dilation_h_));
    const float16_t *src_h = src + ih * sliding->in_h_step_;
    float16_t *dst_kernel = dst_h + left * sliding->block_channel_;
    for (int ow = left; ow < right; ow++) {
      int iw = ow * conv_param->stride_w_ - conv_param->pad_w_;
      int start_kw = MSMAX(0, UP_DIV(-iw, conv_param->dilation_w_));
      int end_kw = MSMIN(conv_param->kernel_w_, UP_DIV(conv_param->input_w_ - iw, conv_param->dilation_w_));
      const float16_t *src_w = src_h + iw * sliding->block_channel_;
      // Advance src and weight to the first in-bounds tap of the window.
      const float16_t *src_kernel = src_w + start_kh * sliding->in_kh_step_ + start_kw * sliding->in_kw_step_;
      const float16_t *weight_kernel = weight + (start_kh * conv_param->kernel_w_ + start_kw) * C8NUM;
      DepthwiseBorderPixelFp16(dst_kernel, src_kernel, weight_kernel, bias, end_kh - start_kh, end_kw - start_kw,
                               sliding->in_kh_step_, sliding->in_kw_step_, conv_param->kernel_w_, conv_param->is_relu_,
                               conv_param->is_relu6_);
      dst_kernel += sliding->block_channel_;
    }  // width loop
    dst_h += sliding->out_h_step_;
  }  // height loop
}
// Computes the interior ("center") region of the output, where every kernel
// window lies fully inside the input so no bounds clipping is needed — the
// hot path. Accumulates via FMA (assumes dst starts at zero), then applies
// bias and activation per 8-channel block.
void DepthwiseCenterFp16(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias,
                         int height, int width, int kernel_h, int kernel_w, int out_h_step, int block_channel,
                         int in_sh_step, int in_sw_step, int in_kh_step, int in_kw_step, bool is_relu, bool is_relu6) {
  float16_t *dst_h = dst;
  const float16_t *src_h = src;
  for (int oh = 0; oh < height; oh++) {
    float16_t *dst_w = dst_h;
    const float16_t *src_w = src_h;
    for (int ow = 0; ow < width; ow++) {
      const float16_t *src_kh = src_w;
      const float16_t *weight_kh = weight;
      for (int kh = 0; kh < kernel_h; kh++) {
        const float16_t *src_kw = src_kh;
        const float16_t *weight_kw = weight_kh;
        for (int kw = 0; kw < kernel_w; kw++) {
          float16x8_t src_8 = vld1q_f16(src_kw);
          float16x8_t weight_8 = vld1q_f16(weight_kw);
          float16x8_t dst_8 = vld1q_f16(dst_w);
          dst_8 = vfmaq_f16(dst_8, src_8, weight_8);  // dst += src * weight, 8 lanes
          vst1q_f16(dst_w, dst_8);
          src_kw += in_kw_step;
          weight_kw += C8NUM;
        }  // kernel_w loop
        src_kh += in_kh_step;
        weight_kh += kernel_w * C8NUM;
      }  // kernel_h loop
      // Add bias, then clamp per activation type (relu / relu6).
      for (int c = 0; c < C8NUM; c++) {
        dst_w[c] += bias[c];
        dst_w[c] = (is_relu) ? (MSMAX(0, dst_w[c])) : (dst_w[c]);
        dst_w[c] = (is_relu6) ? (MSMIN(6, MSMAX(0, dst_w[c]))) : (dst_w[c]);
      }
      dst_w += block_channel;
      src_w += in_sw_step;
    }  // dst_width loop
    dst_h += out_h_step;
    src_h += in_sh_step;
  }  // dst_height loop
}
| // conv depthwise fp16: sliding window | |||
// Depthwise convolution, fp16, sliding-window algorithm on C8-packed data.
// Work is split across threads by C8 channel block: thread `task_id` handles
// blocks task_id, task_id + thread_num_, ... Each block computes the four
// padded border strips separately, then the unclipped center region.
// Output is produced in the packed (nchwc8-style) layout.
void ConvDwC8Fp16(float16_t *output_data, const float16_t *input_data, const float16_t *weight_data,
                  const float16_t *bias_data, const ConvParameter *conv_param, const SlidingWindowParam *sliding,
                  int task_id) {
  const float16_t *src = input_data;
  float16_t *dst = output_data;
  for (int b = 0; b < conv_param->output_batch_; b++) {
    // Strided channel-block loop = this thread's share of the work.
    for (int oc = task_id; oc < sliding->c_block_; oc += conv_param->thread_num_) {
      const float16_t *src_data = src + oc * C8NUM;
      float16_t *dst_data = dst + oc * C8NUM;
      const float16_t *weight = weight_data + oc * sliding->kernel_step_;
      const float16_t *bias = bias_data + oc * C8NUM;
      // Four border strips: top, bottom, left, right (windows may be clipped).
      DepthwiseBorderFp16(dst_data, src_data, weight, bias, 0, sliding->top_, 0, conv_param->output_w_, conv_param,
                          sliding);
      DepthwiseBorderFp16(dst_data, src_data, weight, bias, sliding->bottom_, conv_param->output_h_, 0,
                          conv_param->output_w_, conv_param, sliding);
      DepthwiseBorderFp16(dst_data, src_data, weight, bias, sliding->top_, sliding->bottom_, 0, sliding->left_,
                          conv_param, sliding);
      DepthwiseBorderFp16(dst_data, src_data, weight, bias, sliding->top_, sliding->bottom_, sliding->right_,
                          conv_param->output_w_, conv_param, sliding);
      // Center region exists only when the border strips don't cover everything.
      if (sliding->right_ > sliding->left_ && sliding->bottom_ > sliding->top_) {
        int in_h_start = sliding->top_ * conv_param->stride_h_ - conv_param->pad_h_;
        int in_w_start = sliding->left_ * conv_param->stride_w_ - conv_param->pad_w_;
        const float16_t *in_t = src_data + in_h_start * sliding->in_h_step_ + in_w_start * sliding->block_channel_;
        float16_t *out_t = dst_data + sliding->top_ * sliding->out_h_step_ + sliding->left_ * sliding->block_channel_;
        DepthwiseCenterFp16(out_t, in_t, weight, bias, sliding->bottom_ - sliding->top_,
                            sliding->right_ - sliding->left_, conv_param->kernel_h_, conv_param->kernel_w_,
                            sliding->out_h_step_, sliding->block_channel_, sliding->in_sh_step_, sliding->in_sw_step_,
                            sliding->in_kh_step_, sliding->in_kw_step_, conv_param->is_relu_, conv_param->is_relu6_);
      }
    }  // output C8 loop
    src += sliding->in_step_;
    dst += sliding->out_step_;
  }  // batch loop
  // output nchwc8
}
| /*conv depthwise fp16 end*/ | |||
| /*deconv depthwise fp16 begin*/ | |||
| void DeconvDepthwiseBorderPixelFp16(float16_t *dst, const float16_t *src, const float16_t *weight, int height, | |||
| int width, int in_kh_step, int in_kw_step, int kernel_w) { | |||
| float16_t *dst_kh = dst; | |||
| const float16_t *weight_kh = weight; | |||
| for (int kh = 0; kh < height; kh++) { | |||
| float16_t *dst_kw = dst_kh; | |||
| const float16_t *weight_kw = weight_kh; | |||
| for (int kw = 0; kw < width; kw++) { | |||
| float16x8_t src_8 = vld1q_f16(src); | |||
| float16x8_t weight_8 = vld1q_f16(weight_kw); | |||
| float16x8_t dst_8 = vld1q_f16(dst_kw); | |||
| dst_8 = vfmaq_f16(dst_8, src_8, weight_8); | |||
| vst1q_f16(dst_kw, dst_8); | |||
| dst_kw += in_kw_step; | |||
| weight_kw += C8NUM; | |||
| } // kernel_w loop | |||
| dst_kh += in_kh_step; | |||
| weight_kh += kernel_w * C8NUM; | |||
| } // kernel_h loop | |||
| } | |||
// Handles the border input rows [top, bottom) x cols [left, right) of a deconv:
// each input pixel is scattered (accumulated) into its output window, with
// kernel taps that would land outside the output plane clipped away.
// NOTE(review): for deconv the sliding "in_*" steps address the OUTPUT plane and
// "out_*" the input — roles appear swapped relative to forward conv; confirm.
void DeconvDepthwiseBorderFp16(float16_t *dst, const float16_t *src, const float16_t *weight, int top, int bottom,
                               int left, int right, const ConvParameter *conv_param,
                               const SlidingWindowParam *sliding) {
  const float16_t *src_h = src + top * sliding->out_h_step_;
  for (int ih = top; ih < bottom; ih++) {
    // top-most output row touched by input row ih (may be negative)
    int oh = ih * conv_param->stride_h_ - conv_param->pad_h_;
    // clip the kernel tap range so every written output row stays in [0, output_h_)
    int start_kh = MSMAX(0, UP_DIV(-oh, conv_param->dilation_h_));
    int end_kh = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->output_h_ - oh, conv_param->dilation_h_));
    float16_t *dst_h = dst + oh * sliding->in_h_step_;
    const float16_t *src_kernel = src_h + left * sliding->block_channel_;
    for (int iw = left; iw < right; iw++) {
      // left-most output column touched by input column iw (may be negative)
      int ow = iw * conv_param->stride_w_ - conv_param->pad_w_;
      int start_kw = MSMAX(0, UP_DIV(-ow, conv_param->dilation_w_));
      int end_kw = MSMIN(conv_param->kernel_w_, UP_DIV(conv_param->output_w_ - ow, conv_param->dilation_w_));
      float16_t *dst_w = dst_h + ow * sliding->block_channel_;
      // advance past the clipped leading taps in both the packed weights and the output
      const float16_t *weight_kernel = weight + (start_kh * conv_param->kernel_w_ + start_kw) * C8NUM;
      float16_t *dst_kernel = dst_w + start_kh * sliding->in_kh_step_ + start_kw * sliding->in_kw_step_;
      DeconvDepthwiseBorderPixelFp16(dst_kernel, src_kernel, weight_kernel, end_kh - start_kh, end_kw - start_kw,
                                     sliding->in_kh_step_, sliding->in_kw_step_, conv_param->kernel_w_);
      src_kernel += sliding->block_channel_;
    }  // width loop
    src_h += sliding->out_h_step_;
  }  // height loop
}
| void DeconvDepthwiseCenterFp16(float16_t *dst, const float16_t *src, const float16_t *weight, int height, int width, | |||
| int kernel_h, int kernel_w, int out_h_step, int block_channel, int in_sh_step, | |||
| int in_sw_step, int in_kh_step, int in_kw_step) { | |||
| float16_t *dst_h = dst; | |||
| const float16_t *src_h = src; | |||
| for (int oh = 0; oh < height; oh++) { | |||
| float16_t *dst_w = dst_h; | |||
| const float16_t *src_w = src_h; | |||
| for (int ow = 0; ow < width; ow++) { | |||
| float16_t *dst_kh = dst_w; | |||
| const float16_t *weight_kh = weight; | |||
| for (int kh = 0; kh < kernel_h; kh++) { | |||
| float16_t *dst_kw = dst_kh; | |||
| const float16_t *weight_kw = weight_kh; | |||
| for (int kw = 0; kw < kernel_w; kw++) { | |||
| float16x8_t src_8 = vld1q_f16(src_w); | |||
| float16x8_t weight_8 = vld1q_f16(weight_kw); | |||
| float16x8_t dst_8 = vld1q_f16(dst_kw); | |||
| dst_8 = vfmaq_f16(dst_8, src_8, weight_8); | |||
| vst1q_f16(dst_kw, dst_8); | |||
| dst_kw += in_kw_step; | |||
| weight_kw += C8NUM; | |||
| } // kernel_w loop | |||
| dst_kh += in_kh_step; | |||
| weight_kh += kernel_w * C8NUM; | |||
| } // kernel_h loop | |||
| dst_w += in_sw_step; | |||
| src_w += block_channel; | |||
| } // dst_width loop | |||
| dst_h += in_sh_step; | |||
| src_h += out_h_step; | |||
| } // dst_height loop | |||
| } | |||
| void DeconvDepthwisePostFuncFp16(float16_t *dst, const float16_t *bias, int block_channel, | |||
| const ConvParameter *conv_param) { | |||
| float16_t *dst_k = dst; | |||
| for (int k = 0; k < conv_param->output_h_ * conv_param->output_w_; k++) { | |||
| for (int c = 0; c < C8NUM; c++) { | |||
| dst_k[c] += bias[c]; | |||
| dst_k[c] = (conv_param->is_relu_) ? (MSMAX(0, dst_k[c])) : (dst_k[c]); | |||
| dst_k[c] = (conv_param->is_relu6_) ? (MSMIN(6, MSMAX(0, dst_k[c]))) : (dst_k[c]); | |||
| } | |||
| dst_k += block_channel; | |||
| } | |||
| } | |||
// deconv depthwise fp16: sliding window
// Depthwise transposed convolution, fp16, NC8HW8 layout. Threads split the work
// by interleaving the C8 channel blocks (oc += thread_num_).
// NOTE(review): the SlidingWindowParam "in_*" steps address the OUTPUT plane
// and "out_*" the input — roles appear swapped relative to forward conv; confirm.
void DeconvDwC8Fp16(float16_t *output_data, const float16_t *input_data, const float16_t *weight_data,
                    const float16_t *bias_data, const ConvParameter *conv_param, const SlidingWindowParam *sliding,
                    int task_id) {
  const float16_t *src = input_data;
  float16_t *dst = output_data;
  for (int b = 0; b < conv_param->output_batch_; b++) {
    for (int oc = task_id; oc < sliding->c_block_; oc += conv_param->thread_num_) {
      const float16_t *src_data = src + oc * C8NUM;
      float16_t *dst_data = dst + oc * C8NUM;
      const float16_t *weight = weight_data + oc * sliding->kernel_step_;
      const float16_t *bias = bias_data + oc * C8NUM;
      // scatter the four border strips of the input (clipped kernel windows)
      DeconvDepthwiseBorderFp16(dst_data, src_data, weight, 0, sliding->top_, 0, conv_param->input_w_, conv_param,
                                sliding);
      DeconvDepthwiseBorderFp16(dst_data, src_data, weight, sliding->bottom_, conv_param->input_h_, 0,
                                conv_param->input_w_, conv_param, sliding);
      DeconvDepthwiseBorderFp16(dst_data, src_data, weight, sliding->top_, sliding->bottom_, 0, sliding->left_,
                                conv_param, sliding);
      DeconvDepthwiseBorderFp16(dst_data, src_data, weight, sliding->top_, sliding->bottom_, sliding->right_,
                                conv_param->input_w_, conv_param, sliding);
      // center region: every kernel tap lands inside the output, no clipping
      if (sliding->right_ > sliding->left_ && sliding->bottom_ > sliding->top_) {
        int oh_h_start = sliding->top_ * conv_param->stride_h_ - conv_param->pad_h_;
        int oh_w_start = sliding->left_ * conv_param->stride_w_ - conv_param->pad_w_;
        float16_t *out_t = dst_data + oh_h_start * sliding->in_h_step_ + oh_w_start * sliding->block_channel_;
        const float16_t *in_t =
          src_data + sliding->top_ * sliding->out_h_step_ + sliding->left_ * sliding->block_channel_;
        DeconvDepthwiseCenterFp16(out_t, in_t, weight, sliding->bottom_ - sliding->top_,
                                  sliding->right_ - sliding->left_, conv_param->kernel_h_, conv_param->kernel_w_,
                                  sliding->out_h_step_, sliding->block_channel_, sliding->in_sh_step_,
                                  sliding->in_sw_step_, sliding->in_kh_step_, sliding->in_kw_step_);
      }
      // bias + activation applied once per channel block, after all accumulation
      DeconvDepthwisePostFuncFp16(dst_data, bias, sliding->block_channel_, conv_param);
    }  // output C8 loop
    src += sliding->in_step_;
    dst += sliding->out_step_;
  }  // batch loop
  // output layout: NC8HW8
}
| /*deconv depthwise fp16 end*/ | |||
| #endif | |||
| @@ -0,0 +1,33 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_FP16_CONV_DEPTHWISE_FP16_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_FP16_CONV_DEPTHWISE_FP16_H_

#include "src/runtime/kernel/arm/opclib/conv_parameter.h"
#include "src/runtime/kernel/arm/opclib/fp32/conv_depthwise.h"

#ifdef ENABLE_FP16
// Depthwise convolution, fp16, NC8HW8 layout, sliding-window algorithm.
// task_id selects which interleaved subset of C8 channel blocks this thread computes.
void ConvDwC8Fp16(float16_t *output_data, const float16_t *input_data, const float16_t *weight_data,
                  const float16_t *bias_data, const ConvParameter *conv_param, const SlidingWindowParam *sliding,
                  int task_id);

// Depthwise transposed (de-)convolution, fp16, NC8HW8 layout, sliding-window algorithm.
void DeconvDwC8Fp16(float16_t *output_data, const float16_t *input_data, const float16_t *weight_data,
                    const float16_t *bias_data, const ConvParameter *conv_param, const SlidingWindowParam *sliding,
                    int task_id);
#endif

#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_FP16_CONV_DEPTHWISE_FP16_H_
| @@ -14,8 +14,8 @@ | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_P32_CONV_DEPTHWISE_H_ | |||
| #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_P32_CONV_DEPTHWISE_H_ | |||
| #ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_FP32_CONV_DEPTHWISE_H_ | |||
| #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_FP32_CONV_DEPTHWISE_H_ | |||
| #include "src/runtime/kernel/arm/opclib/conv_parameter.h" | |||
| @@ -45,5 +45,5 @@ void ConvDwC4Fp32(float *output_data, const float *input_data, const float *weig | |||
| void DeconvDwC4Fp32(float *output_data, const float *input_data, const float *weight_data, const float *bias_data, | |||
| const ConvParameter *conv_param, const SlidingWindowParam *sliding, int task_id); | |||
| #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_P32_CONV_DEPTHWISE_H_ | |||
| #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_FP32_CONV_DEPTHWISE_H_ | |||
| @@ -292,6 +292,55 @@ void PackNC4HW4ToNCHWFp16(const void *src, void *dst, int batch, int plane, int | |||
| } | |||
| } | |||
| } | |||
| void PackNCHWFp32ToNC8HW8Fp16(float *src, float16_t *dst, int batch, int plane, int channel) { | |||
| int c8 = UP_DIV(channel, C8NUM); | |||
| for (int b = 0; b < batch; b++) { | |||
| int src_offset = b * plane * channel; | |||
| int dst_offset = b * plane * c8 * C8NUM; | |||
| for (int c = 0; c < channel; c++) { | |||
| int c8_block_num = c / C8NUM; | |||
| int c8_block_rem = c % C8NUM; | |||
| int src_c_offset = src_offset + c * plane; | |||
| int dst_c_offset = dst_offset + c8_block_num * plane * C8NUM; | |||
| for (int k = 0; k < plane; k++) { | |||
| int src_kernel_offset = src_c_offset + k; | |||
| int dst_kernel_offset = dst_c_offset + C8NUM * k + c8_block_rem; | |||
| (dst + dst_kernel_offset)[0] = (float16_t)(src + src_kernel_offset)[0]; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| void PackNHWCFp32ToNHWC8Fp16(float *src, float16_t *dst, int batch, int plane, int channel) { | |||
| int c8 = UP_DIV(channel, C8NUM); | |||
| int nhwc8_batch_unit_offset = c8 * C8NUM * plane; | |||
| int nhwc8_batch_offset = 0; | |||
| for (int b = 0; b < batch; b++) { | |||
| int batch_offset = b * channel * plane; | |||
| for (int i = 0; i < plane; i++) { | |||
| for (int c = 0; c < channel; c++) { | |||
| (dst + nhwc8_batch_offset + i * c8 * C8NUM)[c] = (float16_t)(src + batch_offset + i * channel)[c]; | |||
| } | |||
| } | |||
| nhwc8_batch_offset += nhwc8_batch_unit_offset; | |||
| } | |||
| } | |||
| void PackNHWC8Fp16ToNHWCFp32(float16_t *src, float *dst, int batch, int plane, int channel) { | |||
| int c8 = UP_DIV(channel, C8NUM); | |||
| int nhwc_batch_unit_offset = channel * plane; | |||
| int nhwc_batch_offset = 0; | |||
| for (int b = 0; b < batch; b++) { | |||
| int batch_offset = b * c8 * C8NUM * plane; | |||
| for (int i = 0; i < plane; i++) { | |||
| for (int c = 0; c < channel; c++) { | |||
| (dst + nhwc_batch_offset + i * channel)[c] = (float)(src + batch_offset + i * c8 * C8NUM)[c]; | |||
| } | |||
| } | |||
| nhwc_batch_offset += nhwc_batch_unit_offset; | |||
| } | |||
| } | |||
| #endif | |||
| void PackWeightFp32(float *weight_data, ConvParameter *conv_param, float *packed_weight) { | |||
| @@ -1070,7 +1119,7 @@ void PackDepthwiseInt8Input(const int8_t *src, int16_t *dst, const ConvParameter | |||
| auto src_k = src_b + k * conv_param->input_channel_; | |||
| auto dst_k = dst_b + k * ic4 * C4NUM; | |||
| for (int c = 0; c < conv_param->input_channel_; c++) { | |||
| dst_k[c] = (int16_t)((int32_t)(src_k[c]) - input_zp); | |||
| dst_k[c] = (int16_t)(src_k[c] - input_zp); | |||
| } | |||
| } | |||
| } | |||
| @@ -1087,7 +1136,7 @@ void PackDepthwiseInt8Weight(const int8_t *origin_weight, int16_t *packed_weight | |||
| for (int k = 0; k < unit; k++) { | |||
| auto src_kernel = src_c + k; | |||
| auto dst_kernel = dst_c + C4NUM * k + c4_block_rem; | |||
| *dst_kernel = (int16_t)((int32_t)(src_kernel[0]) - weight_zp); | |||
| *dst_kernel = (int16_t)(src_kernel[0] - weight_zp); | |||
| } | |||
| } | |||
| } | |||
| @@ -46,6 +46,14 @@ void PackNC4HW4ToNHWC4Fp16(const void *src, void *dst, int batch, int plane, int | |||
| void PackNC4HW4ToNHWCFp16(const void *src, void *dst, int batch, int plane, int channel); | |||
| void PackNC4HW4ToNCHWFp16(const void *src, void *dst, int batch, int plane, int channel); | |||
| void PackNC8HW8ToNHWCFp16(const void *src, void *dst, int batch, int plane, int channel); | |||
| void PackNCHWFp32ToNC8HW8Fp16(float *src, float16_t *dst, int batch, int plane, int channel); | |||
| void PackNHWCFp32ToNHWC8Fp16(float *src, float16_t *dst, int batch, int plane, int channel); | |||
| void PackNHWC8Fp16ToNHWCFp32(float16_t *src, float *dst, int batch, int plane, int channel); | |||
| #endif | |||
| void Im2ColPackUnitFp32(const float *input_data, ConvParameter *conv_param, float *packed_input, int real_cal_num, | |||
| int block_index); | |||
| @@ -163,4 +171,3 @@ inline void C4UnpackToHwcInt8(int8_t *src_ptr, int8_t *dst_ptr, int channel, int | |||
| } | |||
| #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_PACK_H_ | |||
| @@ -72,7 +72,7 @@ else() | |||
| ) | |||
| endif() | |||
| ### cpu kernel | |||
| file(GLOB_RECURSE KERNEL_OP_SRC | |||
| file(GLOB KERNEL_OP_SRC | |||
| ${LITE_DIR}/src/runtime/kernel/arm/base/*.cc | |||
| ${LITE_DIR}/src/runtime/kernel/arm/fp32/*.cc | |||
| ${LITE_DIR}/src/runtime/kernel/arm/int8/*.cc | |||
| @@ -103,10 +103,13 @@ if (PLATFORM_ARM32) | |||
| ) | |||
| endif() | |||
| if (ENABLE_FP16) | |||
| file(GLOB KERNEL_OP_FP16_SRC | |||
| ${LITE_DIR}/src/runtime/kernel/arm/fp16/*.cc | |||
| ${LITE_DIR}/src/runtime/kernel/arm/opclib/fp16/*.cc | |||
| ) | |||
| set(KERNEL_OP_SRC | |||
| ${KERNEL_OP_SRC} | |||
| ${LITE_DIR}/src/runtime/kernel/arm/fp16/convolution_fp16.cc | |||
| ${LITE_DIR}/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.cc | |||
| ${KERNEL_OP_FP16_SRC} | |||
| ) | |||
| endif () | |||
| ### gpu kernel | |||