From: @sunsuodong
Reviewed-by: @ddwsky, @zhanghaibo5
Signed-off-by: @zhanghaibo5
tags/v1.1.0
@@ -23,7 +23,7 @@ void Calculate_Data(const float *input_data, float *output_data, int num, EluPar
 }
 int Elu(const float *input_data, float *output_data, EluParameter *parameter, int task_id) {
-  for (size_t i = task_id; i < parameter->in_size_; i += parameter->thread_num_) {
+  for (size_t i = task_id; i < parameter->in_size_; i += parameter->op_parameter_.thread_num_) {
     Calculate_Data(input_data, output_data, i, parameter);
   }
   return NNACL_OK;
@@ -22,7 +22,6 @@
 typedef struct EluParameter {
   OpParameter op_parameter_;
   float alpha_;
-  int thread_num_;
   int in_size_;
 } EluParameter;
@@ -47,7 +47,7 @@ int CopyData(float *input_data, int *ids, float *output_data, int num, Embedding
 }
 int EmbeddingLookup(float *input_data, int *ids, float *output_data, EmbeddingLookupParameter *parameter, int task_id) {
-  for (size_t i = task_id; i < parameter->ids_size_; i += parameter->thread_num) {
+  for (size_t i = task_id; i < parameter->ids_size_; i += parameter->op_parameter_.thread_num_) {
     int ret = CopyData(input_data, ids, output_data, i, parameter);
     if (ret != NNACL_OK) {
       return ret;
@@ -26,7 +26,6 @@ typedef struct EmbeddingLookupParameter {
   int ids_size_;
   int layer_size_;
   int layer_num_;
-  int thread_num;
 } EmbeddingLookupParameter;
 #ifdef __cplusplus
@@ -40,7 +40,7 @@ ConvolutionDepthwiseSWFp16CPUKernel::~ConvolutionDepthwiseSWFp16CPUKernel() {
   }
 }
-int ConvolutionDepthwiseSWFp16CPUKernel::InitBuffer() {
+int ConvolutionDepthwiseSWFp16CPUKernel::InitPackedInputOutput() {
   if (conv_param_->input_channel_ % C8NUM != 0) {
     need_align_ = true;
     int C8 = UP_DIV(conv_param_->input_channel_, C8NUM);
@@ -142,19 +142,17 @@ static int ConvDwSWFp16Run(void *cdata, int task_id) {
 }
 int ConvolutionDepthwiseSWFp16CPUKernel::Run() {
-  auto ret = InitBuffer();
+  auto ret = InitPackedInputOutput();
   if (ret != 0) {
-    MS_LOG(ERROR) << "Convolution depthwise fp16 InitBuffer failed.";
-    context_->allocator->Free(packed_input_);
-    context_->allocator->Free(packed_output_);
+    MS_LOG(ERROR) << "Convolution depthwise fp16 InitPackedInputOutput failed.";
+    FreePackedInputOutput();
     return ret;
   }
   ret = ConvolutionBaseFP16CPUKernel::GetExecuteTensor();
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "Get Execute tensor failed.";
-    context_->allocator->Free(packed_input_);
-    context_->allocator->Free(packed_output_);
+    FreePackedInputOutput();
     ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
     return ret;
   }
@@ -173,11 +171,19 @@ int ConvolutionDepthwiseSWFp16CPUKernel::Run() {
   if (need_align_) {
     PackNHWC8ToNHWCFp16(packed_output_, execute_output_, conv_param_->output_batch_,
                         conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_);
-    context_->allocator->Free(packed_input_);
-    context_->allocator->Free(packed_output_);
   }
   ConvolutionBaseFP16CPUKernel::IfCastOutput();
   ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
-  return RET_OK;
+  FreePackedInputOutput();
+  return ret;
 }
+void ConvolutionDepthwiseSWFp16CPUKernel::FreePackedInputOutput() {
+  if (need_align_) {
+    context_->allocator->Free(packed_input_);
+    context_->allocator->Free(packed_output_);
+    packed_input_ = nullptr;
+    packed_output_ = nullptr;
+  }
+}
 } // namespace mindspore::kernel
@@ -45,11 +45,12 @@ class ConvolutionDepthwiseSWFp16CPUKernel : public ConvolutionBaseFP16CPUKernel
   int ReSize() override;
   int Run() override;
-  int InitBuffer();
+  int InitPackedInputOutput();
   int InitWeightBias();
   int Execute(int task_id);
  private:
+  void FreePackedInputOutput();
   SlidingWindowParam *sliding_ = nullptr;
   float16_t *packed_weight_ = nullptr;
   float16_t *packed_input_ = nullptr;
@@ -53,7 +53,7 @@ int DeconvolutionDepthwiseFp16CPUKernel::InitSlideParam() {
   return RET_OK;
 }
-int DeconvolutionDepthwiseFp16CPUKernel::InitBuffer() {
+int DeconvolutionDepthwiseFp16CPUKernel::InitPackedInputOutput() {
   if (conv_param_->input_channel_ % C8NUM != 0) {
     need_align_ = true;
     int C8 = UP_DIV(conv_param_->input_channel_, C8NUM);
@@ -156,19 +156,17 @@ int DeconvolutionDepthwiseFp16CPUKernel::Run() {
     MS_LOG(ERROR) << "Only support input channel equals output channel.";
     return RET_ERROR;
   }
-  auto ret = InitBuffer();
+  auto ret = InitPackedInputOutput();
   if (ret != 0) {
-    MS_LOG(ERROR) << "Deconvolution depthwise fp16 InitBuffer failed.";
-    context_->allocator->Free(packed_input_);
-    context_->allocator->Free(packed_output_);
+    MS_LOG(ERROR) << "Deconvolution depthwise fp16 InitPackedInputOutput failed.";
+    FreePackedInputOutput();
     return RET_ERROR;
   }
   ret = ConvolutionBaseFP16CPUKernel::GetExecuteTensor();
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "Get Execute tensor failed.";
-    context_->allocator->Free(packed_input_);
-    context_->allocator->Free(packed_output_);
+    FreePackedInputOutput();
     ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
     return ret;
   }
@@ -191,14 +189,22 @@ int DeconvolutionDepthwiseFp16CPUKernel::Run() {
   if (need_align_) {
     PackNHWC8ToNHWCFp16(packed_output_, execute_output_, conv_param_->output_batch_,
                         conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_);
-    context_->allocator->Free(packed_input_);
-    context_->allocator->Free(packed_output_);
   }
   ConvolutionBaseFP16CPUKernel::IfCastOutput();
   ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
+  FreePackedInputOutput();
   return ret;
 }
+void DeconvolutionDepthwiseFp16CPUKernel::FreePackedInputOutput() {
+  if (need_align_) {
+    context_->allocator->Free(packed_input_);
+    context_->allocator->Free(packed_output_);
+    packed_input_ = nullptr;
+    packed_output_ = nullptr;
+  }
+}
 kernel::LiteKernel *CpuDeconvDwFp16KernelCreator(const std::vector<lite::Tensor *> &inputs,
                                                  const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
                                                  const lite::InnerContext *ctx, const kernel::KernelKey &desc,
@@ -46,12 +46,13 @@ class DeconvolutionDepthwiseFp16CPUKernel : public ConvolutionBaseFP16CPUKernel
   int ReSize() override;
   int Run() override;
-  int InitBuffer();
+  int InitPackedInputOutput();
   int InitWeightBias();
   int InitSlideParam();
   int Execute(int task_id);
  private:
+  void FreePackedInputOutput();
   SlidingWindowParam *sliding_ = nullptr;
   float16_t *packed_weight_ = nullptr;
   float16_t *packed_input_ = nullptr;
@@ -183,7 +183,7 @@ int DeConvolutionFp16CPUKernel::Run() {
   int error_code = InitRunBuf();
   if (error_code != RET_OK) {
-    MS_LOG(ERROR) << "deconv fp32 InitRunBuf error! error_code[" << error_code << "]";
+    MS_LOG(ERROR) << "deconv fp16 InitRunBuf error! error_code[" << error_code << "]";
     ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
     FreeRunBuf();
     return RET_ERROR;
@@ -197,7 +197,7 @@ int DeConvolutionFp16CPUKernel::Run() {
     error_code = ParallelLaunch(this->context_->thread_pool_, DeConvFp16Run, this, thread_count_);
     if (error_code != RET_OK) {
-      MS_LOG(ERROR) << "deconv fp32 run error! error_code[" << error_code << "]";
+      MS_LOG(ERROR) << "deconv fp16 run error! error_code[" << error_code << "]";
     }
   }
@@ -70,7 +70,7 @@ int ConvolutionDepthwiseSWCPUKernel::InitWeightBias() {
   return RET_OK;
 }
-int ConvolutionDepthwiseSWCPUKernel::InitBuffer() {
+int ConvolutionDepthwiseSWCPUKernel::InitPackedInputOutput() {
   if (conv_param_->input_channel_ % C4NUM != 0) {
     need_align_ = true;
     int IC4 = UP_DIV(conv_param_->input_channel_, C4NUM);
@@ -134,9 +134,10 @@ int ConvDwSWRun(void *cdata, int task_id) {
 }
 int ConvolutionDepthwiseSWCPUKernel::Run() {
-  auto ret = InitBuffer();
+  auto ret = InitPackedInputOutput();
   if (ret != 0) {
-    MS_LOG(ERROR) << "Convolution depthwise fp32 InitBuffer failed.";
+    MS_LOG(ERROR) << "Convolution depthwise fp32 InitPackedInputOutput failed.";
+    FreePackedInputOutput();
     return RET_ERROR;
   }
   auto input_tensor = in_tensors_.at(kInputIndex);
@@ -159,16 +160,22 @@ int ConvolutionDepthwiseSWCPUKernel::Run() {
   ret = ParallelLaunch(this->context_->thread_pool_, ConvDwSWRun, this, conv_param_->thread_num_);
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "ConvDwSWRun error: error_code[" << ret << "]";
-    return RET_ERROR;
   }
   if (need_align_) {
     PackNHWC4ToNHWCFp32(packed_output_, output_ptr, conv_param_->output_batch_,
                         conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_);
+  }
+  FreePackedInputOutput();
+  return ret;
+}
+void ConvolutionDepthwiseSWCPUKernel::FreePackedInputOutput() {
+  if (need_align_) {
     context_->allocator->Free(packed_input_);
     context_->allocator->Free(packed_output_);
+    packed_input_ = nullptr;
+    packed_output_ = nullptr;
   }
-  return RET_OK;
 }
 } // namespace mindspore::kernel
@@ -35,11 +35,12 @@ class ConvolutionDepthwiseSWCPUKernel : public ConvolutionBaseCPUKernel {
   int ReSize() override;
   int Run() override;
-  int InitBuffer();
   int InitWeightBias();
   int Execute(int task_id);
  private:
+  int InitPackedInputOutput();
+  void FreePackedInputOutput();
   SlidingWindowParam *sliding_ = nullptr;
   float *packed_weight_ = nullptr;
   float *packed_input_ = nullptr;
@@ -146,21 +146,20 @@ int ConvolutionCPUKernel::Run() {
   auto ret = InitTmpBuffer();
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "Init tmp buffer failed.";
+    FreeTmpBuffer();
     return RET_ERROR;
   }
-  int error_code = ParallelLaunch(this->context_->thread_pool_, ConvolutionImpl, this, thread_count_);
-  if (error_code != RET_OK) {
-    MS_LOG(ERROR) << "conv error error_code[" << error_code << "]";
-    FreeTmpBuffer();
-    return RET_ERROR;
+  ret = ParallelLaunch(this->context_->thread_pool_, ConvolutionImpl, this, thread_count_);
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "conv error error_code[" << ret << "]";
   }
   FreeTmpBuffer();
-  return RET_OK;
+  return ret;
 }
 ConvParameter *CreateNewConvParameter(ConvParameter *parameter) {
-  auto conv_parameter = reinterpret_cast<ConvParameter *>(malloc(sizeof(ConvParameter)));
+  auto conv_parameter = new (std::nothrow) ConvParameter;
   if (conv_parameter == nullptr) {
     MS_LOG(ERROR) << "Malloc new conv parameter failed.";
     return nullptr;
@@ -222,17 +222,16 @@ int ConvolutionWinogradCPUKernel::Run() {
   auto ret = InitTmpBuffer();
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "Init tmp buffer failed.";
+    FreeTmpBuffer();
     return RET_ERROR;
   }
-  int error_code = ParallelLaunch(this->context_->thread_pool_, ConvolutionWinogradImpl, this, thread_count_);
-  if (error_code != RET_OK) {
-    MS_LOG(ERROR) << "conv winograd error error_code[" << error_code << "]";
-    FreeTmpBuffer();
-    return RET_ERROR;
+  ret = ParallelLaunch(this->context_->thread_pool_, ConvolutionWinogradImpl, this, thread_count_);
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "conv winograd error error_code[" << ret << "]";
   }
   FreeTmpBuffer();
-  return RET_OK;
+  return ret;
 }
 } // namespace mindspore::kernel
@@ -82,7 +82,7 @@ int DeconvolutionDepthwiseCPUKernel::InitWeightBias() {
   return RET_OK;
 }
-int DeconvolutionDepthwiseCPUKernel::InitBuffer() {
+int DeconvolutionDepthwiseCPUKernel::InitPackedInputOutput() {
   if (conv_param_->input_channel_ % C4NUM != 0) {
     need_align_ = true;
     int IC4 = UP_DIV(conv_param_->input_channel_, C4NUM);
@@ -151,9 +151,10 @@ int DeconvolutionDepthwiseCPUKernel::Run() {
     return RET_ERROR;
   }
-  auto ret = InitBuffer();
+  auto ret = InitPackedInputOutput();
   if (ret != 0) {
-    MS_LOG(ERROR) << "Deconvolution depthwise fp32 InitBuffer failed.ret: " << ret;
+    MS_LOG(ERROR) << "Deconvolution depthwise fp32 InitPackedInputOutput failed.ret: " << ret;
+    FreePackedInputOutput();
     return ret;
   }
@@ -176,16 +177,23 @@ int DeconvolutionDepthwiseCPUKernel::Run() {
   ret = ParallelLaunch(this->context_->thread_pool_, DeconvDwRun, this, conv_param_->thread_num_);
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "DeconvDwRun error: error_code[" << ret << "]";
-    return RET_ERROR;
   }
   if (need_align_) {
     PackNHWC4ToNHWCFp32(packed_output_, output_addr, conv_param_->output_batch_,
                         conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_);
+  }
+  FreePackedInputOutput();
+  return ret;
+}
+void DeconvolutionDepthwiseCPUKernel::FreePackedInputOutput() {
+  if (need_align_) {
     context_->allocator->Free(packed_input_);
     context_->allocator->Free(packed_output_);
+    packed_input_ = nullptr;
+    packed_output_ = nullptr;
   }
-  return RET_OK;
 }
 kernel::LiteKernel *CpuDeconvDwFp32KernelCreator(const std::vector<lite::Tensor *> &inputs,
@@ -36,11 +36,12 @@ class DeconvolutionDepthwiseCPUKernel : public ConvolutionBaseCPUKernel {
   int ReSize() override;
   int Run() override;
-  int InitBuffer();
   int InitWeightBias();
   int Execute(int task_id);
  private:
+  int InitPackedInputOutput();
+  void FreePackedInputOutput();
   SlidingWindowParam *sliding_ = nullptr;
   float *packed_weight_ = nullptr;
   float *packed_input_ = nullptr;
@@ -202,6 +202,7 @@ int DeConvolutionCPUKernel::Run() {
   int error_code = InitRunBuf();
   if (error_code != RET_OK) {
     MS_LOG(ERROR) << "deconv fp32 InitRunBuf error! error_code[" << error_code << "]";
+    FreeRunBuf();
     return error_code;
   }
@@ -218,6 +219,7 @@ int DeConvolutionCPUKernel::Run() {
     error_code = ParallelLaunch(this->context_->thread_pool_, DeConvFp32Run, this, thread_count_);
     if (error_code != RET_OK) {
       MS_LOG(ERROR) << "deconv fp32 run error! error_code[" << error_code << "]";
+      FreeRunBuf();
       return error_code;
     }
   }
@@ -390,6 +390,7 @@ int DeConvolutionWinogradCPUKernel::Run() {
   auto ret = InitRunBuf();
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "InitRunBuf fail!ret: " << ret;
+    FreeRunBuf();
     return ret;
   }
@@ -410,5 +411,4 @@ int DeConvolutionWinogradCPUKernel::Run() {
   FreeRunBuf();
   return RET_OK;
 }
 } // namespace mindspore::kernel
@@ -26,13 +26,9 @@ using mindspore::schema::PrimitiveType_Elu;
 namespace mindspore::kernel {
 int EluCPUKernel::Init() {
-  elu_parameter_ = reinterpret_cast<EluParameter *>(op_parameter_);
-  elu_parameter_->thread_num_ = thread_count_;
   if (!InferShapeDone()) {
     return RET_OK;
   }
   return ReSize();
 }
@@ -42,6 +38,8 @@ int EluCPUKernel::ReSize() {
 }
 int EluCPUKernel::DoExcute(int task_id) {
+  auto input_addr = reinterpret_cast<float *>(in_tensors_.front()->MutableData());
+  auto output_addr = reinterpret_cast<float *>(out_tensors_.front()->MutableData());
   Elu(input_addr, output_addr, elu_parameter_, task_id);
   return RET_OK;
 }
@@ -57,10 +55,7 @@ int EluRun(void *cdata, int task_id) {
 }
 int EluCPUKernel::Run() {
-  input_addr = reinterpret_cast<float *>(in_tensors_.front()->MutableData());
-  output_addr = reinterpret_cast<float *>(out_tensors_.front()->MutableData());
-  auto ret = ParallelLaunch(this->context_->thread_pool_, EluRun, this, elu_parameter_->thread_num_);
+  auto ret = ParallelLaunch(this->context_->thread_pool_, EluRun, this, op_parameter_->thread_num_);
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "Elu error: error_code[" << ret << "]";
     return RET_ERROR;
@@ -72,16 +67,6 @@ kernel::LiteKernel *CpuEluFp32KernelCreator(const std::vector<lite::Tensor *> &i
                                             const std::vector<lite::Tensor *> &outputs, OpParameter *parameter,
                                             const lite::InnerContext *ctx, const KernelKey &desc,
                                             const mindspore::lite::PrimitiveC *primitive) {
-  if (parameter == nullptr) {
-    MS_LOG(ERROR) << "parameter is nullptr";
-    return nullptr;
-  }
-  if (ctx == nullptr) {
-    MS_LOG(ERROR) << "ctx is nullptr";
-    free(parameter);
-    return nullptr;
-  }
   MS_ASSERT(desc.type == PrimitiveType_Elu);
   auto *kernel = new (std::nothrow) EluCPUKernel(parameter, inputs, outputs, ctx, primitive);
   if (kernel == nullptr) {
     MS_LOG(ERROR) << "Create Kernel failed, name: " << parameter->name_;
@@ -24,25 +24,21 @@
 namespace mindspore::kernel {
 class EluCPUKernel : public LiteKernel {
  public:
-  explicit EluCPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
-                        const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx,
-                        const mindspore::lite::PrimitiveC *primitive)
-      : LiteKernel(parameter, inputs, outputs, ctx, primitive), ctx_(ctx), thread_count_(ctx->thread_num_) {}
-  ~EluCPUKernel() override{};
+  EluCPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
+               const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx,
+               const mindspore::lite::PrimitiveC *primitive)
+      : LiteKernel(parameter, inputs, outputs, ctx, primitive) {
+    elu_parameter_ = reinterpret_cast<EluParameter *>(op_parameter_);
+  }
+  ~EluCPUKernel() = default;
   int Init() override;
   int ReSize() override;
   int Run() override;
   int DoExcute(int task_id);
- protected:
-  const lite::InnerContext *ctx_ = nullptr;
-  int thread_count_ = 1;
-  EluParameter *elu_parameter_ = nullptr;
  private:
-  float *input_addr = nullptr;
-  float *output_addr = nullptr;
+  EluParameter *elu_parameter_ = nullptr;
 };
 } // namespace mindspore::kernel
@@ -26,9 +26,6 @@ using mindspore::schema::PrimitiveType_EmbeddingLookup;
 namespace mindspore::kernel {
 int EmbeddingLookupCPUKernel::Init() {
-  embedding_lookup_parameter_ = reinterpret_cast<EmbeddingLookupParameter *>(op_parameter_);
-  embedding_lookup_parameter_->thread_num = thread_count_;
   if (!InferShapeDone()) {
     return RET_OK;
   }
@@ -36,24 +33,24 @@ int EmbeddingLookupCPUKernel::Init() {
 }
 int EmbeddingLookupCPUKernel::ReSize() {
-  embedding_lookup_parameter_->ids_size_ = in_tensors_.back()->ElementsNum();
-  embedding_lookup_parameter_->layer_size_ = 1;
+  param_->ids_size_ = in_tensors_.back()->ElementsNum();
+  param_->layer_size_ = 1;
   auto in_shape = in_tensors_.front()->shape();
   for (size_t i = 1; i < in_shape.size(); ++i) {
-    embedding_lookup_parameter_->layer_size_ *= in_shape[i];
+    param_->layer_size_ *= in_shape[i];
   }
-  embedding_lookup_parameter_->layer_num_ = 0;
+  param_->layer_num_ = 0;
   for (size_t i = 0; i < in_tensors_.size() - 1; ++i) {
-    embedding_lookup_parameter_->layer_num_ += in_tensors_[i]->shape()[0];
+    param_->layer_num_ += in_tensors_[i]->shape()[0];
   }
   return RET_OK;
 }
 int EmbeddingLookupCPUKernel::DoExcute(int task_id) {
-  int error_code = EmbeddingLookup(input_addr_, ids_addr_, output_addr_, embedding_lookup_parameter_, task_id);
+  auto ids_addr = reinterpret_cast<int *>(in_tensors_.back()->MutableData());
+  auto output_addr = reinterpret_cast<float *>(out_tensors_.front()->MutableData());
+  int error_code = EmbeddingLookup(input_addr_, ids_addr, output_addr, param_, task_id);
   if (error_code != RET_OK) {
     MS_LOG(ERROR) << "embedding lookup error error_code[" << error_code << "]";
     return RET_ERROR;
@@ -62,8 +59,8 @@ int EmbeddingLookupCPUKernel::DoExcute(int task_id) {
 }
 int EmbeddingLookupRun(void *cdata, int task_id) {
-  auto EmbeddingLookupData = reinterpret_cast<EmbeddingLookupCPUKernel *>(cdata);
-  auto ret = EmbeddingLookupData->DoExcute(task_id);
+  auto kernel = reinterpret_cast<EmbeddingLookupCPUKernel *>(cdata);
+  auto ret = kernel->DoExcute(task_id);
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "EmbeddingLookupRun error task_id[" << task_id << "] error_code[" << ret << "]";
     return RET_ERROR;
@@ -73,39 +70,38 @@ int EmbeddingLookupRun(void *cdata, int task_id) {
 int EmbeddingLookupCPUKernel::Run() {
   MS_ASSERT(context_->allocator != nullptr);
-  input_addr_ = reinterpret_cast<float *>(context_->allocator->Malloc(
-    sizeof(float) * embedding_lookup_parameter_->layer_size_ * embedding_lookup_parameter_->layer_num_));
-  embedding_lookup_parameter_->is_regulated_ =
-    reinterpret_cast<bool *>(context_->allocator->Malloc(sizeof(bool) * embedding_lookup_parameter_->layer_num_));
-  if (input_addr_ == nullptr || embedding_lookup_parameter_->is_regulated_ == nullptr) {
+  input_addr_ =
+    reinterpret_cast<float *>(context_->allocator->Malloc(sizeof(float) * param_->layer_size_ * param_->layer_num_));
+  param_->is_regulated_ = reinterpret_cast<bool *>(context_->allocator->Malloc(sizeof(bool) * param_->layer_num_));
+  if (input_addr_ == nullptr || param_->is_regulated_ == nullptr) {
     MS_LOG(ERROR) << "Memory allocation failed";
-    context_->allocator->Free(input_addr_);
-    context_->allocator->Free(embedding_lookup_parameter_->is_regulated_);
+    FreeRunBuff();
    return RET_ERROR;
   }
-  for (int i = 0; i < embedding_lookup_parameter_->layer_num_; ++i) {
-    embedding_lookup_parameter_->is_regulated_[i] = embedding_lookup_parameter_->max_norm_ == 0;
+  for (int i = 0; i < param_->layer_num_; ++i) {
+    param_->is_regulated_[i] = param_->max_norm_ == 0;
   }
   int dest_loc = 0;
   for (size_t i = 0; i < in_tensors_.size() - 1; i++) {
     auto input_t = reinterpret_cast<float *>(in_tensors_.at(i)->MutableData());
     memcpy(input_addr_ + dest_loc, input_t, sizeof(float) * in_tensors_.at(i)->ElementsNum());
     dest_loc += in_tensors_.at(i)->ElementsNum();
   }
-  output_addr_ = reinterpret_cast<float *>(out_tensors_.front()->MutableData());
-  ids_addr_ = reinterpret_cast<int *>(in_tensors_.back()->MutableData());
-  auto ret =
-    ParallelLaunch(this->context_->thread_pool_, EmbeddingLookupRun, this, embedding_lookup_parameter_->thread_num);
-  context_->allocator->Free(input_addr_);
-  context_->allocator->Free(embedding_lookup_parameter_->is_regulated_);
+  auto ret = ParallelLaunch(this->context_->thread_pool_, EmbeddingLookupRun, this, op_parameter_->thread_num_);
+  FreeRunBuff();
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "EmbeddingLookup error: error_code[" << ret << "]";
   }
   return ret;
 }
+void EmbeddingLookupCPUKernel::FreeRunBuff() {
+  context_->allocator->Free(input_addr_);
+  context_->allocator->Free(param_->is_regulated_);
+  input_addr_ = nullptr;
+  param_->is_regulated_ = nullptr;
+}
 kernel::LiteKernel *CpuEmbeddingLookupFp32KernelCreator(const std::vector<lite::Tensor *> &inputs,
                                                         const std::vector<lite::Tensor *> &outputs,
                                                         OpParameter *parameter, const lite::InnerContext *ctx,
@@ -27,30 +27,20 @@ class EmbeddingLookupCPUKernel : public LiteKernel {
   explicit EmbeddingLookupCPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
                                     const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx,
                                     const mindspore::lite::PrimitiveC *primitive)
-      : LiteKernel(parameter, inputs, outputs, ctx, primitive), ctx_(ctx), thread_count_(ctx->thread_num_) {}
-  ~EmbeddingLookupCPUKernel() override {
-    if (input_addr_ != nullptr) {
-      free(input_addr_);
-    }
-    if (embedding_lookup_parameter_->is_regulated_ != nullptr) {
-      free(embedding_lookup_parameter_->is_regulated_);
-    }
-  };
+      : LiteKernel(parameter, inputs, outputs, ctx, primitive) {
+    param_ = reinterpret_cast<EmbeddingLookupParameter *>(parameter);
+  }
+  ~EmbeddingLookupCPUKernel() = default;
   int Init() override;
   int ReSize() override;
   int Run() override;
   int DoExcute(int task_id);
- protected:
-  const lite::InnerContext *ctx_ = nullptr;
-  int thread_count_ = 1;
-  EmbeddingLookupParameter *embedding_lookup_parameter_ = nullptr;
  private:
+  void FreeRunBuff();
+  EmbeddingLookupParameter *param_ = nullptr;
   float *input_addr_ = nullptr;
-  float *output_addr_ = nullptr;
-  int *ids_addr_ = nullptr;
 };
 } // namespace mindspore::kernel
@@ -44,7 +44,9 @@ void FullconnectionCPUKernel::FreeBuf() {
 int FullconnectionCPUKernel::ReSize() {
   FreeBuf();
   int row = 1;
-  for (size_t i = 0; i < out_tensors_[0]->shape().size() - 1; ++i) row *= (out_tensors_[0]->shape())[i];
+  for (size_t i = 0; i < out_tensors_[0]->shape().size() - 1; ++i) {
+    row *= (out_tensors_[0]->shape())[i];
+  }
   fc_param_->row_ = row;
   fc_param_->col_ = out_tensors_[0]->shape().back();
   fc_param_->deep_ = (in_tensors_[1]->shape())[1];
@@ -56,13 +56,12 @@ int InstanceNormCPUKernel::DoInstanceNorm(int task_id) {
 }
 int InstanceNormRun(void *cdata, int task_id) {
-  auto InstanceNormData = reinterpret_cast<InstanceNormCPUKernel *>(cdata);
-  auto ret = InstanceNormData->DoInstanceNorm(task_id);
+  auto kernel = reinterpret_cast<InstanceNormCPUKernel *>(cdata);
+  auto ret = kernel->DoInstanceNorm(task_id);
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "InstanceNormRun error task_id[" << task_id << "] error_code[" << ret << "]";
     return RET_ERROR;
   }
-  return RET_OK;
+  return ret;
 }
 int InstanceNormCPUKernel::Run() {
@@ -58,8 +58,8 @@ int LayerNormCPUKernel::DoLayerNorm(int thread_id) {
 }
 int LayerNormRun(void *cdata, int task_id) {
-  auto LayerNormData = reinterpret_cast<LayerNormCPUKernel *>(cdata);
-  auto ret = LayerNormData->DoLayerNorm(task_id);
+  auto kernel = reinterpret_cast<LayerNormCPUKernel *>(cdata);
+  auto ret = kernel->DoLayerNorm(task_id);
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "LayerNormRun error task_id[" << task_id << "] error_code[" << ret << "]";
     return RET_ERROR;
@@ -72,7 +72,7 @@ int ConvolutionDepthwiseSWInt8CPUKernel::InitWeightBias() {
   return RET_OK;
 }
-int ConvolutionDepthwiseSWInt8CPUKernel::InitBuffer() {
+int ConvolutionDepthwiseSWInt8CPUKernel::InitPackedInputOutput() {
   if (conv_param_->input_channel_ % C8NUM != 0) {
     need_align_ = true;
@@ -319,15 +319,10 @@ int ConvDwSWInt8Run(void *cdata, int task_id) {
 }
 int ConvolutionDepthwiseSWInt8CPUKernel::Run() {
-  auto ret = InitBuffer();
+  auto ret = InitPackedInputOutput();
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "Depthwise int8 ReSize error!";
-    if (need_align_) {
-      context_->allocator->Free(packed_input_);
-      context_->allocator->Free(packed_output_);
-      packed_input_ = nullptr;
-      packed_output_ = nullptr;
-    }
+    FreePackedInputOutput();
     return ret;
   }
@@ -353,12 +348,17 @@ int ConvolutionDepthwiseSWInt8CPUKernel::Run() {
   if (need_align_) {
     PackNHWC8ToNHWCInt8(packed_output_, output_addr, conv_param_->output_batch_,
                         conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_);
+  }
+  FreePackedInputOutput();
+  return ret;
+}
+void ConvolutionDepthwiseSWInt8CPUKernel::FreePackedInputOutput() {
+  if (need_align_) {
     context_->allocator->Free(packed_input_);
     context_->allocator->Free(packed_output_);
     packed_input_ = nullptr;
     packed_output_ = nullptr;
   }
-  return ret;
 }
 } // namespace mindspore::kernel
@@ -36,10 +36,11 @@ class ConvolutionDepthwiseSWInt8CPUKernel : public ConvolutionBaseCPUKernel {
   int Run() override;
   int InitWeightBias();
-  int InitBuffer();
+  int InitPackedInputOutput();
   int Execute(int task_id);
  private:
+  void FreePackedInputOutput();
   int ReinitQuantParam();
   int ReinitFreeBefore();
   void FreeTmpQuant();