Merge pull request !4157 from yangruoqi713/lite (tags/v0.7.0-beta)
| @@ -56,13 +56,11 @@ int Pooling::InferShape(std::vector<tensor::Tensor *> inputs_, std::vector<tenso | |||
| } else { | |||
| auto round_mode = pooling_prim->roundMode(); | |||
| if (round_mode == schema::RoundMode_FLOOR) { | |||
| output_h = std::floor((input_h + pad_u_ + pad_d_ - window_h) / pooling_prim->strideH() + 1); | |||
| output_w = std::floor((input_w + pad_l_ + pad_r_ - window_w) / pooling_prim->strideW() + 1); | |||
| output_h = std::floor(static_cast<float>(input_h + pad_u_ + pad_d_ - window_h) / pooling_prim->strideH()) + 1; | |||
| output_w = std::floor(static_cast<float>(input_w + pad_l_ + pad_r_ - window_w) / pooling_prim->strideW()) + 1; | |||
| } else if (round_mode == schema::RoundMode_CEIL) { | |||
| output_h = | |||
| std::ceil((input_h + pooling_prim->padUp() + pooling_prim->padDown() - window_h) / pooling_prim->strideH() + 1); | |||
| output_w = std::ceil( | |||
| (input_w + pooling_prim->padLeft() + pooling_prim->padRight() - window_w) / pooling_prim->strideW() + 1); | |||
| output_h = std::ceil(static_cast<float>(input_h + pad_u_ + pad_d_ - window_h) / pooling_prim->strideH()) + 1; | |||
| output_w = std::ceil(static_cast<float>(input_w + pad_l_ + pad_r_ - window_w) / pooling_prim->strideW()) + 1; | |||
| } else { | |||
| MS_LOG(ERROR) << "unsupported round mode."; | |||
| } | |||
| @@ -80,4 +78,3 @@ int Pooling::InferShape(std::vector<tensor::Tensor *> inputs_, std::vector<tenso | |||
| return RET_OK; | |||
| } | |||
| } // namespace mindspore::lite | |||
| @@ -28,17 +28,23 @@ using mindspore::lite::RET_OK; | |||
| using mindspore::schema::PrimitiveType_BatchNorm; | |||
| namespace mindspore::kernel { | |||
| int BatchnormCPUKernel::Init() { return RET_OK; } | |||
| int BatchnormCPUKernel::Init() { | |||
| auto input_shapes = inputs_[0]->shape(); | |||
| auto n_dim = input_shapes.size(); | |||
| batchnorm_param_->channel_ = input_shapes[n_dim - 1]; | |||
| batchnorm_param_->unit_ = 1; | |||
| for (int i = 0; i < n_dim - 1; i++) { | |||
| batchnorm_param_->unit_ *= input_shapes[i]; | |||
| } | |||
| batchnorm_param_->op_parameter_.thread_num_ = | |||
| MSMIN(batchnorm_param_->op_parameter_.thread_num_, batchnorm_param_->unit_); | |||
| return RET_OK; | |||
| } | |||
| int BatchnormCPUKernel::ReSize() { return RET_OK; } | |||
| int BatchnormCPUKernel::DoExecute(int tid) { | |||
| int count = MSMIN(thread_unit_, units_ - tid * thread_unit_); | |||
| if (count <= 0) { | |||
| return RET_OK; | |||
| } | |||
| int offset = tid * thread_unit_ * channel_; | |||
| BatchNorm(in_addr_ + offset, mean_addr_, var_addr_, count, channel_, batchnorm_param_->epsilon_, out_addr_ + offset); | |||
| int BatchnormCPUKernel::DoExecute(int task_id) { | |||
| BatchNorm(out_addr_, in_addr_, mean_addr_, var_addr_, task_id, batchnorm_param_); | |||
| return RET_OK; | |||
| } | |||
| @@ -62,15 +68,8 @@ int BatchnormCPUKernel::Run() { | |||
| mean_addr_ = reinterpret_cast<float *>(inputs_.at(1)->Data()); | |||
| var_addr_ = reinterpret_cast<float *>(inputs_.at(2)->Data()); | |||
| out_addr_ = reinterpret_cast<float *>(outputs_.at(0)->Data()); | |||
| auto input_shapes = inputs_[0]->shape(); | |||
| channel_ = input_shapes[3]; | |||
| units_ = 1; | |||
| for (int i = 0; i < 3; i++) { | |||
| units_ *= input_shapes[i]; | |||
| } | |||
| thread_count_ = MSMIN(thread_count_, units_); | |||
| thread_unit_ = UP_DIV(units_, thread_count_); | |||
| int ret = LiteBackendParallelLaunch(BatchNormRun, this, thread_count_); | |||
| int ret = LiteBackendParallelLaunch(BatchNormRun, this, batchnorm_param_->op_parameter_.thread_num_); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "BatchnormRun error error_code[" << ret << "]"; | |||
| return ret; | |||
| @@ -30,10 +30,11 @@ class BatchnormCPUKernel : public LiteKernel { | |||
| BatchnormCPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs, | |||
| const std::vector<lite::tensor::Tensor *> &outputs, const Context *ctx, | |||
| const lite::Primitive *primitive) | |||
| : LiteKernel(parameter, inputs, outputs, ctx, primitive), ctx_(ctx), thread_count_(ctx->thread_num_) { | |||
| : LiteKernel(parameter, inputs, outputs, ctx, primitive) { | |||
| opParameter->thread_num_ = ctx->thread_num_; | |||
| batchnorm_param_ = reinterpret_cast<BatchNormParameter *>(parameter); | |||
| } | |||
| ~BatchnormCPUKernel() override { delete batchnorm_param_; } | |||
| ~BatchnormCPUKernel() override = default; | |||
| int Init() override; | |||
| int ReSize() override; | |||
| @@ -41,15 +42,10 @@ class BatchnormCPUKernel : public LiteKernel { | |||
| int DoExecute(int tid); | |||
| private: | |||
| int thread_count_; | |||
| int thread_unit_; | |||
| int units_; | |||
| int channel_; | |||
| float *in_addr_; | |||
| float *mean_addr_; | |||
| float *var_addr_; | |||
| float *out_addr_; | |||
| const Context *ctx_; | |||
| BatchNormParameter *batchnorm_param_; | |||
| }; | |||
| } // namespace mindspore::kernel | |||
| @@ -36,8 +36,12 @@ int Nchw2NhwcCPUKernel::Run() { | |||
| auto input = inputs_[0]; | |||
| auto output = outputs_[0]; | |||
| PackNCHWToNHWCFp32(input->Data(), output->Data(), output->Batch(), output->Height() * output->Width(), | |||
| output->Channel()); | |||
| if (input->shape().size() == 4) { | |||
| PackNCHWToNHWCFp32(input->Data(), output->Data(), output->Batch(), output->Height() * output->Width(), | |||
| output->Channel()); | |||
| } else { | |||
| memcpy(output->Data(), input->Data(), input->ElementsNum() * sizeof(float)); | |||
| } | |||
| return RET_OK; | |||
| } | |||
| @@ -36,8 +36,12 @@ int Nhwc2NchwCPUKernel::Run() { | |||
| auto input = inputs_[0]; | |||
| auto output = outputs_[0]; | |||
| PackNHWCToNCHWFp32(input->Data(), output->Data(), output->Batch(), output->Height() * output->Width(), | |||
| output->Channel()); | |||
| if (input->shape().size() == 4) { | |||
| PackNHWCToNCHWFp32(input->Data(), output->Data(), output->Batch(), output->Height() * output->Width(), | |||
| output->Channel()); | |||
| } else { | |||
| memcpy(output->Data(), input->Data(), input->ElementsNum() * sizeof(float)); | |||
| } | |||
| return RET_OK; | |||
| } | |||
| @@ -45,12 +45,13 @@ int ScaleCPUKernel::InitScaleOffset() { | |||
| } | |||
| if (inputs_.size() == 3) { | |||
| auto offset_tensor = inputs_.at(1); | |||
| auto offset_tensor = inputs_.at(2); | |||
| offset_ = reinterpret_cast<float *>(malloc(offset_tensor->ElementsNum() * sizeof(float))); | |||
| if (offset_ == nullptr) { | |||
| MS_LOG(ERROR) << "Malloc buffer failed."; | |||
| return RET_ERROR; | |||
| } | |||
| memcpy(offset_, offset_tensor->Data(), offset_tensor->ElementsNum() * sizeof(float)); | |||
| param->has_offset_ = true; | |||
| } else { | |||
| offset_ = nullptr; | |||
| @@ -16,12 +16,12 @@ | |||
| #include "src/runtime/kernel/arm/nnacl/fp32/batchnorm.h" | |||
| void BatchNorm(const float *input_ptr, const float *mean_ptr, const float *variance_ptr, int units, int channel, | |||
| float epsilon, float *output_ptr) { | |||
| for (int u = 0; u < units; u++) { | |||
| for (int c = 0; c < channel; c++) { | |||
| auto variance_sqrt = sqrt(variance_ptr[c] + epsilon); | |||
| output_ptr[u * channel + c] = (input_ptr[u * channel + c] - mean_ptr[c]) / variance_sqrt; | |||
| void BatchNorm(float *output_ptr, const float *input_ptr, const float *mean_ptr, const float *variance_ptr, int task_id, | |||
| BatchNormParameter *param) { | |||
| for (int u = task_id; u < param->unit_; u += param->op_parameter_.thread_num_) { | |||
| for (int c = 0; c < param->channel_; c++) { | |||
| auto variance_sqrt = sqrt(variance_ptr[c] + param->epsilon_); | |||
| output_ptr[u * param->channel_ + c] = (input_ptr[u * param->channel_ + c] - mean_ptr[c]) / variance_sqrt; | |||
| } | |||
| } | |||
| } | |||
| @@ -22,9 +22,11 @@ | |||
| struct BatchNormParameter { | |||
| OpParameter op_parameter_; | |||
| float epsilon_; | |||
| int unit_; | |||
| int channel_; | |||
| }; | |||
| void BatchNorm(const float *input_ptr, const float *mean_ptr, const float *variance_ptr, int count, int channel, | |||
| float epsilon, float *output_ptr); | |||
| void BatchNorm(float *output_ptr, const float *input_ptr, const float *mean_ptr, const float *variance_ptr, int task_id, | |||
| BatchNormParameter *param); | |||
| #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_NNACL_FUSED_BATCHNORM_H_ | |||
| @@ -245,8 +245,6 @@ bool ThreadPool::SetThreadPool() { | |||
| } else { | |||
| AddRunThread(localMaxThreadNums); | |||
| } | |||
| MS_LOG(DEBUG) << "configThreadNums=" << configThreadNums << ", curThreadNums=" << curThreadNums | |||
| << ", curThreadRunNums=" << curThreadRunNums << ", localMaxThreadNums=" << localMaxThreadNums; | |||
| return true; | |||
| } | |||
| @@ -276,7 +274,6 @@ void ThreadPool::AddNewThread(int newNums) { | |||
| } | |||
| curThreadNums += newNums; | |||
| curThreadRunNums += newNums; | |||
| MS_LOG(DEBUG) << "add " << newNums << " thread"; | |||
| } | |||
| bool ThreadPool::SetThreadCpuBind(bool ifBind, int mode, bool master) { | |||
| @@ -330,7 +327,6 @@ bool ThreadPool::AddTask(WorkFun &&worker, void *cdata, int numTask) { | |||
| } | |||
| bool ThreadPool::DistributeTask(ThreadPoolTask *task, int numTask) { | |||
| MS_LOG(DEBUG) << "numTask = " << numTask << ", curThreadRunNums = " << curThreadRunNums; | |||
| auto taskOri = *task; | |||
| if (numTask > curThreadRunNums) { | |||
| task->first = [taskOri, numTask, this](int task_id, TvmEnv *penv, void *cdata) -> int { | |||
| @@ -370,12 +366,10 @@ bool ThreadPool::DistributeTask(ThreadPoolTask *task, int numTask) { | |||
| } | |||
| } | |||
| } | |||
| MS_LOG(DEBUG) << "finish " << numTask << " task successful"; | |||
| return CheckResult(); | |||
| } | |||
| void ThreadPool::AddRunThread(int num) { | |||
| MS_LOG(DEBUG) << "num=" << num << ", curThreadRunNums=" << curThreadRunNums; | |||
| int activeNums = num - curThreadRunNums; | |||
| if (activeNums <= 0 || activateList.size() < activeNums) { | |||
| return; | |||
| @@ -389,7 +383,6 @@ void ThreadPool::AddRunThread(int num) { | |||
| } | |||
| void ThreadPool::SubRunThread(int num) { | |||
| MS_LOG(DEBUG) << "num=" << num << ", curThreadRunNums=" << curThreadRunNums; | |||
| int deactiveNums = curThreadRunNums - num; | |||
| if (deactiveNums <= 0) { | |||
| return; | |||
| @@ -56,6 +56,8 @@ STATUS CaffePoolingParser::Parse(const caffe::LayerParameter &proto, | |||
| return RET_ERROR; | |||
| } | |||
| // default roundMode RoundMode_CEIL | |||
| attr->roundMode = schema::RoundMode_CEIL; | |||
| if (poolingParam.has_round_mode()) { | |||
| if (poolingParam.round_mode() == caffe::PoolingParameter_RoundMode_FLOOR) { | |||
| attr->roundMode = schema::RoundMode_FLOOR; | |||