Merge pull request !4387 from 徐安越/mastertags/v0.7.0-beta
| @@ -31,7 +31,7 @@ int Power::InferShape(std::vector<tensor::Tensor *> inputs, std::vector<tensor:: | |||||
| } | } | ||||
| auto output_tensor = outputs[0]; | auto output_tensor = outputs[0]; | ||||
| MS_ASSERT(output_tensor != nullptr); | MS_ASSERT(output_tensor != nullptr); | ||||
| if (exp_tensor) { | |||||
| if (exp_tensor != nullptr) { | |||||
| if (exp_tensor->shape() != x_tensor->shape() || exp_tensor->data_type() != x_tensor->data_type()) { | if (exp_tensor->shape() != x_tensor->shape() || exp_tensor->data_type() != x_tensor->data_type()) { | ||||
| MS_LOG(ERROR) << "Power inputs shape or type is not equal!"; | MS_LOG(ERROR) << "Power inputs shape or type is not equal!"; | ||||
| return RET_INPUT_TENSOR_ERROR; | return RET_INPUT_TENSOR_ERROR; | ||||
| @@ -48,7 +48,7 @@ kernel::LiteKernel *CpuMatmulKernelCreator(const std::vector<lite::tensor::Tenso | |||||
| case kNumberTypeFloat32: { | case kNumberTypeFloat32: { | ||||
| kernel = new (std::nothrow) MatmulCPUKernel(opParameter, inputs, outputs, ctx, primitive); | kernel = new (std::nothrow) MatmulCPUKernel(opParameter, inputs, outputs, ctx, primitive); | ||||
| if (!kernel) { | |||||
| if (kernel == nullptr) { | |||||
| MS_LOG(ERROR) << "kernel is nullptr."; | MS_LOG(ERROR) << "kernel is nullptr."; | ||||
| return nullptr; | return nullptr; | ||||
| } | } | ||||
| @@ -97,8 +97,8 @@ int Convolution3x3FP16CPUKernel::InitWeightBias() { | |||||
| } | } | ||||
| int Convolution3x3FP16CPUKernel::InitTmpBuffer() { | int Convolution3x3FP16CPUKernel::InitTmpBuffer() { | ||||
| int tile_num = 16; | |||||
| int k_plane = 36; | |||||
| const int tile_num = 16; | |||||
| const int k_plane = 36; | |||||
| int iC4 = UP_DIV(conv_param_->input_channel_, C4NUM); | int iC4 = UP_DIV(conv_param_->input_channel_, C4NUM); | ||||
| int oC8 = UP_DIV(conv_param_->output_channel_, C8NUM); | int oC8 = UP_DIV(conv_param_->output_channel_, C8NUM); | ||||
| @@ -261,7 +261,7 @@ kernel::LiteKernel *CpuConvFp32KernelCreator(const std::vector<lite::tensor::Ten | |||||
| conv_param->input_w_ = inputs.front()->Width(); | conv_param->input_w_ = inputs.front()->Width(); | ||||
| conv_param->output_h_ = outputs.front()->Height(); | conv_param->output_h_ = outputs.front()->Height(); | ||||
| conv_param->output_w_ = outputs.front()->Width(); | conv_param->output_w_ = outputs.front()->Width(); | ||||
| bool use_winograd; | |||||
| bool use_winograd = false; | |||||
| int out_unit; | int out_unit; | ||||
| InputTransformUnitFunc input_trans_func = nullptr; | InputTransformUnitFunc input_trans_func = nullptr; | ||||
| OutputTransformUnitFunc output_trans_func = nullptr; | OutputTransformUnitFunc output_trans_func = nullptr; | ||||
| @@ -61,7 +61,7 @@ int Convolution3x3CPUKernel::InitWeightBias() { | |||||
| oc_block = C8NUM; | oc_block = C8NUM; | ||||
| oc_block_num = UP_DIV(output_channel, C8NUM); | oc_block_num = UP_DIV(output_channel, C8NUM); | ||||
| #endif | #endif | ||||
| int k_plane = 16; | |||||
| const int k_plane = 16; | |||||
| // init weight | // init weight | ||||
| size_t transformed_size = iC4 * C4NUM * oc_block_num * oc_block * k_plane * sizeof(float); | size_t transformed_size = iC4 * C4NUM * oc_block_num * oc_block * k_plane * sizeof(float); | ||||
| transformed_filter_addr_ = reinterpret_cast<float *>(malloc(transformed_size)); | transformed_filter_addr_ = reinterpret_cast<float *>(malloc(transformed_size)); | ||||
| @@ -93,7 +93,7 @@ int Convolution3x3CPUKernel::InitWeightBias() { | |||||
| int Convolution3x3CPUKernel::InitTmpBuffer() { | int Convolution3x3CPUKernel::InitTmpBuffer() { | ||||
| int iC4 = UP_DIV(conv_param_->input_channel_, C4NUM); | int iC4 = UP_DIV(conv_param_->input_channel_, C4NUM); | ||||
| int oC4 = UP_DIV(conv_param_->output_channel_, C4NUM); | int oC4 = UP_DIV(conv_param_->output_channel_, C4NUM); | ||||
| int k_plane = 16; | |||||
| const int k_plane = 16; | |||||
| /*=============================tile_buffer_============================*/ | /*=============================tile_buffer_============================*/ | ||||
| size_t tile_buffer_size = thread_count_ * TILE_NUM * k_plane * iC4 * C4NUM * sizeof(float); | size_t tile_buffer_size = thread_count_ * TILE_NUM * k_plane * iC4 * C4NUM * sizeof(float); | ||||
| @@ -52,6 +52,10 @@ void WinogradFilterTransform(const float *weight_data, Matrix *trans_weight, int | |||||
| std::vector<int> strides = trans_weight->GetStride(); | std::vector<int> strides = trans_weight->GetStride(); | ||||
| int kernel_plane_stride = channel_in; | int kernel_plane_stride = channel_in; | ||||
| if (oc_block == 0) { | |||||
| MS_LOG(ERROR) << "Divide by zero"; | |||||
| return; | |||||
| } | |||||
| for (int i = 0; i < channel_out; i++) { | for (int i = 0; i < channel_out; i++) { | ||||
| int out_c_block = i / oc_block; | int out_c_block = i / oc_block; | ||||
| int out_c_res = i % oc_block; | int out_c_res = i % oc_block; | ||||
| @@ -84,7 +84,7 @@ int LstmCPUKernel::InitWeightBias() { | |||||
| } | } | ||||
| auto bias_data = reinterpret_cast<float *>(in_tensors_.at(3)->Data()); | auto bias_data = reinterpret_cast<float *>(in_tensors_.at(3)->Data()); | ||||
| int state_bias_offset = 4 * lstm_parm_->hidden_size_; | |||||
| const int state_bias_offset = 4 * lstm_parm_->hidden_size_; | |||||
| for (int i = 0; i < state_bias_offset; i++) { | for (int i = 0; i < state_bias_offset; i++) { | ||||
| bias_ptr_[i] = bias_data[i] + bias_data[i + state_bias_offset]; | bias_ptr_[i] = bias_data[i] + bias_data[i + state_bias_offset]; | ||||
| } | } | ||||
| @@ -66,7 +66,7 @@ int PowerCPUKernel::RunImpl(int task_id) { | |||||
| exp_addr = reinterpret_cast<float *>(in_tensors_[1]->Data()); | exp_addr = reinterpret_cast<float *>(in_tensors_[1]->Data()); | ||||
| broadcast = false; | broadcast = false; | ||||
| } | } | ||||
| float *cur_exp; | |||||
| float *cur_exp = nullptr; | |||||
| if (broadcast) { | if (broadcast) { | ||||
| cur_exp = &power_; | cur_exp = &power_; | ||||
| } else { | } else { | ||||
| @@ -161,7 +161,7 @@ int SqueezeInt8CPUKernel::Run() { | |||||
| if (ret != RET_OK) { | if (ret != RET_OK) { | ||||
| MS_LOG(ERROR) << "RunSqueezeParam failed. errorcode: "; | MS_LOG(ERROR) << "RunSqueezeParam failed. errorcode: "; | ||||
| } | } | ||||
| return RET_OK; | |||||
| return ret; | |||||
| } | } | ||||
| int SqueezeInt8Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) { | int SqueezeInt8Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) { | ||||
| @@ -35,7 +35,7 @@ void IndirectGemmFp16_16x8(float16_t *output, float16_t *input, float16_t *weigh | |||||
| void IndirectGemmFp16_16x8(float16_t *output, float16_t *input, float16_t *weight, float16_t *bias, size_t step, | void IndirectGemmFp16_16x8(float16_t *output, float16_t *input, float16_t *weight, float16_t *bias, size_t step, | ||||
| size_t ic4, size_t out_channel, size_t offset, size_t mode, size_t writeC4, size_t relu, | size_t ic4, size_t out_channel, size_t offset, size_t mode, size_t writeC4, size_t relu, | ||||
| size_t relu6) { | size_t relu6) { | ||||
| int tile_n = 16; | |||||
| const int tile_n = 16; | |||||
| for (int i = 0; i < out_channel; i++) { | for (int i = 0; i < out_channel; i++) { | ||||
| int oc8_block = i / 8; | int oc8_block = i / 8; | ||||
| int oc8_res = i % 8; | int oc8_res = i % 8; | ||||
| @@ -76,7 +76,7 @@ void IndirectGemmFp16_16x8(float16_t *output, float16_t *input, float16_t *weigh | |||||
| void IndirectGemmFp16_16x8_tmp(float16_t *output, float16_t *input, float16_t *weight, const float16_t *bias, | void IndirectGemmFp16_16x8_tmp(float16_t *output, float16_t *input, float16_t *weight, const float16_t *bias, | ||||
| size_t step, size_t ic4, size_t output_channel, size_t offset, size_t mode, | size_t step, size_t ic4, size_t output_channel, size_t offset, size_t mode, | ||||
| size_t writeC4, size_t relu, size_t relu6) { | size_t writeC4, size_t relu, size_t relu6) { | ||||
| int tile_num = 16; | |||||
| const int tile_num = 16; | |||||
| if (mode) { | if (mode) { | ||||
| for (int i = 0; i < tile_num; i++) { | for (int i = 0; i < tile_num; i++) { | ||||
| int input_tile_offset = i * C4NUM; | int input_tile_offset = i * C4NUM; | ||||
| @@ -175,8 +175,8 @@ void Conv3x3Fp16(float16_t *input_data, float16_t *transed_weight, const float16 | |||||
| // todo | // todo | ||||
| int thread_count = conv_param->thread_num_; | int thread_count = conv_param->thread_num_; | ||||
| int tile_num = 16; | int tile_num = 16; | ||||
| int output_unit = 4; | |||||
| int k_plane = 36; | |||||
| const int output_unit = 4; | |||||
| const int k_plane = 36; | |||||
| int ic4 = UP_DIV(conv_param->input_channel_, C4NUM); | int ic4 = UP_DIV(conv_param->input_channel_, C4NUM); | ||||
| int oc8 = UP_DIV(conv_param->output_channel_, C8NUM); | int oc8 = UP_DIV(conv_param->output_channel_, C8NUM); | ||||
| @@ -190,14 +190,16 @@ void Conv3x3Fp16InputUnit(float16_t *tmp_data, float16_t *trans_input_data, size | |||||
| void Conv3x3Fp16InputTransform(const float16_t *input_data, float16_t *trans_input, float16_t *tmp_data, | void Conv3x3Fp16InputTransform(const float16_t *input_data, float16_t *trans_input, float16_t *tmp_data, | ||||
| int start_index, int real_cal_num, int out_w_block, ConvParameter *conv_param) { | int start_index, int real_cal_num, int out_w_block, ConvParameter *conv_param) { | ||||
| // input data format : nhwc | // input data format : nhwc | ||||
| int output_unit = 4; | |||||
| const int output_unit = 4; | |||||
| int input_channel = conv_param->input_channel_; | int input_channel = conv_param->input_channel_; | ||||
| int input_width = conv_param->input_w_; | int input_width = conv_param->input_w_; | ||||
| int input_height = conv_param->input_h_; | int input_height = conv_param->input_h_; | ||||
| int pad_w = conv_param->pad_w_; | int pad_w = conv_param->pad_w_; | ||||
| int pad_h = conv_param->pad_h_; | int pad_h = conv_param->pad_h_; | ||||
| int ic4 = UP_DIV(input_channel, C4NUM); | int ic4 = UP_DIV(input_channel, C4NUM); | ||||
| if (out_w_block == 0) { | |||||
| return; | |||||
| } | |||||
| for (int cal_id = 0; cal_id < real_cal_num; cal_id++) { | for (int cal_id = 0; cal_id < real_cal_num; cal_id++) { | ||||
| int x_id = start_index + cal_id; | int x_id = start_index + cal_id; | ||||
| int origin_x = (x_id % out_w_block) * output_unit - pad_w; | int origin_x = (x_id % out_w_block) * output_unit - pad_w; | ||||
| @@ -511,7 +513,9 @@ void Conv3x3Fp16OutputTransform(const float16_t *gemm_out, float16_t *out_data, | |||||
| int output_h = conv_param->output_h_; | int output_h = conv_param->output_h_; | ||||
| int out_h_block = UP_DIV(output_h, C4NUM); | int out_h_block = UP_DIV(output_h, C4NUM); | ||||
| int oc8 = UP_DIV(output_channel, C8NUM); | int oc8 = UP_DIV(output_channel, C8NUM); | ||||
| if (out_w_block == 0) { | |||||
| return; | |||||
| } | |||||
| for (int i = 0; i < real_cal_num; i++) { | for (int i = 0; i < real_cal_num; i++) { | ||||
| int out_w_index = (start_index + i) % out_w_block; | int out_w_index = (start_index + i) % out_w_block; | ||||
| int out_h_index = (start_index + i) / out_w_block; | int out_h_index = (start_index + i) / out_w_block; | ||||
| @@ -65,6 +65,9 @@ void MatrixMultiAdd(float *c11, float *c12, float *c21, float *c22, float *x_ptr | |||||
| void PostConvFuncComm(const float *src_ptr_, float *out_ptr, const float *bias_ptr, size_t output_channel, | void PostConvFuncComm(const float *src_ptr_, float *out_ptr, const float *bias_ptr, size_t output_channel, | ||||
| size_t plane_size, size_t stride, bool is_relu, bool is_relu6, int size) { | size_t plane_size, size_t stride, bool is_relu, bool is_relu6, int size) { | ||||
| if (size == 0) { | |||||
| return; | |||||
| } | |||||
| for (int oc = 0; oc < output_channel; oc++) { | for (int oc = 0; oc < output_channel; oc++) { | ||||
| int oc_div = oc / size, oc_mod = oc % size; | int oc_div = oc / size, oc_mod = oc % size; | ||||
| for (int hw = 0; hw < plane_size; hw++) { | for (int hw = 0; hw < plane_size; hw++) { | ||||
| @@ -142,7 +142,7 @@ void ConvSWFp32(const float *input_data, const float *packed_weight, const float | |||||
| int ic4 = slidingWindow_param->ic4_channel_ / C4NUM; | int ic4 = slidingWindow_param->ic4_channel_ / C4NUM; | ||||
| int oc4_res = conv_param->output_channel_ % C4NUM; | int oc4_res = conv_param->output_channel_ % C4NUM; | ||||
| const float *src = input_data; | const float *src = input_data; | ||||
| float *dst; | |||||
| float *dst = NULL; | |||||
| if (oc4_res == 0) { | if (oc4_res == 0) { | ||||
| dst = output_data; | dst = output_data; | ||||
| } else { | } else { | ||||
| @@ -328,36 +328,36 @@ void ConvDw3x3Fp32FilterTrans(float *trans_weight, float *weight, int oc4) { | |||||
| float dst01 = (local_ptr + 4)[0]; | float dst01 = (local_ptr + 4)[0]; | ||||
| float dst02 = (local_ptr + 8)[0]; | float dst02 = (local_ptr + 8)[0]; | ||||
| float dst10 = 0.5f * local_ptr[0] + 0.5f * (local_ptr + 12)[0] + 0.5f * (local_ptr + 24)[0]; | |||||
| float dst11 = 0.5f * (local_ptr + 4)[0] + 0.5f * (local_ptr + 16)[0] + 0.5f * (local_ptr + 28)[0]; | |||||
| float dst12 = 0.5f * (local_ptr + 8)[0] + 0.5f * (local_ptr + 20)[0] + 0.5f * (local_ptr + 32)[0]; | |||||
| const float dst10 = 0.5f * local_ptr[0] + 0.5f * (local_ptr + 12)[0] + 0.5f * (local_ptr + 24)[0]; | |||||
| const float dst11 = 0.5f * (local_ptr + 4)[0] + 0.5f * (local_ptr + 16)[0] + 0.5f * (local_ptr + 28)[0]; | |||||
| const float dst12 = 0.5f * (local_ptr + 8)[0] + 0.5f * (local_ptr + 20)[0] + 0.5f * (local_ptr + 32)[0]; | |||||
| float dst20 = 0.5f * local_ptr[0] - 0.5f * (local_ptr + 12)[0] + 0.5f * (local_ptr + 24)[0]; | |||||
| float dst21 = 0.5f * (local_ptr + 4)[0] - 0.5f * (local_ptr + 16)[0] + 0.5f * (local_ptr + 28)[0]; | |||||
| float dst22 = 0.5f * (local_ptr + 8)[0] - 0.5f * (local_ptr + 20)[0] + 0.5f * (local_ptr + 32)[0]; | |||||
| const float dst20 = 0.5f * local_ptr[0] - 0.5f * (local_ptr + 12)[0] + 0.5f * (local_ptr + 24)[0]; | |||||
| const float dst21 = 0.5f * (local_ptr + 4)[0] - 0.5f * (local_ptr + 16)[0] + 0.5f * (local_ptr + 28)[0]; | |||||
| const float dst22 = 0.5f * (local_ptr + 8)[0] - 0.5f * (local_ptr + 20)[0] + 0.5f * (local_ptr + 32)[0]; | |||||
| float dst30 = (local_ptr + 24)[0]; | float dst30 = (local_ptr + 24)[0]; | ||||
| float dst31 = (local_ptr + 28)[0]; | float dst31 = (local_ptr + 28)[0]; | ||||
| float dst32 = (local_ptr + 32)[0]; | float dst32 = (local_ptr + 32)[0]; | ||||
| float m00 = dst00; | float m00 = dst00; | ||||
| float m01 = 0.5f * dst00 + 0.5f * dst01 + 0.5f * dst02; | |||||
| float m02 = 0.5f * dst00 - 0.5f * dst01 + 0.5f * dst02; | |||||
| const float m01 = 0.5f * dst00 + 0.5f * dst01 + 0.5f * dst02; | |||||
| const float m02 = 0.5f * dst00 - 0.5f * dst01 + 0.5f * dst02; | |||||
| float m03 = dst02; | float m03 = dst02; | ||||
| float m10 = dst10; | float m10 = dst10; | ||||
| float m11 = 0.5f * dst10 + 0.5f * dst11 + 0.5f * dst12; | |||||
| float m12 = 0.5f * dst10 - 0.5f * dst11 + 0.5f * dst12; | |||||
| const float m11 = 0.5f * dst10 + 0.5f * dst11 + 0.5f * dst12; | |||||
| const float m12 = 0.5f * dst10 - 0.5f * dst11 + 0.5f * dst12; | |||||
| float m13 = dst12; | float m13 = dst12; | ||||
| float m20 = dst20; | float m20 = dst20; | ||||
| float m21 = 0.5f * dst20 + 0.5f * dst21 + 0.5f * dst22; | |||||
| float m22 = 0.5f * dst20 - 0.5f * dst21 + 0.5f * dst22; | |||||
| const float m21 = 0.5f * dst20 + 0.5f * dst21 + 0.5f * dst22; | |||||
| const float m22 = 0.5f * dst20 - 0.5f * dst21 + 0.5f * dst22; | |||||
| float m23 = dst22; | float m23 = dst22; | ||||
| float m30 = dst30; | float m30 = dst30; | ||||
| float m31 = 0.5f * dst30 + 0.5f * dst31 + 0.5f * dst32; | |||||
| float m32 = 0.5f * dst30 - 0.5f * dst31 + 0.5f * dst32; | |||||
| const float m31 = 0.5f * dst30 + 0.5f * dst31 + 0.5f * dst32; | |||||
| const float m32 = 0.5f * dst30 - 0.5f * dst31 + 0.5f * dst32; | |||||
| float m33 = dst32; | float m33 = dst32; | ||||
| *(dst + j) = m00; | *(dst + j) = m00; | ||||
| @@ -387,7 +387,7 @@ void ConvDw3x3Fp32FilterTrans(float *trans_weight, float *weight, int oc4) { | |||||
| void ConvDw3x3Fp32InputTrans(const float *input_data, float *trans_input, float *block_buffer, int out_h_block, | void ConvDw3x3Fp32InputTrans(const float *input_data, float *trans_input, float *block_buffer, int out_h_block, | ||||
| int out_w_block, const ConvParameter *conv_param) { | int out_w_block, const ConvParameter *conv_param) { | ||||
| int ic4 = UP_DIV(conv_param->input_channel_, C4NUM); | int ic4 = UP_DIV(conv_param->input_channel_, C4NUM); | ||||
| int input_unit = 4; | |||||
| const int input_unit = 4; | |||||
| memset(trans_input, 0, out_h_block * out_h_block * 16 * C4NUM * sizeof(float)); | memset(trans_input, 0, out_h_block * out_h_block * 16 * C4NUM * sizeof(float)); | ||||
| for (int oh = 0; oh < out_h_block; oh++) { | for (int oh = 0; oh < out_h_block; oh++) { | ||||
| @@ -426,7 +426,7 @@ void ConvDw3x3Fp32InputTrans(const float *input_data, float *trans_input, float | |||||
| // todo yangruoqi: implement assembly | // todo yangruoqi: implement assembly | ||||
| void ConvDw3x3Fp32Winograd(float *trans_buffer, const float *weight, int out_h_block, int out_w_block) { | void ConvDw3x3Fp32Winograd(float *trans_buffer, const float *weight, int out_h_block, int out_w_block) { | ||||
| int unit = 4; | |||||
| const int unit = 4; | |||||
| for (int oh = 0; oh < out_h_block; oh++) { | for (int oh = 0; oh < out_h_block; oh++) { | ||||
| float *buf_oh = trans_buffer + oh * out_w_block * 16 * C4NUM; | float *buf_oh = trans_buffer + oh * out_w_block * 16 * C4NUM; | ||||
| for (int ow = 0; ow < out_w_block; ow++) { | for (int ow = 0; ow < out_w_block; ow++) { | ||||
| @@ -583,7 +583,7 @@ void ConvDw3x3Fp32OutputTrans(float *trans_buffer, float *output_data, const flo | |||||
| int oc4 = UP_DIV(conv_param->output_channel_, C4NUM); | int oc4 = UP_DIV(conv_param->output_channel_, C4NUM); | ||||
| bool h_in_range = true; | bool h_in_range = true; | ||||
| for (int oh = 0; oh < out_h_block; oh++) { | for (int oh = 0; oh < out_h_block; oh++) { | ||||
| int real_oh = 2 * oh; | |||||
| const int real_oh = 2 * oh; | |||||
| if ((oh + 1) * 2 > conv_param->output_h_) { | if ((oh + 1) * 2 > conv_param->output_h_) { | ||||
| h_in_range = false; | h_in_range = false; | ||||
| } | } | ||||
| @@ -592,7 +592,7 @@ void ConvDw3x3Fp32OutputTrans(float *trans_buffer, float *output_data, const flo | |||||
| float *output_oh = output_data + real_oh * conv_param->output_w_ * oc4 * C4NUM; | float *output_oh = output_data + real_oh * conv_param->output_w_ * oc4 * C4NUM; | ||||
| for (int ow = 0; ow < out_w_block; ow++) { | for (int ow = 0; ow < out_w_block; ow++) { | ||||
| int real_ow = 2 * ow; | |||||
| const int real_ow = 2 * ow; | |||||
| if ((ow + 1) * 2 > conv_param->output_w_) { | if ((ow + 1) * 2 > conv_param->output_w_) { | ||||
| w_in_range = false; | w_in_range = false; | ||||
| } | } | ||||
| @@ -47,13 +47,13 @@ int ResizeBilinear(const float *input_data, float *output_data, const int *input | |||||
| int y_bottom = (int)(floor(actual_y)); | int y_bottom = (int)(floor(actual_y)); | ||||
| int y_top = y_bottom + 1 < in_h ? (y_bottom + 1) : (in_h - 1); | int y_top = y_bottom + 1 < in_h ? (y_bottom + 1) : (in_h - 1); | ||||
| float y_top_weight = actual_y - (float)(y_bottom); | float y_top_weight = actual_y - (float)(y_bottom); | ||||
| float y_bottom_weight = 1.0f - y_top_weight; | |||||
| const float y_bottom_weight = 1.0f - y_top_weight; | |||||
| for (w = 0; w < new_width; w++) { | for (w = 0; w < new_width; w++) { | ||||
| float actual_x = (float)(w)*width_scale; | float actual_x = (float)(w)*width_scale; | ||||
| int x_left = (int)(floor(actual_x)); | int x_left = (int)(floor(actual_x)); | ||||
| int x_right = x_left + 1 < in_w ? (x_left + 1) : (in_w - 1); | int x_right = x_left + 1 < in_w ? (x_left + 1) : (in_w - 1); | ||||
| float x_right_weight = actual_x - (float)(x_left); | float x_right_weight = actual_x - (float)(x_left); | ||||
| float x_left_weight = 1.0f - x_right_weight; | |||||
| const float x_left_weight = 1.0f - x_right_weight; | |||||
| c = 0; | c = 0; | ||||
| #ifdef ENABLE_NEON | #ifdef ENABLE_NEON | ||||
| for (; c <= in_c - 4; c += 4) { | for (; c <= in_c - 4; c += 4) { | ||||
| @@ -30,7 +30,7 @@ int ROIPooling(float *in_ptr, float *out_ptr, float *roi, const int *in_shape, c | |||||
| int pooled_width = param->pooledW_; | int pooled_width = param->pooledW_; | ||||
| int in_stride[DIMENSION_4D]; | int in_stride[DIMENSION_4D]; | ||||
| int out_stride[DIMENSION_4D]; | int out_stride[DIMENSION_4D]; | ||||
| int roi_stride = 5; | |||||
| const int roi_stride = 5; | |||||
| in_stride[DIMENSION_4D - 1] = 1; | in_stride[DIMENSION_4D - 1] = 1; | ||||
| out_stride[DIMENSION_4D - 1] = 1; | out_stride[DIMENSION_4D - 1] = 1; | ||||
| for (int i = dim - 2; i >= 0; --i) { | for (int i = dim - 2; i >= 0; --i) { | ||||
| @@ -138,7 +138,7 @@ void DoPadding(const float *input, float *padded_input, SpaceToBatchParameter pa | |||||
| } | } | ||||
| int SpaceToBatch(const float *input, float *output, SpaceToBatchParameter param, float *tmp_space[3]) { | int SpaceToBatch(const float *input, float *output, SpaceToBatchParameter param, float *tmp_space[3]) { | ||||
| float *padded_input; | |||||
| float *padded_input = NULL; | |||||
| int ret; | int ret; | ||||
| if (param.need_paddings_) { | if (param.need_paddings_) { | ||||
| if (tmp_space[0] == NULL || tmp_space[1] == NULL || tmp_space[2] == NULL) { | if (tmp_space[0] == NULL || tmp_space[1] == NULL || tmp_space[2] == NULL) { | ||||
| @@ -30,7 +30,7 @@ void AvgPoolingGrad(const float *input_ptr, float *output_ptr, PoolingParameter | |||||
| int output_h = pooling_param->output_h_; | int output_h = pooling_param->output_h_; | ||||
| int output_batch = pooling_param->output_batch_; | int output_batch = pooling_param->output_batch_; | ||||
| const float *inPtr; | |||||
| const float *inPtr = NULL; | |||||
| for (int i = 0; i < output_h * output_w * channel * output_batch; i++) output_ptr[i] = 0.0; | for (int i = 0; i < output_h * output_w * channel * output_batch; i++) output_ptr[i] = 0.0; | ||||
| // int pad_top = padding[2]; | // int pad_top = padding[2]; | ||||
| @@ -119,7 +119,7 @@ void MaxPoolingGrad(const float *dy, const int *indices, float *output_ptr, Pool | |||||
| const float *yt = (const float *)(dy); | const float *yt = (const float *)(dy); | ||||
| const int *pos = (const int *)(indices); | const int *pos = (const int *)(indices); | ||||
| float *out; | |||||
| float *out = NULL; | |||||
| if (1) { // grads->layout() == Tensor::nhwc) | if (1) { // grads->layout() == Tensor::nhwc) | ||||
| for (int ib = 0; ib < output_batch; ib++) { | for (int ib = 0; ib < output_batch; ib++) { | ||||
| @@ -34,7 +34,7 @@ void CalcParameter(const int *shape, int dims_number, int axis, int *pre_axis_co | |||||
| void DoArgMinMaxQuant(const int8_t *input, int8_t *output, ArgMinMaxParameter *param, int pre_axis_count, | void DoArgMinMaxQuant(const int8_t *input, int8_t *output, ArgMinMaxParameter *param, int pre_axis_count, | ||||
| int axis_count, int after_axis_count, QuantArg *in_quant_arg, QuantArg *out_quant_arg) { | int axis_count, int after_axis_count, QuantArg *in_quant_arg, QuantArg *out_quant_arg) { | ||||
| bool out_value = param->out_value_; | bool out_value = param->out_value_; | ||||
| float output_inverse_scale = 1.f / out_quant_arg->scale_; | |||||
| const float output_inverse_scale = 1.f / out_quant_arg->scale_; | |||||
| float bias = -in_quant_arg->zp_ * in_quant_arg->scale_; | float bias = -in_quant_arg->zp_ * in_quant_arg->scale_; | ||||
| int32_t output_zp = out_quant_arg->zp_; | int32_t output_zp = out_quant_arg->zp_; | ||||
| for (int i = 0; i < pre_axis_count; ++i) { | for (int i = 0; i < pre_axis_count; ++i) { | ||||
| @@ -28,7 +28,7 @@ void BatchToSpaceNoCropForNHWCInt8(const int8_t *input, int8_t *output, const in | |||||
| size_t output_offset = 0; | size_t output_offset = 0; | ||||
| size_t in_stride_h = in_w * in_c; | size_t in_stride_h = in_w * in_c; | ||||
| size_t in_stride_n = in_stride_h * in_h; | size_t in_stride_n = in_stride_h * in_h; | ||||
| float output_inverse_scale = 1.f / out_quant_arg->scale_; | |||||
| const float output_inverse_scale = 1.f / out_quant_arg->scale_; | |||||
| float scale = in_quant_arg->scale_ * output_inverse_scale; | float scale = in_quant_arg->scale_ * output_inverse_scale; | ||||
| float bias = -in_quant_arg->zp_ * scale; | float bias = -in_quant_arg->zp_ * scale; | ||||
| int32_t output_zp = out_quant_arg->zp_; | int32_t output_zp = out_quant_arg->zp_; | ||||
| @@ -76,7 +76,7 @@ void BatchToSpaceForNHWCInt8(const int8_t *input, int8_t *output, const int *in_ | |||||
| size_t in_stride_h = in_w * in_c; | size_t in_stride_h = in_w * in_c; | ||||
| size_t in_stride_n = in_stride_h * in_h; | size_t in_stride_n = in_stride_h * in_h; | ||||
| float output_inverse_scale = 1.f / out_quant_arg->scale_; | |||||
| const float output_inverse_scale = 1.f / out_quant_arg->scale_; | |||||
| float scale = in_quant_arg->scale_ * output_inverse_scale; | float scale = in_quant_arg->scale_ * output_inverse_scale; | ||||
| float bias = -in_quant_arg->zp_ * scale; | float bias = -in_quant_arg->zp_ * scale; | ||||
| int32_t output_zp = out_quant_arg->zp_; | int32_t output_zp = out_quant_arg->zp_; | ||||
| @@ -20,7 +20,7 @@ | |||||
| void Int8Concat(int8_t **inputs, int8_t *output, ConcatParameter *para, int axis, int64_t real_dst_count, int task_id) { | void Int8Concat(int8_t **inputs, int8_t *output, ConcatParameter *para, int axis, int64_t real_dst_count, int task_id) { | ||||
| float output_scale = para->quant_arg_.out_args_.scale_; | float output_scale = para->quant_arg_.out_args_.scale_; | ||||
| float output_inverse_scale = 1.f / output_scale; | |||||
| const float output_inverse_scale = 1.f / output_scale; | |||||
| int input_num = para->input_num_; | int input_num = para->input_num_; | ||||
| int count_unit_ = para->count_unit_; | int count_unit_ = para->count_unit_; | ||||
| int after_axis_size = para->after_axis_size; | int after_axis_size = para->after_axis_size; | ||||
| @@ -201,7 +201,7 @@ void Conv3x3Uint8Gemm(int32_t *dst, const int16_t *src, const int16_t *weight, i | |||||
| #ifdef ENABLE_ARM | #ifdef ENABLE_ARM | ||||
| IndirectGemmInt16to32_8x4(dst, src, weight, 16, ic8, oc4, oc4 * 4 * 16 * sizeof(int32_t)); | IndirectGemmInt16to32_8x4(dst, src, weight, 16, ic8, oc4, oc4 * 4 * 16 * sizeof(int32_t)); | ||||
| #else | #else | ||||
| int input_unit_square = 16; | |||||
| const int input_unit_square = 16; | |||||
| for (int c = 0; c < oc4; c++) { | for (int c = 0; c < oc4; c++) { | ||||
| int filter_oc_offset = c * input_unit_square * ic8 * C8NUM * C4NUM; | int filter_oc_offset = c * input_unit_square * ic8 * C8NUM * C4NUM; | ||||
| int dst_oc_offset = c * input_unit_square * C4NUM; | int dst_oc_offset = c * input_unit_square * C4NUM; | ||||
| @@ -22,7 +22,7 @@ void DepthToSpaceForNHWCInt8(const int8_t *input, int8_t *output, int *in_shape, | |||||
| int32_t in_shape_dim2 = in_shape[2]; | int32_t in_shape_dim2 = in_shape[2]; | ||||
| int32_t in_shape_dim1 = in_shape[1]; | int32_t in_shape_dim1 = in_shape[1]; | ||||
| size_t copy_size = block_size * param->out_stride_dim2_; | size_t copy_size = block_size * param->out_stride_dim2_; | ||||
| float output_inverse_scale = 1.f / out_quant_arg->scale_; | |||||
| const float output_inverse_scale = 1.f / out_quant_arg->scale_; | |||||
| float scale = in_quant_arg->scale_ * output_inverse_scale; | float scale = in_quant_arg->scale_ * output_inverse_scale; | ||||
| float bias = -in_quant_arg->zp_ * scale; | float bias = -in_quant_arg->zp_ * scale; | ||||
| int32_t output_zp = out_quant_arg->zp_; | int32_t output_zp = out_quant_arg->zp_; | ||||
| @@ -36,8 +36,8 @@ void AvgPoolingInt8(const int8_t *input_ptr, int8_t *output_ptr, PoolingParamete | |||||
| float output_scale = pooling_param->quant_args_[1][0].scale_; | float output_scale = pooling_param->quant_args_[1][0].scale_; | ||||
| int output_zp = pooling_param->quant_args_[1][0].zp_; | int output_zp = pooling_param->quant_args_[1][0].zp_; | ||||
| double real_multiplier = input_scale / output_scale; | double real_multiplier = input_scale / output_scale; | ||||
| int8_t out_min = INT8_MIN; | |||||
| int8_t out_max = INT8_MAX; | |||||
| const int8_t out_min = INT8_MIN; | |||||
| const int8_t out_max = INT8_MAX; | |||||
| for (int batch = 0; batch < output_batch; batch++) { | for (int batch = 0; batch < output_batch; batch++) { | ||||
| int in_batch_offset = batch * in_h * in_w * channel; | int in_batch_offset = batch * in_h * in_w * channel; | ||||
| @@ -91,8 +91,8 @@ void AvgPoolingOptInt8(const int8_t *input_ptr, int8_t *output_ptr, PoolingParam | |||||
| int out_tile_count = UP_DIV(out_plane, TILE_NUM); | int out_tile_count = UP_DIV(out_plane, TILE_NUM); | ||||
| int thread_num = pooling_param->thread_num_; | int thread_num = pooling_param->thread_num_; | ||||
| int c8 = UP_DIV(channel, C8NUM); | int c8 = UP_DIV(channel, C8NUM); | ||||
| int8_t out_min = INT8_MIN; | |||||
| int8_t out_max = INT8_MAX; | |||||
| const int8_t out_min = INT8_MIN; | |||||
| const int8_t out_max = INT8_MAX; | |||||
| for (int batch = 0; batch < output_batch; batch++) { | for (int batch = 0; batch < output_batch; batch++) { | ||||
| int in_batch_offset = batch * in_h * in_w * channel; | int in_batch_offset = batch * in_h * in_w * channel; | ||||
| @@ -20,7 +20,7 @@ | |||||
| void prelu(int8_t *inputs, int8_t *output_ptr, PreluParameter *quant_prelu_parm, int task_id) { | void prelu(int8_t *inputs, int8_t *output_ptr, PreluParameter *quant_prelu_parm, int task_id) { | ||||
| float output_scale = quant_prelu_parm->quant_arg.out_args_.scale_; | float output_scale = quant_prelu_parm->quant_arg.out_args_.scale_; | ||||
| int output_zp = quant_prelu_parm->quant_arg.out_args_.zp_; | int output_zp = quant_prelu_parm->quant_arg.out_args_.zp_; | ||||
| float output_inverse_scale = 1.f / output_scale; | |||||
| const float output_inverse_scale = 1.f / output_scale; | |||||
| int output_dim = quant_prelu_parm->input_dim_; | int output_dim = quant_prelu_parm->input_dim_; | ||||
| QuantArg *input_quant = NULL; | QuantArg *input_quant = NULL; | ||||
| @@ -22,7 +22,7 @@ void Int8Reshape(int8_t *input_ptr, int8_t *output_ptr, int64_t real_dst_count, | |||||
| if (para.in_args_.scale_ == para.out_args_.scale_ && para.in_args_.zp_ == para.out_args_.zp_) { | if (para.in_args_.scale_ == para.out_args_.scale_ && para.in_args_.zp_ == para.out_args_.zp_) { | ||||
| memcpy(output_ptr, input_ptr, real_dst_count); | memcpy(output_ptr, input_ptr, real_dst_count); | ||||
| } else { | } else { | ||||
| float output_inverse_scale = 1.f / para.out_args_.scale_; | |||||
| const float output_inverse_scale = 1.f / para.out_args_.scale_; | |||||
| float scale = para.in_args_.scale_ * output_inverse_scale; | float scale = para.in_args_.scale_ * output_inverse_scale; | ||||
| float bias = -para.in_args_.zp_ * scale; | float bias = -para.in_args_.zp_ * scale; | ||||
| int32_t output_zp = para.out_args_.zp_; | int32_t output_zp = para.out_args_.zp_; | ||||
| @@ -115,6 +115,9 @@ int ResizeNearestNeighborInt8Simple(const int8_t *input_data, int8_t *output_dat | |||||
| } | } | ||||
| void ComputeScale(const int32_t in_value, const int32_t out_value, const bool align_corners, int32_t *scale) { | void ComputeScale(const int32_t in_value, const int32_t out_value, const bool align_corners, int32_t *scale) { | ||||
| if (out_value == 0) { | |||||
| return; | |||||
| } | |||||
| *scale = (in_value * (1 << 10) + out_value / 2) / out_value; | *scale = (in_value * (1 << 10) + out_value / 2) / out_value; | ||||
| if (align_corners && out_value > 1) { | if (align_corners && out_value > 1) { | ||||
| *scale = ((in_value - 1) * (1 << 10) + (out_value - 1) / 2) / (out_value - 1); | *scale = ((in_value - 1) * (1 << 10) + (out_value - 1) / 2) / (out_value - 1); | ||||
| @@ -133,6 +136,9 @@ void ComputeInterpolationArgs(const int32_t pos, const int32_t scale, const int3 | |||||
| void ComputeNearestNeighborInt(const int32_t pos, const int in_size, const int32_t new_size, const bool align_corners, | void ComputeNearestNeighborInt(const int32_t pos, const int in_size, const int32_t new_size, const bool align_corners, | ||||
| int32_t *nearest) { | int32_t *nearest) { | ||||
| if (new_size == 0) { | |||||
| return; | |||||
| } | |||||
| *nearest = (in_size * pos) / new_size; | *nearest = (in_size * pos) / new_size; | ||||
| if (align_corners) { | if (align_corners) { | ||||
| *nearest = ((in_size - 1) * pos + (new_size - 1) / 2) / (new_size - 1); | *nearest = ((in_size - 1) * pos + (new_size - 1) / 2) / (new_size - 1); | ||||
| @@ -20,11 +20,11 @@ | |||||
| void Squeeze(int8_t **inputs, int8_t *output_ptr, int task_id, SqueezeQuantArg *quant_Squeeze_parm, | void Squeeze(int8_t **inputs, int8_t *output_ptr, int task_id, SqueezeQuantArg *quant_Squeeze_parm, | ||||
| SqueezeParameter *para_, size_t osize) { | SqueezeParameter *para_, size_t osize) { | ||||
| float output_scale = quant_Squeeze_parm->out_quant_args_.scale_; | float output_scale = quant_Squeeze_parm->out_quant_args_.scale_; | ||||
| float output_inverse_scale = 1.f / output_scale; | |||||
| const float output_inverse_scale = 1.f / output_scale; | |||||
| QuantArg *input_quant = quant_Squeeze_parm->in_quant_args_; | QuantArg *input_quant = quant_Squeeze_parm->in_quant_args_; | ||||
| int output_zp = quant_Squeeze_parm->out_quant_args_.zp_; | int output_zp = quant_Squeeze_parm->out_quant_args_.zp_; | ||||
| int i = 0; | |||||
| const int i = 0; | |||||
| int8_t *input_ptr = inputs[0]; | int8_t *input_ptr = inputs[0]; | ||||
| for (int j = task_id; j < osize; j += para_->op_parameter_.thread_num_) { | for (int j = task_id; j < osize; j += para_->op_parameter_.thread_num_) { | ||||
| float scale = input_quant[i].scale_ * output_inverse_scale; | float scale = input_quant[i].scale_ * output_inverse_scale; | ||||
| @@ -21,6 +21,9 @@ | |||||
| void PackWeightFp32(float *weight_data, ConvParameter *conv_param, float *packed_weight, int oc_block, | void PackWeightFp32(float *weight_data, ConvParameter *conv_param, float *packed_weight, int oc_block, | ||||
| int oc_block_num) { | int oc_block_num) { | ||||
| // original weight format : ohwi | // original weight format : ohwi | ||||
| if (oc_block_num == 0) { | |||||
| return; | |||||
| } | |||||
| int kernel_h = conv_param->kernel_h_; | int kernel_h = conv_param->kernel_h_; | ||||
| int kernel_w = conv_param->kernel_w_; | int kernel_w = conv_param->kernel_w_; | ||||
| int in_channel = conv_param->input_channel_; | int in_channel = conv_param->input_channel_; | ||||
| @@ -30,7 +33,7 @@ void PackWeightFp32(float *weight_data, ConvParameter *conv_param, float *packed | |||||
| int pack_weight_size = oc_block * oc_block_num * ic4 * C4NUM * kernel_plane; | int pack_weight_size = oc_block * oc_block_num * ic4 * C4NUM * kernel_plane; | ||||
| int unit_size = oc_block * C4NUM; | int unit_size = oc_block * C4NUM; | ||||
| int block_size = pack_weight_size / oc_block_num; | |||||
| const int block_size = pack_weight_size / oc_block_num; | |||||
| for (int m = 0; m < kernel_plane; m++) { | for (int m = 0; m < kernel_plane; m++) { | ||||
| int kernel_plane_stride = m * in_channel; | int kernel_plane_stride = m * in_channel; | ||||
| @@ -19,6 +19,9 @@ | |||||
| #include "nnacl/prior_box.h" | #include "nnacl/prior_box.h" | ||||
| int PriorBox(const float *input_data, float *output_data, const size_t size, const int tid, const int thread_num) { | int PriorBox(const float *input_data, float *output_data, const size_t size, const int tid, const int thread_num) { | ||||
| if (thread_num == 0) { | |||||
| return NNACL_ERR; | |||||
| } | |||||
| size_t unit_size = size / thread_num; | size_t unit_size = size / thread_num; | ||||
| if (tid == thread_num - 1) { | if (tid == thread_num - 1) { | ||||
| size_t tail_size = size - unit_size * tid; | size_t tail_size = size - unit_size * tid; | ||||
| @@ -26,7 +26,7 @@ int Find(float *array, int len, float target) { | |||||
| } | } | ||||
| void Unique(float *input, int input_len, float *output0, int *output0_len, int *output1) { | void Unique(float *input, int input_len, float *output0, int *output0_len, int *output1) { | ||||
| output0_len = 0; | |||||
| *output0_len = 0; | |||||
| for (int i = 0; i < input_len; i++) { | for (int i = 0; i < input_len; i++) { | ||||
| int idx = Find(output0, *output0_len, input[i]); | int idx = Find(output0, *output0_len, input[i]); | ||||
| if (idx != -1) { | if (idx != -1) { | ||||
| @@ -28,7 +28,9 @@ void WinogradInputTransform(const float *input_data, float *trans_input, float * | |||||
| int pad_w = conv_param->pad_w_; | int pad_w = conv_param->pad_w_; | ||||
| int input_h = conv_param->input_h_; | int input_h = conv_param->input_h_; | ||||
| int input_w = conv_param->input_w_; | int input_w = conv_param->input_w_; | ||||
| if (out_w_block_num == 0) { | |||||
| return; | |||||
| } | |||||
| for (int c = 0; c < cal_num; c++) { // actual tiled number | for (int c = 0; c < cal_num; c++) { // actual tiled number | ||||
| int src_x_s = (out_tile_index % out_w_block_num) * output_unit - pad_w; | int src_x_s = (out_tile_index % out_w_block_num) * output_unit - pad_w; | ||||
| int src_y_s = (out_tile_index / out_w_block_num) * output_unit - pad_h; | int src_y_s = (out_tile_index / out_w_block_num) * output_unit - pad_h; | ||||
| @@ -83,7 +85,9 @@ void WinogradOutputTransform(const float *gemm_out, float *tmp_out_data, const f | |||||
| int output_channel = conv_param->output_channel_; | int output_channel = conv_param->output_channel_; | ||||
| int oc4 = UP_DIV(output_channel, C4NUM); | int oc4 = UP_DIV(output_channel, C4NUM); | ||||
| int input_unit = conv_param->input_unit_; | int input_unit = conv_param->input_unit_; | ||||
| if (output_unit_num == 0) { | |||||
| return; | |||||
| } | |||||
| for (int i = 0; i < cal_num; i++) { | for (int i = 0; i < cal_num; i++) { | ||||
| int dst_x_s = out_tile_index % output_unit_num; | int dst_x_s = out_tile_index % output_unit_num; | ||||
| int dst_y_s = out_tile_index / output_unit_num; | int dst_y_s = out_tile_index / output_unit_num; | ||||
| @@ -281,7 +285,9 @@ void Conv3x3Fp32InputTransform(const float *input_data, float *trans_input, floa | |||||
| int pad_h = conv_param->pad_h_; | int pad_h = conv_param->pad_h_; | ||||
| int ic4 = UP_DIV(input_channel, C4NUM); | int ic4 = UP_DIV(input_channel, C4NUM); | ||||
| int input_unit = 4; | int input_unit = 4; | ||||
| if (out_w_block == 0) { | |||||
| return; | |||||
| } | |||||
| for (int cal_id = 0; cal_id < real_cal_num; cal_id++) { | for (int cal_id = 0; cal_id < real_cal_num; cal_id++) { | ||||
| int x_id = start_index + cal_id; | int x_id = start_index + cal_id; | ||||
| int origin_x = (x_id % out_w_block) * OUPUT_UNIT - pad_w; | int origin_x = (x_id % out_w_block) * OUPUT_UNIT - pad_w; | ||||
| @@ -328,8 +334,11 @@ void Conv3x3Fp32InputTransform(const float *input_data, float *trans_input, floa | |||||
| void Conv3x3Fp32FilterTransform(float *weight_data, float *trans_weight, int iC4, int output_channel, int kernel_plane, | void Conv3x3Fp32FilterTransform(float *weight_data, float *trans_weight, int iC4, int output_channel, int kernel_plane, | ||||
| int oc_block) { | int oc_block) { | ||||
| int input_unit = 4; | |||||
| const int input_unit = 4; | |||||
| int dst_step = iC4 * C4NUM * oc_block; | int dst_step = iC4 * C4NUM * oc_block; | ||||
| if (oc_block == 0) { | |||||
| return; | |||||
| } | |||||
| for (int o = 0; o < output_channel; o++) { | for (int o = 0; o < output_channel; o++) { | ||||
| int oc_block_num = o / oc_block; | int oc_block_num = o / oc_block; | ||||
| int oc_block_rem = o % oc_block; | int oc_block_rem = o % oc_block; | ||||
| @@ -485,36 +494,36 @@ void Conv3x3Fp32FilterTransform(float *weight_data, float *trans_weight, int iC4 | |||||
| float dst01 = (local_ptr + 4)[0]; | float dst01 = (local_ptr + 4)[0]; | ||||
| float dst02 = (local_ptr + 8)[0]; | float dst02 = (local_ptr + 8)[0]; | ||||
| float dst10 = 0.5f * local_ptr[0] + 0.5f * (local_ptr + 12)[0] + 0.5f * (local_ptr + 24)[0]; | |||||
| float dst11 = 0.5f * (local_ptr + 4)[0] + 0.5f * (local_ptr + 16)[0] + 0.5f * (local_ptr + 28)[0]; | |||||
| float dst12 = 0.5f * (local_ptr + 8)[0] + 0.5f * (local_ptr + 20)[0] + 0.5f * (local_ptr + 32)[0]; | |||||
| const float dst10 = 0.5f * local_ptr[0] + 0.5f * (local_ptr + 12)[0] + 0.5f * (local_ptr + 24)[0]; | |||||
| const float dst11 = 0.5f * (local_ptr + 4)[0] + 0.5f * (local_ptr + 16)[0] + 0.5f * (local_ptr + 28)[0]; | |||||
| const float dst12 = 0.5f * (local_ptr + 8)[0] + 0.5f * (local_ptr + 20)[0] + 0.5f * (local_ptr + 32)[0]; | |||||
| float dst20 = 0.5f * local_ptr[0] - 0.5f * (local_ptr + 12)[0] + 0.5f * (local_ptr + 24)[0]; | |||||
| float dst21 = 0.5f * (local_ptr + 4)[0] - 0.5f * (local_ptr + 16)[0] + 0.5f * (local_ptr + 28)[0]; | |||||
| float dst22 = 0.5f * (local_ptr + 8)[0] - 0.5f * (local_ptr + 20)[0] + 0.5f * (local_ptr + 32)[0]; | |||||
| const float dst20 = 0.5f * local_ptr[0] - 0.5f * (local_ptr + 12)[0] + 0.5f * (local_ptr + 24)[0]; | |||||
| const float dst21 = 0.5f * (local_ptr + 4)[0] - 0.5f * (local_ptr + 16)[0] + 0.5f * (local_ptr + 28)[0]; | |||||
| const float dst22 = 0.5f * (local_ptr + 8)[0] - 0.5f * (local_ptr + 20)[0] + 0.5f * (local_ptr + 32)[0]; | |||||
| float dst30 = (local_ptr + 24)[0]; | float dst30 = (local_ptr + 24)[0]; | ||||
| float dst31 = (local_ptr + 28)[0]; | float dst31 = (local_ptr + 28)[0]; | ||||
| float dst32 = (local_ptr + 32)[0]; | float dst32 = (local_ptr + 32)[0]; | ||||
| float m00 = dst00; | float m00 = dst00; | ||||
| float m01 = 0.5f * dst00 + 0.5f * dst01 + 0.5f * dst02; | |||||
| float m02 = 0.5f * dst00 - 0.5f * dst01 + 0.5f * dst02; | |||||
| const float m01 = 0.5f * dst00 + 0.5f * dst01 + 0.5f * dst02; | |||||
| const float m02 = 0.5f * dst00 - 0.5f * dst01 + 0.5f * dst02; | |||||
| float m03 = dst02; | float m03 = dst02; | ||||
| float m10 = dst10; | float m10 = dst10; | ||||
| float m11 = 0.5f * dst10 + 0.5f * dst11 + 0.5f * dst12; | |||||
| float m12 = 0.5f * dst10 - 0.5f * dst11 + 0.5f * dst12; | |||||
| const float m11 = 0.5f * dst10 + 0.5f * dst11 + 0.5f * dst12; | |||||
| const float m12 = 0.5f * dst10 - 0.5f * dst11 + 0.5f * dst12; | |||||
| float m13 = dst12; | float m13 = dst12; | ||||
| float m20 = dst20; | float m20 = dst20; | ||||
| float m21 = 0.5f * dst20 + 0.5f * dst21 + 0.5f * dst22; | |||||
| float m22 = 0.5f * dst20 - 0.5f * dst21 + 0.5f * dst22; | |||||
| const float m21 = 0.5f * dst20 + 0.5f * dst21 + 0.5f * dst22; | |||||
| const float m22 = 0.5f * dst20 - 0.5f * dst21 + 0.5f * dst22; | |||||
| float m23 = dst22; | float m23 = dst22; | ||||
| float m30 = dst30; | float m30 = dst30; | ||||
| float m31 = 0.5f * dst30 + 0.5f * dst31 + 0.5f * dst32; | |||||
| float m32 = 0.5f * dst30 - 0.5f * dst31 + 0.5f * dst32; | |||||
| const float m31 = 0.5f * dst30 + 0.5f * dst31 + 0.5f * dst32; | |||||
| const float m32 = 0.5f * dst30 - 0.5f * dst31 + 0.5f * dst32; | |||||
| float m33 = dst32; | float m33 = dst32; | ||||
| *(dst_ic4_ptr + j * 8) = m00; | *(dst_ic4_ptr + j * 8) = m00; | ||||
| @@ -652,8 +661,10 @@ void Conv3x3Fp32OutputTransform(const float *gemm_out, float *out_data, const fl | |||||
| int output_w = conv_param->output_w_; | int output_w = conv_param->output_w_; | ||||
| int output_h = conv_param->output_h_; | int output_h = conv_param->output_h_; | ||||
| int oc4 = UP_DIV(output_channel, C4NUM); | int oc4 = UP_DIV(output_channel, C4NUM); | ||||
| int input_unit = 4; | |||||
| const int input_unit = 4; | |||||
| if (out_w_block == 0) { | |||||
| return; | |||||
| } | |||||
| for (int i = 0; i < real_cal_num; i++) { | for (int i = 0; i < real_cal_num; i++) { | ||||
| int out_w_index = (start_index + i) % out_w_block; | int out_w_index = (start_index + i) % out_w_block; | ||||
| int out_h_index = (start_index + i) / out_w_block; | int out_h_index = (start_index + i) / out_w_block; | ||||
| @@ -855,9 +866,11 @@ void Conv3x3Uint8InputTransform(const int16_t *input_data, int16_t *trans_input, | |||||
| int pad_h = conv_param->pad_h_; | int pad_h = conv_param->pad_h_; | ||||
| ConvQuantArg quant_arg = conv_param->conv_quant_arg_; | ConvQuantArg quant_arg = conv_param->conv_quant_arg_; | ||||
| int input_zp = quant_arg.input_quant_args_[0].zp_; | int input_zp = quant_arg.input_quant_args_[0].zp_; | ||||
| int ic8 = UP_DIV(input_channel, C8NUM); | |||||
| int input_unit = 4; | |||||
| const int ic8 = UP_DIV(input_channel, C8NUM); | |||||
| const int input_unit = 4; | |||||
| if (out_w_block == 0) { | |||||
| return; | |||||
| } | |||||
| for (int cal_id = 0; cal_id < real_cal_num; cal_id++) { | for (int cal_id = 0; cal_id < real_cal_num; cal_id++) { | ||||
| int x_id = start_index + cal_id; | int x_id = start_index + cal_id; | ||||
| int origin_x = (x_id % out_w_block) * OUPUT_UNIT - pad_w; | int origin_x = (x_id % out_w_block) * OUPUT_UNIT - pad_w; | ||||
| @@ -890,7 +903,7 @@ void Conv3x3Uint8InputTransform(const int16_t *input_data, int16_t *trans_input, | |||||
| void Conv3x3Int8FilterTransform(const int16_t *weight_data, int16_t *trans_weight, int iC8, int output_channel, | void Conv3x3Int8FilterTransform(const int16_t *weight_data, int16_t *trans_weight, int iC8, int output_channel, | ||||
| int kernel_plane) { | int kernel_plane) { | ||||
| int input_unit = 4; | |||||
| const int input_unit = 4; | |||||
| int dst_step = iC8 * C8NUM * C4NUM; | int dst_step = iC8 * C8NUM * C4NUM; | ||||
| for (int o = 0; o < output_channel; o++) { | for (int o = 0; o < output_channel; o++) { | ||||
| int oc4_block_num = o / C4NUM; | int oc4_block_num = o / C4NUM; | ||||
| @@ -1441,9 +1454,11 @@ void Conv3x3Uint8OutputTransform(const int32_t *gemm_out, int8_t *out_data, cons | |||||
| int output_channel = conv_param->output_channel_; | int output_channel = conv_param->output_channel_; | ||||
| int output_w = conv_param->output_w_; | int output_w = conv_param->output_w_; | ||||
| int output_h = conv_param->output_h_; | int output_h = conv_param->output_h_; | ||||
| int oc4 = UP_DIV(output_channel, C4NUM); | |||||
| int input_unit = 4; | |||||
| const int oc4 = UP_DIV(output_channel, C4NUM); | |||||
| const int input_unit = 4; | |||||
| if (out_w_block == 0) { | |||||
| return; | |||||
| } | |||||
| for (int i = 0; i < real_cal_num; i++) { | for (int i = 0; i < real_cal_num; i++) { | ||||
| int out_w_index = (start_index + i) % out_w_block; | int out_w_index = (start_index + i) % out_w_block; | ||||
| int out_h_index = (start_index + i) / out_w_block; | int out_h_index = (start_index + i) / out_w_block; | ||||
| @@ -49,6 +49,9 @@ __kernel void ElementDiv(__read_only image2d_t input_a, __read_only image2d_t in | |||||
| float4 a = read_imagef(input_a, smp_none, (int2)(X, Y)); | float4 a = read_imagef(input_a, smp_none, (int2)(X, Y)); | ||||
| float4 b = read_imagef(input_b, smp_none, (int2)(X, Y)); | float4 b = read_imagef(input_b, smp_none, (int2)(X, Y)); | ||||
| if (b == 0) { | |||||
| return; | |||||
| } | |||||
| write_imagef(output, (int2)(X, Y), a / b); | write_imagef(output, (int2)(X, Y), a / b); | ||||
| } | } | ||||
| @@ -249,6 +249,10 @@ int ConvolutionOpenCLKernel::GetGlobalLocal(std::vector<size_t> *global, std::ve | |||||
| size_t global_c = UP_DIV(UP_DIV(param->output_channel_, C4NUM), work_group_size[2]) * work_group_size[2]; | size_t global_c = UP_DIV(UP_DIV(param->output_channel_, C4NUM), work_group_size[2]) * work_group_size[2]; | ||||
| size_t local_c = GetBiggestDivider(global_c, max_z_size); | size_t local_c = GetBiggestDivider(global_c, max_z_size); | ||||
| if (local_c == 0) { | |||||
| MS_LOG(ERROR) << "Divide by zero"; | |||||
| return RET_ERROR; | |||||
| } | |||||
| size_t local_hw_size = std::min<size_t>(256, max_work_group_size) / local_c; | size_t local_hw_size = std::min<size_t>(256, max_work_group_size) / local_c; | ||||
| size_t local_w = std::min(global_w, local_hw_size); | size_t local_w = std::min(global_w, local_hw_size); | ||||
| size_t local_h = std::min(local_hw_size / local_w, global_h); | size_t local_h = std::min(local_hw_size / local_w, global_h); | ||||
| @@ -32,6 +32,10 @@ std::vector<size_t> GetCommonGlobalSize(const std::vector<size_t> &local, const | |||||
| std::vector<size_t> GetCommonLocalSize(const std::vector<size_t> &global, int max_size) { | std::vector<size_t> GetCommonLocalSize(const std::vector<size_t> &global, int max_size) { | ||||
| size_t wg_z = GetBiggestDividerWithPriority(global[2], 8); | size_t wg_z = GetBiggestDividerWithPriority(global[2], 8); | ||||
| if (wg_z == 0) { | |||||
| MS_LOG(ERROR) << "Divide by zero"; | |||||
| return {}; | |||||
| } | |||||
| size_t wg_xy_size = max_size / wg_z; | size_t wg_xy_size = max_size / wg_z; | ||||
| size_t wg_x = std::min(DivideRoundUp(global[0], 2), wg_xy_size); | size_t wg_x = std::min(DivideRoundUp(global[0], 2), wg_xy_size); | ||||
| size_t wg_y = std::min(wg_xy_size / wg_x, global[1]); | size_t wg_y = std::min(wg_xy_size / wg_x, global[1]); | ||||
| @@ -130,6 +130,7 @@ void *OpenCLAllocator::Malloc(size_t size, const std::vector<size_t>& img_size) | |||||
| if (ret != CL_SUCCESS) { | if (ret != CL_SUCCESS) { | ||||
| MS_LOG(ERROR) << "Create OpenCL Image2D failed! (ERROR CODE: " << ret << ")"; | MS_LOG(ERROR) << "Create OpenCL Image2D failed! (ERROR CODE: " << ret << ")"; | ||||
| UnLock(); | UnLock(); | ||||
| delete buffer; | |||||
| return nullptr; | return nullptr; | ||||
| } | } | ||||
| device_ptr = static_cast<void *>(buffer); | device_ptr = static_cast<void *>(buffer); | ||||
| @@ -138,6 +139,7 @@ void *OpenCLAllocator::Malloc(size_t size, const std::vector<size_t>& img_size) | |||||
| if (host_ptr == nullptr) { | if (host_ptr == nullptr) { | ||||
| MS_LOG(ERROR) << "Map buffer failed, can not found buffer :" << device_ptr << ", host_ptr=" << host_ptr; | MS_LOG(ERROR) << "Map buffer failed, can not found buffer :" << device_ptr << ", host_ptr=" << host_ptr; | ||||
| UnLock(); | UnLock(); | ||||
| return nullptr; | return nullptr; | ||||
| } | } | ||||
| cl::Memory *mem = buffer; | cl::Memory *mem = buffer; | ||||
| @@ -187,6 +189,7 @@ void *OpenCLAllocator::CreateImageFromHost(void *data, size_t size, const std::v | |||||
| if (ret != CL_SUCCESS) { | if (ret != CL_SUCCESS) { | ||||
| MS_LOG(ERROR) << "Create OpenCL Image2D failed! (ERROR CODE: " << ret << ")"; | MS_LOG(ERROR) << "Create OpenCL Image2D failed! (ERROR CODE: " << ret << ")"; | ||||
| UnLock(); | UnLock(); | ||||
| delete buffer; | |||||
| return nullptr; | return nullptr; | ||||
| } | } | ||||
| device_ptr = static_cast<void *>(buffer); | device_ptr = static_cast<void *>(buffer); | ||||
| @@ -195,7 +195,7 @@ kernel::LiteKernel *Scheduler::ScheduleNode(const std::vector<tensor::Tensor *> | |||||
| } | } | ||||
| desc.arch = kernel::KERNEL_ARCH::kCPU; | desc.arch = kernel::KERNEL_ARCH::kCPU; | ||||
| kernel::LiteKernel *kernel; | |||||
| kernel::LiteKernel *kernel = nullptr; | |||||
| if (data_type == kNumberTypeFloat32) { | if (data_type == kNumberTypeFloat32) { | ||||
| // check if support fp16 | // check if support fp16 | ||||
| kernel::KernelKey key{desc.arch, kNumberTypeFloat16, desc.type}; | kernel::KernelKey key{desc.arch, kNumberTypeFloat16, desc.type}; | ||||
| @@ -25,6 +25,10 @@ std::shared_ptr<ModelImpl> Import(const char *model_buf, size_t size) { | |||||
| return nullptr; | return nullptr; | ||||
| } | } | ||||
| // todo hangangqiang remove when copy primitive done | // todo hangangqiang remove when copy primitive done | ||||
| if (size <= 0) { | |||||
| MS_LOG(ERROR) << "size is zero"; | |||||
| return nullptr; | |||||
| } | |||||
| auto *inner_buf = new char[size]; | auto *inner_buf = new char[size]; | ||||
| memcpy(inner_buf, model_buf, size); | memcpy(inner_buf, model_buf, size); | ||||
| auto meta_graph = schema::GetMetaGraph(inner_buf); | auto meta_graph = schema::GetMetaGraph(inner_buf); | ||||
| @@ -462,7 +462,7 @@ void BenchmarkFlags::InitInputDataList() { | |||||
| char *cur_input; | char *cur_input; | ||||
| const char *split_c = ","; | const char *split_c = ","; | ||||
| cur_input = strtok(input_list, split_c); | cur_input = strtok(input_list, split_c); | ||||
| while (cur_input) { | |||||
| while (cur_input != nullptr) { | |||||
| input_data_list.emplace_back(cur_input); | input_data_list.emplace_back(cur_input); | ||||
| cur_input = strtok(nullptr, split_c); | cur_input = strtok(nullptr, split_c); | ||||
| } | } | ||||
| @@ -117,9 +117,9 @@ STATUS OnnxEltwiseParser::Parse(const onnx::GraphProto &onnx_graph, | |||||
| std::unique_ptr<schema::EltwiseT> attr(new schema::EltwiseT()); | std::unique_ptr<schema::EltwiseT> attr(new schema::EltwiseT()); | ||||
| if (onnx_node.op_type() == "Prod") { | if (onnx_node.op_type() == "Prod") { | ||||
| attr->mode = schema::EltwiseMode_PROD; | attr->mode = schema::EltwiseMode_PROD; | ||||
| } else if (onnx_node.op_type() == "Prod") { | |||||
| attr->mode = schema::EltwiseMode_SUM; | |||||
| } else if (onnx_node.op_type() == "Sum") { | } else if (onnx_node.op_type() == "Sum") { | ||||
| attr->mode = schema::EltwiseMode_SUM; | |||||
| } else if (onnx_node.op_type() == "Maximum") { | |||||
| attr->mode = schema::EltwiseMode_MAXIMUM; | attr->mode = schema::EltwiseMode_MAXIMUM; | ||||
| } | } | ||||
| @@ -166,6 +166,7 @@ const void ConvTransformFusion::CalNewWeightTensor(float *weight_data, int kerne | |||||
| auto data_size = kernel_num * kernel_size * sizeof(float); | auto data_size = kernel_num * kernel_size * sizeof(float); | ||||
| if (0 != memset_s(tmp_weight_data, data_size, 0, data_size)) { | if (0 != memset_s(tmp_weight_data, data_size, 0, data_size)) { | ||||
| MS_LOG(EXCEPTION) << "memset newWeightData failed"; | MS_LOG(EXCEPTION) << "memset newWeightData failed"; | ||||
| delete[] tmp_weight_data; | |||||
| return; | return; | ||||
| } | } | ||||