Merge pull request !4437 from 张学同/to_merge (tags/v0.7.0-beta)
| @@ -706,6 +706,7 @@ build_lite() | |||
| mkdir -p ${OUTPUT_DIR}/include/schema/ | |||
| cp ${BASEPATH}/mindspore/lite/schema/*.h ${OUTPUT_DIR}/include/schema/ | |||
| cp ${BASEPATH}/mindspore/lite/build/src/libmindspore-lite.so ${OUTPUT_DIR}/lib/ | |||
| cp ${BASEPATH}/mindspore/lite/build/src/runtime/kernel/arm/nnacl/liboptimize.so ${OUTPUT_DIR}/lib/ | |||
| mkdir -p ${OUTPUT_DIR}/third_party/flatbuffers | |||
| cp -r ${BASEPATH}/third_party/flatbuffers/include/ ${OUTPUT_DIR}/third_party/flatbuffers/ | |||
| cd .. | |||
| @@ -266,7 +266,8 @@ int Convolution3x3FP16CPUKernel::Run() { | |||
| } | |||
| auto input_tensor = in_tensors_.at(kInputIndex); | |||
| auto ori_input_data = reinterpret_cast<float *>(input_tensor->Data()); | |||
| for (int i = 0; i < input_tensor->ElementsNum(); ++i) { | |||
| auto input_element_num = input_tensor->ElementsNum(); | |||
| for (int i = 0; i < input_element_num; ++i) { | |||
| fp16_input_[i] = (float16_t)ori_input_data[i]; | |||
| } | |||
| @@ -285,7 +286,9 @@ int Convolution3x3FP16CPUKernel::Run() { | |||
| // cast fp16 out to fp32 data | |||
| auto out_tensor = out_tensors_.at(kOutputIndex); | |||
| auto output_addr = reinterpret_cast<float *>(out_tensor->Data()); | |||
| for (int j = 0; j < out_tensor->ElementsNum(); ++j) { | |||
| auto output_element_num = out_tensor->ElementsNum(); | |||
| for (int j = 0; j < output_element_num; ++j) { | |||
| output_addr[j] = static_cast<float>(fp16_out_[j]); | |||
| } | |||
| return RET_OK; | |||
| @@ -231,7 +231,8 @@ int ConvolutionFP16CPUKernel::Run() { | |||
| } | |||
| auto input_tensor = in_tensors_.at(kInputIndex); | |||
| auto ori_input_data = reinterpret_cast<float *>(input_tensor->Data()); | |||
| for (int i = 0; i < input_tensor->ElementsNum(); ++i) { | |||
| auto input_element_num = input_tensor->ElementsNum(); | |||
| for (int i = 0; i < input_element_num; ++i) { | |||
| fp16_input_[i] = (float16_t)ori_input_data[i]; | |||
| } | |||
| @@ -250,7 +251,8 @@ int ConvolutionFP16CPUKernel::Run() { | |||
| // cast fp16 out to fp32 data | |||
| auto out_tensor = out_tensors_.at(kOutputIndex); | |||
| auto output_addr = reinterpret_cast<float *>(out_tensor->Data()); | |||
| for (int j = 0; j < out_tensor->ElementsNum(); ++j) { | |||
| auto output_element_num = out_tensor->ElementsNum(); | |||
| for (int j = 0; j < output_element_num; ++j) { | |||
| output_addr[j] = static_cast<float>(fp16_out_[j]); | |||
| } | |||
| return RET_OK; | |||
| @@ -51,10 +51,36 @@ int ArithmeticCPUKernel::Init() { | |||
| int ArithmeticCPUKernel::ReSize() { | |||
| FreeTileData(); | |||
| auto element_num = out_tensors_[0]->ElementsNum(); | |||
| arithmeticParameter_->in_elements_num0_ = in_tensors_[0]->ElementsNum(); | |||
| arithmeticParameter_->in_elements_num1_ = in_tensors_[1]->ElementsNum(); | |||
| arithmeticParameter_->out_elements_num_ = out_tensors_[0]->ElementsNum(); | |||
| if (arithmeticParameter_->in_elements_num0_ == 1 || arithmeticParameter_->in_elements_num1_ == 1) { | |||
| if (arithmeticParameter_->activation_type_ == schema::ActivationType_NO_ACTIVATION) { | |||
| switch (arithmeticParameter_->op_parameter_.type_) { | |||
| case PrimitiveType_Mul: | |||
| arithmeticParameter_->broadcasting_ = false; | |||
| arithmetic_opt_run_ = ElementOptMul; | |||
| break; | |||
| case PrimitiveType_Add: | |||
| arithmeticParameter_->broadcasting_ = false; | |||
| arithmetic_opt_run_ = ElementOptAdd; | |||
| break; | |||
| case PrimitiveType_Sub: | |||
| arithmeticParameter_->broadcasting_ = false; | |||
| arithmetic_opt_run_ = ElementOptSub; | |||
| break; | |||
| default: | |||
| break; | |||
| } | |||
| } | |||
| } | |||
| if (arithmeticParameter_->broadcasting_) { | |||
| tile_data0_ = new float[arithmeticParameter_->out_elements_num_]; | |||
| tile_data1_ = new float[arithmeticParameter_->out_elements_num_]; | |||
| } | |||
| tile_data0_ = new float[element_num]; | |||
| tile_data1_ = new float[element_num]; | |||
| return RET_OK; | |||
| } | |||
| @@ -77,7 +103,17 @@ int ArithmeticCPUKernel::DoArithmetic(int task_id) { | |||
| if (arithmeticParameter_->broadcasting_) { | |||
| error_code = arithmetic_run_(tile_data0_ + stride * task_id, tile_data1_ + stride * task_id, | |||
| output_data + stride * task_id, count); | |||
| } else if (arithmetic_opt_run_ != nullptr) { | |||
| if (arithmeticParameter_->in_elements_num0_ == 1) { | |||
| error_code = arithmetic_opt_run_(input0_data, input1_data1 + stride * task_id, output_data + stride * task_id, | |||
| count, arithmeticParameter_); | |||
| } else if (arithmeticParameter_->in_elements_num1_ == 1) { | |||
| error_code = arithmetic_opt_run_(input0_data + stride * task_id, input1_data1, output_data + stride * task_id, | |||
| count, arithmeticParameter_); | |||
| } else { | |||
| error_code = arithmetic_opt_run_(input0_data + stride * task_id, input1_data1 + stride * task_id, | |||
| output_data + stride * task_id, count, arithmeticParameter_); | |||
| } | |||
| } else { | |||
| error_code = arithmetic_run_(input0_data + stride * task_id, input1_data1 + stride * task_id, | |||
| output_data + stride * task_id, count); | |||
| @@ -104,6 +140,7 @@ int ArithmeticCPUKernel::Run() { | |||
| MS_LOG(ERROR) << "Prepare fail!ret: " << ret; | |||
| return ret; | |||
| } | |||
| if (arithmeticParameter_->broadcasting_) { | |||
| auto input_data0 = reinterpret_cast<float *>(in_tensors_[0]->Data()); | |||
| auto input_data1 = reinterpret_cast<float *>(in_tensors_[1]->Data()); | |||
| @@ -43,6 +43,8 @@ using mindspore::schema::PrimitiveType_Sub; | |||
| namespace mindspore::kernel { | |||
| class ArithmeticCPUKernel : public LiteKernel { | |||
| typedef int (*ArithmeticRun)(float *input0, float *input1, float *output, int element_size); | |||
| typedef int (*ArithmeticOptRun)(float *input0, float *input1, float *output, int element_size, | |||
| ArithmeticParameter *param); | |||
| typedef int (*ArithmeticBroadcastRun)(float *input0, float *input1, float *tile_input0, float *tile_input1, | |||
| float *output, int element_size, ArithmeticParameter *param); | |||
| @@ -177,8 +179,9 @@ class ArithmeticCPUKernel : public LiteKernel { | |||
| float *tile_data0_ = nullptr; | |||
| float *tile_data1_ = nullptr; | |||
| ArithmeticParameter *arithmeticParameter_; | |||
| ArithmeticRun arithmetic_run_; | |||
| ArithmeticBroadcastRun arithmetic_broadcast_run_; | |||
| ArithmeticRun arithmetic_run_ = nullptr; | |||
| ArithmeticBroadcastRun arithmetic_broadcast_run_ = nullptr; | |||
| ArithmeticOptRun arithmetic_opt_run_ = nullptr; | |||
| }; | |||
| } // namespace mindspore::kernel | |||
| #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_ARITHMETIC_H_ | |||
| @@ -277,7 +277,8 @@ kernel::LiteKernel *CpuConvFp32KernelCreator(const std::vector<lite::tensor::Ten | |||
| kernel = | |||
| new (std::nothrow) kernel::ConvolutionWinogradCPUKernel(opParameter, inputs, outputs, ctx, primitive, out_unit); | |||
| } else if (use_sw) { | |||
| kernel = new (std::nothrow) kernel::ConvolutionSWCPUKernel(opParameter, inputs, outputs, ctx, primitive); | |||
| // kernel = new (std::nothrow) kernel::ConvolutionSWCPUKernel(opParameter, inputs, outputs, ctx, primitive); | |||
| kernel = new (std::nothrow) kernel::ConvolutionCPUKernel(opParameter, inputs, outputs, ctx, primitive); | |||
| } else { | |||
| kernel = new (std::nothrow) kernel::ConvolutionCPUKernel(opParameter, inputs, outputs, ctx, primitive); | |||
| } | |||
| @@ -29,8 +29,12 @@ typedef struct ArithmeticParameter { | |||
| size_t ndim_; | |||
| int activation_type_; | |||
| int in_shape0_[5]; | |||
| int in_elements_num0_; | |||
| int in_shape1_[5]; | |||
| int in_elements_num1_; | |||
| int out_shape_[5]; | |||
| int out_elements_num_; | |||
| int in_strides0_[5]; | |||
| int in_strides1_[5]; | |||
| @@ -19,6 +19,57 @@ | |||
| #define ACCURACY_DATA 0.00000001 | |||
| int ElementOptMul(float *input0, float *input1, float *output, int element_size, ArithmeticParameter *param) { | |||
| if (param->in_elements_num0_ == 1) { | |||
| for (int i = 0; i < element_size; ++i) { | |||
| output[i] = input0[0] * input1[i]; | |||
| } | |||
| } else if (param->in_elements_num1_ == 1) { | |||
| for (int i = 0; i < element_size; ++i) { | |||
| output[i] = input0[i] * input1[0]; | |||
| } | |||
| } else { | |||
| for (int i = 0; i < element_size; ++i) { | |||
| output[i] = input0[i] * input1[i]; | |||
| } | |||
| } | |||
| return NNACL_OK; | |||
| } | |||
| int ElementOptSub(float *input0, float *input1, float *output, int element_size, ArithmeticParameter *param) { | |||
| if (param->in_elements_num0_ == 1) { | |||
| for (int i = 0; i < element_size; ++i) { | |||
| output[i] = input0[0] - input1[i]; | |||
| } | |||
| } else if (param->in_elements_num1_ == 1) { | |||
| for (int i = 0; i < element_size; ++i) { | |||
| output[i] = input0[i] - input1[0]; | |||
| } | |||
| } else { | |||
| for (int i = 0; i < element_size; ++i) { | |||
| output[i] = input0[i] - input1[i]; | |||
| } | |||
| } | |||
| return NNACL_OK; | |||
| } | |||
| int ElementOptAdd(float *input0, float *input1, float *output, int element_size, ArithmeticParameter *param) { | |||
| if (param->in_elements_num0_ == 1) { | |||
| for (int i = 0; i < element_size; ++i) { | |||
| output[i] = input0[0] + input1[i]; | |||
| } | |||
| } else if (param->in_elements_num1_ == 1) { | |||
| for (int i = 0; i < element_size; ++i) { | |||
| output[i] = input0[i] + input1[0]; | |||
| } | |||
| } else { | |||
| for (int i = 0; i < element_size; ++i) { | |||
| output[i] = input0[i] + input1[i]; | |||
| } | |||
| } | |||
| return NNACL_OK; | |||
| } | |||
| int ElementMul(float *input0, float *input1, float *output, int element_size) { | |||
| int block_mod = element_size % C4NUM; | |||
| int block_c4 = element_size - block_mod; | |||
| @@ -26,6 +26,9 @@ | |||
| #ifdef __cplusplus | |||
| extern "C" { | |||
| #endif | |||
| int ElementOptAdd(float *input0, float *input1, float *output, int element_size, ArithmeticParameter *param); | |||
| int ElementOptSub(float *input0, float *input1, float *output, int element_size, ArithmeticParameter *param); | |||
| int ElementOptMul(float *input0, float *input1, float *output, int element_size, ArithmeticParameter *param); | |||
| int ElementMul(float *input0, float *input1, float *output, int element_size); | |||
| int ElementMulRelu(float *input0, float *input1, float *output, int element_size); | |||
| int ElementMulRelu6(float *input0, float *input1, float *output, int element_size); | |||