Merge pull request !4974 from ling/srtags/v0.7.0-beta
| @@ -15,14 +15,28 @@ | |||||
| */ | */ | ||||
| #include "nnacl/fp16/matmul_fp16.h" | #include "nnacl/fp16/matmul_fp16.h" | ||||
| void ColMajor2Row8MajorFp16(float16_t *src_ptr, float16_t *dst_ptr, size_t row, size_t col) { | |||||
| for (int r = 0; r < row; r++) { | |||||
| for (int c = 0; c < col; c++) { | |||||
| int cd8 = c / 8; | |||||
| int cm8 = c % 8; | |||||
| dst_ptr[cd8 * 8 * row + r * 8 + cm8] = src_ptr[c * row + r]; | |||||
| void ColMajor2Row8MajorFp16(void *src_ptr, float16_t *dst_ptr, size_t row, size_t col, bool src_float16) { | |||||
| if (src_float16) { | |||||
| float16_t *src = (float16_t *)src_ptr; | |||||
| for (int r = 0; r < row; r++) { | |||||
| for (int c = 0; c < col; c++) { | |||||
| int cd8 = c / 8; | |||||
| int cm8 = c % 8; | |||||
| dst_ptr[cd8 * 8 * row + r * 8 + cm8] = (float16_t)(src[c * row + r]); | |||||
| } | |||||
| } | |||||
| } else { | |||||
| float *src = (float *)src_ptr; | |||||
| for (int r = 0; r < row; r++) { | |||||
| for (int c = 0; c < col; c++) { | |||||
| int cd8 = c / 8; | |||||
| int cm8 = c % 8; | |||||
| dst_ptr[cd8 * 8 * row + r * 8 + cm8] = (float16_t)(src[c * row + r]); | |||||
| } | |||||
| } | } | ||||
| } | } | ||||
| return; | |||||
| } | } | ||||
| void MatMul16x8(const float16_t *a, const float16_t *b, float16_t *dst, const float16_t *bias, ActType act_type, | void MatMul16x8(const float16_t *a, const float16_t *b, float16_t *dst, const float16_t *bias, ActType act_type, | ||||
| @@ -32,7 +32,7 @@ extern "C" { | |||||
| void MatMulFp16(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, ActType act_type, | void MatMulFp16(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, ActType act_type, | ||||
| int depth, int row, int col, int stride, bool write_nhwc); | int depth, int row, int col, int stride, bool write_nhwc); | ||||
| void ColMajor2Row8MajorFp16(float16_t *src_ptr, float16_t *dst_ptr, size_t row, size_t col); | |||||
| void ColMajor2Row8MajorFp16(void *src_ptr, float16_t *dst_ptr, size_t row, size_t col, bool src_float16); | |||||
| void RowMajor2Col16MajorFp16(float16_t *src_ptr, float16_t *dst_ptr, size_t row, size_t col); | void RowMajor2Col16MajorFp16(float16_t *src_ptr, float16_t *dst_ptr, size_t row, size_t col); | ||||
| @@ -74,31 +74,36 @@ int Convolution1x1FP16CPUKernel::InitConv1x1Param() { | |||||
| } | } | ||||
| int Convolution1x1FP16CPUKernel::InitWeightBias() { | int Convolution1x1FP16CPUKernel::InitWeightBias() { | ||||
| auto ret = ConvolutionBaseFP16CPUKernel::GetExecuteFilter(); | |||||
| if (ret != RET_OK) { | |||||
| MS_LOG(ERROR) << "Get Execute filter failed."; | |||||
| return ret; | |||||
| } | |||||
| auto bias_tensor = in_tensors_.at(kBiasIndex); | |||||
| auto weight_tensor = in_tensors_.at(kWeightIndex); | |||||
| auto input_channel = weight_tensor->Channel(); | |||||
| auto output_channel = weight_tensor->Batch(); | |||||
| bias_data_ = malloc(matmul_param_->col_8_ * sizeof(float16_t)); | |||||
| size_t size = UP_ROUND(output_channel, C8NUM) * sizeof(float16_t); | |||||
| bias_data_ = malloc(size); | |||||
| if (bias_data_ == nullptr) { | if (bias_data_ == nullptr) { | ||||
| MS_LOG(ERROR) << "Conv1x1 Malloc bias_ptr_ error!"; | MS_LOG(ERROR) << "Conv1x1 Malloc bias_ptr_ error!"; | ||||
| return RET_ERROR; | return RET_ERROR; | ||||
| } | } | ||||
| memset(bias_data_, 0, matmul_param_->col_8_ * sizeof(float16_t)); | |||||
| memset(bias_data_, 0, size); | |||||
| if (in_tensors_.size() == 3) { | if (in_tensors_.size() == 3) { | ||||
| Float32ToFloat16(reinterpret_cast<float *>(in_tensors_[2]->Data()), reinterpret_cast<float16_t *>(bias_data_), | |||||
| conv_param_->output_channel_); | |||||
| if (bias_tensor->data_type() == kNumberTypeFloat16) { | |||||
| memcpy(bias_data_, bias_tensor->Data(), output_channel * sizeof(float16_t)); | |||||
| } else { | |||||
| Float32ToFloat16(reinterpret_cast<float *>(bias_tensor->Data()), reinterpret_cast<float16_t *>(bias_data_), | |||||
| output_channel); | |||||
| } | |||||
| } | } | ||||
| weight_ptr_ = reinterpret_cast<float16_t *>(malloc(matmul_param_->deep_ * matmul_param_->col_8_ * sizeof(float16_t))); | |||||
| size = input_channel * UP_ROUND(output_channel, C8NUM) * sizeof(float16_t); | |||||
| weight_ptr_ = reinterpret_cast<float16_t *>(malloc(size)); | |||||
| if (weight_ptr_ == nullptr) { | if (weight_ptr_ == nullptr) { | ||||
| MS_LOG(ERROR) << "Conv1x1 Malloc weight_ptr_ error!"; | MS_LOG(ERROR) << "Conv1x1 Malloc weight_ptr_ error!"; | ||||
| return RET_ERROR; | return RET_ERROR; | ||||
| } | } | ||||
| memset(weight_ptr_, 0, matmul_param_->deep_ * matmul_param_->col_8_ * sizeof(float16_t)); | |||||
| ColMajor2Row8MajorFp16(reinterpret_cast<float16_t *>(execute_weight_), weight_ptr_, matmul_param_->deep_, | |||||
| matmul_param_->col_); | |||||
| memset(weight_ptr_, 0, size); | |||||
| ColMajor2Row8MajorFp16(weight_tensor->Data(), weight_ptr_, input_channel, output_channel, | |||||
| weight_tensor->data_type() == kNumberTypeFloat16); | |||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| @@ -106,6 +111,13 @@ int Convolution1x1FP16CPUKernel::Init() { | |||||
| if (!InferShapeDone()) { | if (!InferShapeDone()) { | ||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| matmul_param_ = new (std::nothrow) MatMulParameter(); | |||||
| if (matmul_param_ == nullptr) { | |||||
| MS_LOG(ERROR) << "Init matmul_param_ failed."; | |||||
| return RET_ERROR; | |||||
| } | |||||
| int ret = InitWeightBias(); | int ret = InitWeightBias(); | ||||
| if (ret != RET_OK) { | if (ret != RET_OK) { | ||||
| MS_LOG(ERROR) << "Init weight bias failed."; | MS_LOG(ERROR) << "Init weight bias failed."; | ||||
| @@ -31,9 +31,7 @@ class Convolution1x1FP16CPUKernel : public ConvolutionBaseFP16CPUKernel { | |||||
| Convolution1x1FP16CPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs, | Convolution1x1FP16CPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs, | ||||
| const std::vector<lite::tensor::Tensor *> &outputs, const Context *ctx, | const std::vector<lite::tensor::Tensor *> &outputs, const Context *ctx, | ||||
| const mindspore::lite::PrimitiveC *primitive) | const mindspore::lite::PrimitiveC *primitive) | ||||
| : ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive) { | |||||
| matmul_param_ = new MatMulParameter(); | |||||
| } | |||||
| : ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive) {} | |||||
| ~Convolution1x1FP16CPUKernel() override; | ~Convolution1x1FP16CPUKernel() override; | ||||
| int Init() override; | int Init() override; | ||||
| @@ -50,7 +48,7 @@ class Convolution1x1FP16CPUKernel : public ConvolutionBaseFP16CPUKernel { | |||||
| private: | private: | ||||
| bool pre_trans_input_ = false; | bool pre_trans_input_ = false; | ||||
| int thread_count_ = 0; | |||||
| int thread_count_ = 1; | |||||
| int thread_stride_ = 0; | int thread_stride_ = 0; | ||||
| float16_t *weight_ptr_ = nullptr; | float16_t *weight_ptr_ = nullptr; | ||||
| float16_t *input_ptr_ = nullptr; | float16_t *input_ptr_ = nullptr; | ||||
| @@ -23,6 +23,14 @@ | |||||
| #include "src/runtime/runtime_api.h" | #include "src/runtime/runtime_api.h" | ||||
| namespace mindspore::kernel { | namespace mindspore::kernel { | ||||
| ConvolutionBaseFP16CPUKernel::~ConvolutionBaseFP16CPUKernel() { | |||||
| if (fp16_weight_ != nullptr) { | |||||
| free(fp16_weight_); | |||||
| fp16_weight_ = nullptr; | |||||
| } | |||||
| } | |||||
| int ConvolutionBaseFP16CPUKernel::GetExecuteTensor() { | int ConvolutionBaseFP16CPUKernel::GetExecuteTensor() { | ||||
| // ===================input====================// | // ===================input====================// | ||||
| auto input_tensor = in_tensors_.at(kInputIndex); | auto input_tensor = in_tensors_.at(kInputIndex); | ||||
| @@ -65,6 +73,7 @@ int ConvolutionBaseFP16CPUKernel::GetExecuteFilter() { | |||||
| } else { | } else { | ||||
| auto *origin_weight = reinterpret_cast<float16_t *>(in_tensors_.at(kWeightIndex)->Data()); | auto *origin_weight = reinterpret_cast<float16_t *>(in_tensors_.at(kWeightIndex)->Data()); | ||||
| execute_weight_ = origin_weight; | execute_weight_ = origin_weight; | ||||
| fp16_weight_ = nullptr; | |||||
| } | } | ||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| @@ -30,7 +30,7 @@ class ConvolutionBaseFP16CPUKernel : public ConvolutionBaseCPUKernel { | |||||
| const std::vector<lite::tensor::Tensor *> &outputs, const Context *ctx, | const std::vector<lite::tensor::Tensor *> &outputs, const Context *ctx, | ||||
| const mindspore::lite::PrimitiveC *primitive) | const mindspore::lite::PrimitiveC *primitive) | ||||
| : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {} | : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {} | ||||
| ~ConvolutionBaseFP16CPUKernel() override = default; | |||||
| ~ConvolutionBaseFP16CPUKernel() override; | |||||
| int Init() override { return RET_OK; } | int Init() override { return RET_OK; } | ||||
| int ReSize() override { return RET_OK; } | int ReSize() override { return RET_OK; } | ||||
| @@ -244,8 +244,7 @@ kernel::LiteKernel *CpuConvFp16KernelCreator(const std::vector<lite::tensor::Ten | |||||
| if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1) { | if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1) { | ||||
| kernel = new (std::nothrow) kernel::Convolution3x3FP16CPUKernel(opParameter, inputs, outputs, ctx, primitive); | kernel = new (std::nothrow) kernel::Convolution3x3FP16CPUKernel(opParameter, inputs, outputs, ctx, primitive); | ||||
| } else if (kernel_h == 1 && kernel_w == 1) { | } else if (kernel_h == 1 && kernel_w == 1) { | ||||
| // kernel = new (std::nothrow) kernel::Convolution1x1FP16CPUKernel(opParameter, inputs, outputs, ctx, primitive); | |||||
| kernel = new (std::nothrow) kernel::ConvolutionFP16CPUKernel(opParameter, inputs, outputs, ctx, primitive); | |||||
| kernel = new (std::nothrow) kernel::Convolution1x1FP16CPUKernel(opParameter, inputs, outputs, ctx, primitive); | |||||
| } else { | } else { | ||||
| bool use_winograd = false; | bool use_winograd = false; | ||||
| int out_unit; | int out_unit; | ||||