@@ -238,3 +238,63 @@ void RowMajor2Col16MajorFp16(float16_t *src_ptr, float16_t *dst_ptr, size_t row,
  }
  return;
}
void Fp32RowMajor2Fp16Col16Major(float *src, float16_t *dst, size_t row, size_t col) {
  for (int r = 0; r < row; r++) {
    for (int c = 0; c < col; c++) {
      int r_div16 = r / 16;
      int r_mod16 = r % 16;
      dst[r_div16 * 16 * col + c * 16 + r_mod16] = (float16_t)(src[r * col + c]);
    }
  }
}

void Fp16RowMajor2Fp16Col16Major(float16_t *src, float16_t *dst, size_t row, size_t col) {
  for (int r = 0; r < row; r++) {
    for (int c = 0; c < col; c++) {
      int r_div16 = r / 16;
      int r_mod16 = r % 16;
      dst[r_div16 * 16 * col + c * 16 + r_mod16] = src[r * col + c];
    }
  }
}
void Fp32RowMajor2Fp16Row16Major(float *src, float16_t *dst, size_t row, size_t col) {
  for (int r = 0; r < row; r++) {
    for (int c = 0; c < col; c++) {
      int c_div16 = c / 16;
      int c_mod16 = c % 16;
      dst[c_div16 * 16 * row + r * 16 + c_mod16] = (float16_t)(src[r * col + c]);
    }
  }
}

void Fp16RowMajor2Fp16Row16Major(float16_t *src, float16_t *dst, size_t row, size_t col) {
  for (int r = 0; r < row; r++) {
    for (int c = 0; c < col; c++) {
      int c_div16 = c / 16;
      int c_mod16 = c % 16;
      dst[c_div16 * 16 * row + r * 16 + c_mod16] = src[r * col + c];
    }
  }
}
void Fp32RowMajor2Fp16Row8Major(float *src, float16_t *dst, size_t row, size_t col) {
  for (int r = 0; r < row; r++) {
    for (int c = 0; c < col; c++) {
      int c_div8 = c / 8;
      int c_mod8 = c % 8;
      dst[c_div8 * 8 * row + r * 8 + c_mod8] = (float16_t)src[r * col + c];
    }
  }
}

void Fp32RowMajor2Fp16Col8Major(float *src, float16_t *dst, size_t row, size_t col) {
  for (int r = 0; r < row; r++) {
    for (int c = 0; c < col; c++) {
      int r_div8 = r / 8;
      int r_mod8 = r % 8;
      dst[r_div8 * 8 * col + c * 8 + r_mod8] = (float16_t)src[r * col + c];
    }
  }
}

@@ -39,6 +39,18 @@ void RowMajor2Col16MajorFp16(float16_t *src_ptr, float16_t *dst_ptr, size_t row,
void MatmulFp16Neon64(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, int act_type,
                      size_t depth, size_t row, size_t col, size_t stride, bool write_nhwc);

void Fp32RowMajor2Fp16Col16Major(float *src, float16_t *dst, size_t row, size_t col);
void Fp16RowMajor2Fp16Col16Major(float16_t *src, float16_t *dst, size_t row, size_t col);
void Fp32RowMajor2Fp16Row16Major(float *src, float16_t *dst, size_t row, size_t col);
void Fp16RowMajor2Fp16Row16Major(float16_t *src, float16_t *dst, size_t row, size_t col);
void Fp32RowMajor2Fp16Row8Major(float *src, float16_t *dst, size_t row, size_t col);
void Fp32RowMajor2Fp16Col8Major(float *src, float16_t *dst, size_t row, size_t col);

#ifdef __cplusplus
}
#endif
| @@ -0,0 +1,187 @@ | |||||
| /** | |||||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| */ | |||||
| #include "src/runtime/kernel/arm/fp16/fullconnection_fp16.h" | |||||
| #include "nnacl/fp16/matmul_fp16.h" | |||||
| #include "nnacl/fp16/cast_fp16.h" | |||||
| #include "src/runtime/runtime_api.h" | |||||
| #include "include/errorcode.h" | |||||
| #include "src/kernel_registry.h" | |||||
| using mindspore::lite::KernelRegistrar; | |||||
| using mindspore::lite::RET_ERROR; | |||||
| using mindspore::lite::RET_INPUT_TENSOR_ERROR; | |||||
| using mindspore::lite::RET_MEMORY_FAILED; | |||||
| using mindspore::lite::RET_OK; | |||||
| using mindspore::schema::PrimitiveType_FullConnection; | |||||
| namespace mindspore::kernel { | |||||
FullconnectionFP16CPUKernel::~FullconnectionFP16CPUKernel() { FreeTmpBuffer(); }

void FullconnectionFP16CPUKernel::FreeTmpBuffer() {
  if (a_pack_ptr_ != nullptr) {
    ctx_->allocator->Free(a_pack_ptr_);
    a_pack_ptr_ = nullptr;
  }
  if (b_pack_ptr_ != nullptr) {
    ctx_->allocator->Free(b_pack_ptr_);
    b_pack_ptr_ = nullptr;
  }
  if (bias_ptr_ != nullptr) {
    ctx_->allocator->Free(bias_ptr_);
    bias_ptr_ = nullptr;
  }
  if (output_fp16_ != nullptr) {
    ctx_->allocator->Free(output_fp16_);
    output_fp16_ = nullptr;
  }
}
int FullconnectionFP16CPUKernel::ReSize() {
  FreeTmpBuffer();
  fc_param_->row_ = (in_tensors_[0]->shape())[0];
  fc_param_->col_ = (in_tensors_[1]->shape())[0];
  fc_param_->deep_ = (in_tensors_[1]->shape())[1];
  fc_param_->row_16_ = UP_ROUND(fc_param_->row_, C16NUM);
  fc_param_->col_8_ = UP_ROUND(fc_param_->col_, C8NUM);
  thread_count_ = MSMIN(thread_count_, UP_DIV(fc_param_->col_, C8NUM));
  thread_stride_ = UP_DIV(UP_DIV(fc_param_->col_, C8NUM), thread_count_) * C8NUM;

  a_pack_ptr_ =
    reinterpret_cast<float16_t *>(ctx_->allocator->Malloc(fc_param_->row_16_ * fc_param_->deep_ * sizeof(float16_t)));
  if (a_pack_ptr_ == nullptr) {
    FreeTmpBuffer();
    return RET_MEMORY_FAILED;
  }
  memset(a_pack_ptr_, 0, fc_param_->row_16_ * fc_param_->deep_ * sizeof(float16_t));

  b_pack_ptr_ =
    reinterpret_cast<float16_t *>(ctx_->allocator->Malloc(fc_param_->col_8_ * fc_param_->deep_ * sizeof(float16_t)));
  if (b_pack_ptr_ == nullptr) {
    FreeTmpBuffer();
    return RET_MEMORY_FAILED;
  }
  memset(b_pack_ptr_, 0, fc_param_->col_8_ * fc_param_->deep_ * sizeof(float16_t));
  InitMatrixB(reinterpret_cast<float *>(in_tensors_[1]->Data()), b_pack_ptr_);

  if (in_tensors_.size() == 3) {
    bias_ptr_ = reinterpret_cast<float16_t *>(ctx_->allocator->Malloc(fc_param_->col_8_ * sizeof(float16_t)));
    if (bias_ptr_ == nullptr) {
      FreeTmpBuffer();
      return RET_MEMORY_FAILED;
    }
    memset(bias_ptr_, 0, fc_param_->col_8_ * sizeof(float16_t));
    Float32ToFloat16(reinterpret_cast<float *>(in_tensors_[2]->Data()), bias_ptr_, fc_param_->col_);
  }

  if (out_tensors_[0]->data_type() == kNumberTypeFloat32) {
    output_fp16_ =
      reinterpret_cast<float16_t *>(ctx_->allocator->Malloc(fc_param_->row_ * fc_param_->col_ * sizeof(float16_t)));
  }
  return RET_OK;
}
void FullconnectionFP16CPUKernel::InitMatrixA(float *a_ptr, float16_t *a_pack_ptr) {
  Fp32RowMajor2Fp16Col16Major(a_ptr, a_pack_ptr, fc_param_->row_, fc_param_->deep_);
}

void FullconnectionFP16CPUKernel::InitMatrixA(float16_t *a_ptr, float16_t *a_pack_ptr) {
  Fp16RowMajor2Fp16Col16Major(a_ptr, a_pack_ptr, fc_param_->row_, fc_param_->deep_);
}

void FullconnectionFP16CPUKernel::InitMatrixB(float *b_ptr, float16_t *b_pack_ptr) {
  Fp32RowMajor2Fp16Col8Major(b_ptr, b_pack_ptr, fc_param_->col_, fc_param_->deep_);
}

int FullconnectionFP16CPUKernel::Init() {
  if (!InferShapeDone()) {
    return RET_OK;
  }
  return ReSize();
}
int FullconnectionFP16CPUKernel::RunImpl(int task_id) {
  int cur_stride = fc_param_->col_ - task_id * thread_stride_;
  int cur_oc = MSMIN(thread_stride_, cur_stride);
  if (cur_oc <= 0) {
    return RET_OK;
  }
  auto b = b_pack_ptr_ + task_id * thread_stride_ * fc_param_->deep_;
  auto bias = (bias_ptr_ == nullptr) ? nullptr : bias_ptr_ + thread_stride_ * task_id;
  auto c = output_ptr_ + task_id * thread_stride_;
  MatMulFp16(a_pack_ptr_, b, c, bias, fc_param_->act_type_, fc_param_->deep_, fc_param_->row_, cur_oc, fc_param_->col_,
             true);
  return RET_OK;
}

int FcFP16Run(void *cdata, int task_id) {
  auto op = reinterpret_cast<FullconnectionFP16CPUKernel *>(cdata);
  auto error_code = op->RunImpl(task_id);
  if (error_code != RET_OK) {
    MS_LOG(ERROR) << "FcFP16Run error task_id[" << task_id << "] error_code[" << error_code << "]";
    return RET_ERROR;
  }
  return RET_OK;
}
int FullconnectionFP16CPUKernel::Run() {
  auto prepare_ret = Prepare();
  if (prepare_ret != RET_OK) {
    MS_LOG(ERROR) << "Prepare failed, ret: " << prepare_ret;
    return prepare_ret;
  }
  auto out_tensor = out_tensors_[0];
  if (out_tensor->data_type() == kNumberTypeFloat32) {
    output_ptr_ = output_fp16_;
  } else {
    output_ptr_ = reinterpret_cast<float16_t *>(out_tensor->Data());
  }
  if (in_tensors_[0]->data_type() == kNumberTypeFloat32) {
    InitMatrixA(reinterpret_cast<float *>(in_tensors_[0]->Data()), a_pack_ptr_);
  } else {
    InitMatrixA(reinterpret_cast<float16_t *>(in_tensors_[0]->Data()), a_pack_ptr_);
  }
  ParallelLaunch(THREAD_POOL_DEFAULT, FcFP16Run, this, thread_count_);
  if (out_tensor->data_type() == kNumberTypeFloat32) {
    auto size = out_tensor->ElementsNum();
    auto out_tensor_data = reinterpret_cast<float *>(out_tensor->Data());
    Float16ToFloat32(output_fp16_, out_tensor_data, size);
  }
  return RET_OK;
}

kernel::LiteKernel *CpuFullConnectionFp16KernelCreator(const std::vector<lite::tensor::Tensor *> &inputs,
                                                       const std::vector<lite::tensor::Tensor *> &outputs,
                                                       OpParameter *opParameter, const lite::Context *ctx,
                                                       const kernel::KernelKey &desc,
                                                       const mindspore::lite::PrimitiveC *primitive) {
  auto *kernel = new (std::nothrow) FullconnectionFP16CPUKernel(opParameter, inputs, outputs, ctx, primitive);
  if (kernel == nullptr) {
    MS_LOG(ERROR) << "kernel is nullptr.";
    return nullptr;
  }
  auto ret = kernel->Init();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Init kernel failed, name: " << opParameter->name_ << ", type: "
                  << schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(opParameter->type_));
    delete kernel;
    return nullptr;
  }
  return kernel;
}

REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_FullConnection, CpuFullConnectionFp16KernelCreator)
}  // namespace mindspore::kernel
| @@ -0,0 +1,56 @@ | |||||
| /** | |||||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| */ | |||||
| #ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_FULLCONNECTION_H_ | |||||
| #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_FULLCONNECTION_H_ | |||||
| #ifdef ENABLE_NEON | |||||
| #include <arm_neon.h> | |||||
| #endif | |||||
| #include <vector> | |||||
| #include "src/lite_kernel.h" | |||||
| #include "nnacl/matmul_parameter.h" | |||||
| #include "src/runtime/kernel/arm/base/fullconnection_base.h" | |||||
| namespace mindspore::kernel { | |||||
| class FullconnectionFP16CPUKernel : public FullconnectionBaseCPUKernel { | |||||
| public: | |||||
| explicit FullconnectionFP16CPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs, | |||||
| const std::vector<lite::tensor::Tensor *> &outputs, const lite::Context *ctx, | |||||
| const mindspore::lite::PrimitiveC *primitive) | |||||
| : FullconnectionBaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {} | |||||
| ~FullconnectionFP16CPUKernel() override; | |||||
| int Init() override; | |||||
| int ReSize() override; | |||||
| int Run() override; | |||||
| int RunImpl(int task_id); | |||||
| private: | |||||
| void InitMatrixA(float *a_ptr, float16_t *a_pack_ptr); | |||||
| void InitMatrixA(float16_t *a_ptr, float16_t *a_pack_ptr); | |||||
| void InitMatrixB(float *b_ptr, float16_t *b_pack_ptr); | |||||
| void FreeTmpBuffer(); | |||||
| private: | |||||
| float16_t *a_pack_ptr_ = nullptr; | |||||
| float16_t *b_pack_ptr_ = nullptr; | |||||
| float16_t *bias_ptr_ = nullptr; | |||||
| float16_t *output_fp16_ = nullptr; | |||||
| float16_t *output_ptr_ = nullptr; | |||||
| }; | |||||
| } // namespace mindspore::kernel | |||||
| #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_FULLCONNECTION_H_ | |||||
| @@ -0,0 +1,250 @@ | |||||
| /** | |||||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| */ | |||||
| #include "src/runtime/kernel/arm/fp16/matmul_fp16.h" | |||||
| #include "nnacl/fp16/matmul_fp16.h" | |||||
| #include "nnacl/fp16/cast_fp16.h" | |||||
| #include "src/runtime/runtime_api.h" | |||||
| #include "include/errorcode.h" | |||||
| #include "src/kernel_registry.h" | |||||
| using mindspore::lite::KernelRegistrar; | |||||
| using mindspore::lite::RET_ERROR; | |||||
| using mindspore::lite::RET_INPUT_TENSOR_ERROR; | |||||
| using mindspore::lite::RET_MEMORY_FAILED; | |||||
| using mindspore::lite::RET_OK; | |||||
| using mindspore::schema::PrimitiveType_MatMul; | |||||
| namespace mindspore::kernel { | |||||
MatmulFP16CPUKernel::~MatmulFP16CPUKernel() { FreeTmpBuffer(); }

void MatmulFP16CPUKernel::FreeTmpBuffer() {
  if (a_pack_ptr_ != nullptr) {
    ctx_->allocator->Free(a_pack_ptr_);
    a_pack_ptr_ = nullptr;
  }
  if (b_pack_ptr_ != nullptr) {
    ctx_->allocator->Free(b_pack_ptr_);
    b_pack_ptr_ = nullptr;
  }
  if (bias_ptr_ != nullptr) {
    ctx_->allocator->Free(bias_ptr_);
    bias_ptr_ = nullptr;
  }
  if (output_ptr_ != nullptr) {
    ctx_->allocator->Free(output_ptr_);
    output_ptr_ = nullptr;
  }
}
int MatmulFP16CPUKernel::ReSize() {
  FreeTmpBuffer();
  int batch = 1;
  auto a_shape = in_tensors_[0]->shape();
  auto c_shape = out_tensors_[0]->shape();
  if (in_tensors_.size() == 3) {
    auto bias_shape = in_tensors_[2]->shape();
    if (bias_shape[bias_shape.size() - 1] != c_shape[c_shape.size() - 1]) {
      MS_LOG(ERROR) << "The last dimension of bias does not match the output column count";
      return RET_INPUT_TENSOR_ERROR;
    }
  }
  for (size_t i = 0; i < a_shape.size() - 2; ++i) {
    batch *= a_shape[i];
  }
  params_->batch = batch;
  params_->row_ = c_shape[c_shape.size() - 2];
  params_->col_ = c_shape[c_shape.size() - 1];
  params_->deep_ = params_->a_transpose_ ? a_shape[a_shape.size() - 2] : a_shape[a_shape.size() - 1];
  params_->row_16_ = UP_ROUND(params_->row_, C16NUM);
  params_->col_8_ = UP_ROUND(params_->col_, C8NUM);
  thread_count_ = MSMIN(thread_count_, UP_DIV(params_->col_, C8NUM));
  thread_stride_ = UP_DIV(UP_DIV(params_->col_, C8NUM), thread_count_) * C8NUM;

  a_pack_ptr_ = reinterpret_cast<float16_t *>(
    ctx_->allocator->Malloc(params_->batch * params_->row_16_ * params_->deep_ * sizeof(float16_t)));
  if (a_pack_ptr_ == nullptr) {
    FreeTmpBuffer();
    return RET_MEMORY_FAILED;
  }
  memset(a_pack_ptr_, 0, params_->batch * params_->row_16_ * params_->deep_ * sizeof(float16_t));

  b_pack_ptr_ = reinterpret_cast<float16_t *>(
    ctx_->allocator->Malloc(params_->batch * params_->col_8_ * params_->deep_ * sizeof(float16_t)));
  if (b_pack_ptr_ == nullptr) {
    FreeTmpBuffer();
    return RET_MEMORY_FAILED;
  }
  memset(b_pack_ptr_, 0, params_->batch * params_->col_8_ * params_->deep_ * sizeof(float16_t));

  params_->a_const_ = (in_tensors_[0]->Data() != nullptr);
  params_->b_const_ = (in_tensors_[1]->Data() != nullptr);
  if (params_->a_const_ == true) {
    if (in_tensors_[0]->data_type() == kNumberTypeFloat32) {
      InitMatrixA(reinterpret_cast<float *>(in_tensors_[0]->Data()), a_pack_ptr_);
    } else {
      InitMatrixA(reinterpret_cast<float16_t *>(in_tensors_[0]->Data()), a_pack_ptr_);
    }
  }
  if (params_->b_const_ == true) {
    InitMatrixB(reinterpret_cast<float *>(in_tensors_[1]->Data()), b_pack_ptr_);
  }

  if (in_tensors_.size() == 3) {
    bias_ptr_ = reinterpret_cast<float16_t *>(ctx_->allocator->Malloc(params_->col_8_ * sizeof(float16_t)));
    if (bias_ptr_ == nullptr) {
      FreeTmpBuffer();
      return RET_MEMORY_FAILED;
    }
    memset(bias_ptr_, 0, params_->col_8_ * sizeof(float16_t));
    Float32ToFloat16(reinterpret_cast<float *>(in_tensors_[2]->Data()), bias_ptr_, params_->col_);
  }

  if (out_tensors_[0]->data_type() == kNumberTypeFloat32) {
    output_ptr_ = reinterpret_cast<float16_t *>(
      ctx_->allocator->Malloc(params_->batch * params_->row_ * params_->col_ * sizeof(float16_t)));
  }
  return RET_OK;
}
void MatmulFP16CPUKernel::InitMatrixA(float *a_ptr, float16_t *a_pack_ptr) {
  for (int i = 0; i < params_->batch; i++) {
    float *src = a_ptr + i * params_->deep_ * params_->row_;
    float16_t *dst = a_pack_ptr + i * params_->deep_ * params_->row_16_;
    if (params_->a_transpose_) {
      Fp32RowMajor2Fp16Row16Major(src, dst, params_->deep_, params_->row_);
    } else {
      Fp32RowMajor2Fp16Col16Major(src, dst, params_->row_, params_->deep_);
    }
  }
}

void MatmulFP16CPUKernel::InitMatrixA(float16_t *a_ptr, float16_t *a_pack_ptr) {
  for (int i = 0; i < params_->batch; i++) {
    float16_t *src = a_ptr + i * params_->deep_ * params_->row_;
    float16_t *dst = a_pack_ptr + i * params_->deep_ * params_->row_16_;
    if (params_->a_transpose_) {
      Fp16RowMajor2Fp16Row16Major(src, dst, params_->deep_, params_->row_);
    } else {
      Fp16RowMajor2Fp16Col16Major(src, dst, params_->row_, params_->deep_);
    }
  }
}
void MatmulFP16CPUKernel::InitMatrixB(float *b_ptr, float16_t *b_pack_ptr) {
  for (int i = 0; i < params_->batch; i++) {
    float *src = b_ptr + i * params_->deep_ * params_->col_;
    float16_t *dst = b_pack_ptr + i * params_->deep_ * params_->col_8_;
    if (params_->b_transpose_) {
      Fp32RowMajor2Fp16Col8Major(src, dst, params_->col_, params_->deep_);
    } else {
      Fp32RowMajor2Fp16Row8Major(src, dst, params_->deep_, params_->col_);
    }
  }
}

int MatmulFP16CPUKernel::Init() {
  if (!InferShapeDone()) {
    return RET_OK;
  }
  return ReSize();
}
int MatmulFP16CPUKernel::RunImpl(int task_id) {
  int cur_stride = params_->col_ - task_id * thread_stride_;
  int cur_oc = MSMIN(thread_stride_, cur_stride);
  if (cur_oc <= 0) {
    return RET_OK;
  }
  auto b = current_b_ + task_id * thread_stride_ * params_->deep_;
  auto bias = (bias_ptr_ == nullptr) ? nullptr : bias_ptr_ + thread_stride_ * task_id;
  auto c = current_c_ + task_id * thread_stride_;
  MatMulFp16(current_a_, b, c, bias, ActType_No, params_->deep_, params_->row_, cur_oc, params_->col_, true);
  return RET_OK;
}

int MatmulFP16Run(void *cdata, int task_id) {
  auto op = reinterpret_cast<MatmulFP16CPUKernel *>(cdata);
  auto error_code = op->RunImpl(task_id);
  if (error_code != RET_OK) {
    MS_LOG(ERROR) << "MatmulFP16Run error task_id[" << task_id << "] error_code[" << error_code << "]";
    return RET_ERROR;
  }
  return RET_OK;
}

int MatmulFP16CPUKernel::Run() {
  auto prepare_ret = Prepare();
  if (prepare_ret != RET_OK) {
    MS_LOG(ERROR) << "Prepare failed, ret: " << prepare_ret;
    return prepare_ret;
  }
  auto b = reinterpret_cast<float *>(in_tensors_[1]->Data());
  auto out_tensor = out_tensors_[0];
  float16_t *c_ptr;
  if (out_tensor->data_type() == kNumberTypeFloat32) {
    c_ptr = output_ptr_;
  } else {
    c_ptr = reinterpret_cast<float16_t *>(out_tensor->Data());
  }
  if (params_->a_const_ == false) {
    if (in_tensors_[0]->data_type() == kNumberTypeFloat32) {
      InitMatrixA(reinterpret_cast<float *>(in_tensors_[0]->Data()), a_pack_ptr_);
    } else {
      InitMatrixA(reinterpret_cast<float16_t *>(in_tensors_[0]->Data()), a_pack_ptr_);
    }
  }
  if (params_->b_const_ == false) {
    InitMatrixB(b, b_pack_ptr_);
  }
  for (int i = 0; i < params_->batch; ++i) {
    current_a_ = a_pack_ptr_ + i * params_->row_16_ * params_->deep_;
    current_b_ = b_pack_ptr_ + i * params_->deep_ * params_->col_8_;
    current_c_ = c_ptr + i * params_->row_ * params_->col_;
    ParallelLaunch(THREAD_POOL_DEFAULT, MatmulFP16Run, this, thread_count_);
  }
  if (out_tensor->data_type() == kNumberTypeFloat32) {
    auto size = out_tensor->ElementsNum();
    auto out_tensor_data = reinterpret_cast<float *>(out_tensor->Data());
    Float16ToFloat32(output_ptr_, out_tensor_data, size);
  }
  return RET_OK;
}

kernel::LiteKernel *CpuMatmulFp16KernelCreator(const std::vector<lite::tensor::Tensor *> &inputs,
                                               const std::vector<lite::tensor::Tensor *> &outputs,
                                               OpParameter *opParameter, const lite::Context *ctx,
                                               const kernel::KernelKey &desc,
                                               const mindspore::lite::PrimitiveC *primitive) {
  auto *kernel = new (std::nothrow) MatmulFP16CPUKernel(opParameter, inputs, outputs, ctx, primitive);
  if (kernel == nullptr) {
    MS_LOG(ERROR) << "kernel is nullptr.";
    return nullptr;
  }
  auto ret = kernel->Init();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Init kernel failed, name: " << opParameter->name_ << ", type: "
                  << schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(opParameter->type_));
    delete kernel;
    return nullptr;
  }
  return kernel;
}

REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_MatMul, CpuMatmulFp16KernelCreator)
}  // namespace mindspore::kernel
| @@ -0,0 +1,58 @@ | |||||
| /** | |||||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| */ | |||||
| #ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_MATMUL_H_ | |||||
| #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_MATMUL_H_ | |||||
| #ifdef ENABLE_NEON | |||||
| #include <arm_neon.h> | |||||
| #endif | |||||
| #include <vector> | |||||
| #include "src/lite_kernel.h" | |||||
| #include "nnacl/matmul_parameter.h" | |||||
| #include "src/runtime/kernel/arm/base/matmul_base.h" | |||||
| namespace mindspore::kernel { | |||||
| class MatmulFP16CPUKernel : public MatmulBaseCPUKernel { | |||||
| public: | |||||
| explicit MatmulFP16CPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs, | |||||
| const std::vector<lite::tensor::Tensor *> &outputs, const lite::Context *ctx, | |||||
| const mindspore::lite::PrimitiveC *primitive) | |||||
| : MatmulBaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {} | |||||
| ~MatmulFP16CPUKernel() override; | |||||
| int Init() override; | |||||
| int ReSize() override; | |||||
| int Run() override; | |||||
| int RunImpl(int task_id); | |||||
| private: | |||||
| void InitMatrixA(float *a_ptr, float16_t *a_pack_ptr); | |||||
| void InitMatrixA(float16_t *a_ptr, float16_t *a_pack_ptr); | |||||
| void InitMatrixB(float *b_ptr, float16_t *b_pack_ptr); | |||||
| void FreeTmpBuffer(); | |||||
| private: | |||||
| float16_t *a_pack_ptr_ = nullptr; | |||||
| float16_t *b_pack_ptr_ = nullptr; | |||||
| float16_t *bias_ptr_ = nullptr; | |||||
| float16_t *output_ptr_ = nullptr; | |||||
| float16_t *current_a_ = nullptr; | |||||
| float16_t *current_b_ = nullptr; | |||||
| float16_t *current_c_ = nullptr; | |||||
| }; | |||||
| } // namespace mindspore::kernel | |||||
| #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_MATMUL_H_ | |||||