| @@ -42,35 +42,47 @@ int ConcatFp16CPUKernel::Init() { | |||
| } | |||
| int ConcatFp16CPUKernel::ReSize() { | |||
| for (auto ptr : fp16_inputs_) { | |||
| if (ptr != nullptr) { | |||
| free(ptr); | |||
| ptr = nullptr; | |||
| FreeTmpBuffer(); | |||
| for (const auto &in_tensor : in_tensors_) { | |||
| float16_t *ptr = nullptr; | |||
| if (in_tensor->data_type() == kNumberTypeFloat32 || in_tensor->data_type() == kNumberTypeFloat) { | |||
| ptr = reinterpret_cast<float16_t *>(context_->allocator->Malloc(sizeof(float16_t) * in_tensor->ElementsNum())); | |||
| if (ptr == nullptr) { | |||
| MS_LOG(ERROR) << "malloc failed"; | |||
| return RET_ERROR; | |||
| } | |||
| } | |||
| fp16_inputs_.push_back(ptr); | |||
| } | |||
| fp16_inputs_.clear(); | |||
| for (size_t i = 0; i < in_tensors_.size(); ++i) { | |||
| float16_t *ptr = nullptr; | |||
| ptr = reinterpret_cast<float16_t *>(malloc(sizeof(float16_t) * in_tensors_[i]->ElementsNum())); | |||
| if (ptr == nullptr) { | |||
| auto &out_tensor = out_tensors_.at(0); | |||
| if (out_tensor->data_type() == kNumberTypeFloat32 || out_tensor->data_type() == kNumberTypeFloat) { | |||
| if (fp16_output_ != nullptr) { | |||
| context_->allocator->Free(fp16_output_); | |||
| fp16_output_ = nullptr; | |||
| } | |||
| fp16_output_ = | |||
| reinterpret_cast<float16_t *>(context_->allocator->Malloc(sizeof(float16_t) * out_tensors_[0]->ElementsNum())); | |||
| if (fp16_output_ == nullptr) { | |||
| MS_LOG(ERROR) << "malloc failed"; | |||
| return RET_ERROR; | |||
| } | |||
| fp16_inputs_.push_back(ptr); | |||
| } | |||
| if (fp16_output_ != nullptr) { | |||
| free(fp16_output_); | |||
| fp16_output_ = nullptr; | |||
| } | |||
| fp16_output_ = reinterpret_cast<float16_t *>(malloc(sizeof(float16_t) * out_tensors_[0]->ElementsNum())); | |||
| if (fp16_output_ == nullptr) { | |||
| MS_LOG(ERROR) << "malloc failed"; | |||
| return RET_ERROR; | |||
| } | |||
| return ConcatBaseCPUKernel::ReSize(); | |||
| } | |||
| void ConcatFp16CPUKernel::FreeTmpBuffer() { | |||
| for (auto ptr : fp16_inputs_) { | |||
| if (ptr != nullptr) { | |||
| context_->allocator->Free(ptr); | |||
| ptr = nullptr; | |||
| } | |||
| } | |||
| fp16_inputs_.clear(); | |||
| } | |||
| int ConcatFp16CPUKernel::Run() { | |||
| auto prepare_ret = Prepare(); | |||
| if (prepare_ret != RET_OK) { | |||
| @@ -78,28 +90,53 @@ int ConcatFp16CPUKernel::Run() { | |||
| return prepare_ret; | |||
| } | |||
| auto input_num = in_tensors_.size(); | |||
| std::vector<float *> inputs_addr(input_num, nullptr); | |||
| std::vector<int *> inputs_output_shape(input_num + 1, nullptr); | |||
| std::vector<std::vector<int>> shapes; | |||
| for (size_t i = 0; i < input_num; ++i) { | |||
| inputs_addr[i] = reinterpret_cast<float *>(in_tensors_[i]->Data()); | |||
| if (inputs_addr[i] == nullptr) { | |||
| MS_LOG(ERROR) << "got nullptr when cast in_tensor to float ptr"; | |||
| return RET_ERROR; | |||
| const auto in_tensor = in_tensors_[i]; | |||
| if (in_tensor->data_type() == kNumberTypeFloat || in_tensor->data_type() == kNumberTypeFloat32) { | |||
| auto in_tensor_data = reinterpret_cast<float *>(in_tensor->Data()); | |||
| if (in_tensor_data == nullptr) { | |||
| MS_LOG(ERROR) << "got nullptr when cast in_tensor to float ptr"; | |||
| return RET_ERROR; | |||
| } | |||
| Float32ToFloat16(in_tensor_data, fp16_inputs_[i], in_tensor->ElementsNum()); | |||
| } else { | |||
| fp16_inputs_[i] = reinterpret_cast<float16_t *>(in_tensor->Data()); | |||
| } | |||
| Float32ToFloat16(inputs_addr[i], fp16_inputs_[i], in_tensors_[i]->ElementsNum()); | |||
| shapes.push_back(in_tensors_[i]->shape()); | |||
| inputs_output_shape[i] = shapes[i].data(); | |||
| } | |||
| auto output_shape = out_tensors_.at(0)->shape(); | |||
| inputs_output_shape[input_num] = output_shape.data(); | |||
| auto output_addr = out_tensors_.at(0)->Data(); | |||
| if (out_tensors_.at(0)->data_type() == kNumberTypeFloat16) { | |||
| fp16_output_ = reinterpret_cast<float16_t *>(out_tensors_.at(0)->Data()); | |||
| } | |||
| ConcatFp16(reinterpret_cast<void **>(fp16_inputs_.data()), input_num, axis_, inputs_output_shape.data(), | |||
| output_shape.size(), reinterpret_cast<void *>(fp16_output_)); | |||
| Float16ToFloat32(fp16_output_, reinterpret_cast<float *>(output_addr), out_tensors_.at(0)->ElementsNum()); | |||
| // free fp16 in out buffer | |||
| if (out_tensors_.at(0)->data_type() == kNumberTypeFloat32 || out_tensors_.at(0)->data_type() == kNumberTypeFloat) { | |||
| Float16ToFloat32(fp16_output_, reinterpret_cast<float *>(output_addr), out_tensors_.at(0)->ElementsNum()); | |||
| context_->allocator->Free(fp16_output_); | |||
| fp16_output_ = nullptr; | |||
| } | |||
| for (auto i = 0; i < fp16_inputs_.size(); i++) { | |||
| const auto in_tensor = in_tensors_[i]; | |||
| if (in_tensor->data_type() == kNumberTypeFloat || in_tensor->data_type() == kNumberTypeFloat32) { | |||
| auto ptr = fp16_inputs_[i]; | |||
| if (ptr != nullptr) { | |||
| context_->allocator->Free(ptr); | |||
| ptr = nullptr; | |||
| } | |||
| } | |||
| } | |||
| fp16_inputs_.clear(); | |||
| return RET_OK; | |||
| } | |||
| @@ -32,13 +32,7 @@ class ConcatFp16CPUKernel : public ConcatBaseCPUKernel { | |||
| const lite::Primitive *primitive) | |||
| : ConcatBaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {} | |||
| ~ConcatFp16CPUKernel() { | |||
| for (auto ptr : fp16_inputs_) { | |||
| if (ptr != nullptr) { | |||
| free(ptr); | |||
| } | |||
| } | |||
| } | |||
| ~ConcatFp16CPUKernel() = default; | |||
| int Init() override; | |||
| @@ -46,6 +40,9 @@ class ConcatFp16CPUKernel : public ConcatBaseCPUKernel { | |||
| int Run() override; | |||
| private: | |||
| void FreeTmpBuffer(); | |||
| private: | |||
| std::vector<float16_t *> fp16_inputs_; | |||
| float16_t *fp16_output_ = nullptr; | |||
| @@ -59,14 +59,17 @@ int ReduceFp16CPUKernel::Init() { | |||
| int ReduceFp16CPUKernel::ReSize() { | |||
| if (fp16_input_ != nullptr) { | |||
| free(fp16_input_); | |||
| context_->allocator->Free(fp16_input_); | |||
| fp16_input_ = nullptr; | |||
| } | |||
| auto ele_num = in_tensors_.at(0)->ElementsNum(); | |||
| fp16_input_ = reinterpret_cast<float16_t *>(malloc(sizeof(float16_t) * ele_num)); | |||
| if (fp16_input_ == nullptr) { | |||
| MS_LOG(ERROR) << "malloc fp16_src_data_ falied"; | |||
| return RET_ERROR; | |||
| auto in_tensor = in_tensors_.front(); | |||
| if (in_tensor->data_type() == kNumberTypeFloat32 || in_tensor->data_type() == kNumberTypeFloat) { | |||
| fp16_input_ = | |||
| reinterpret_cast<float16_t *>(context_->allocator->Malloc(in_tensor->ElementsNum() * sizeof(float16_t))); | |||
| if (fp16_input_ == nullptr) { | |||
| return RET_ERROR; | |||
| } | |||
| Float32ToFloat16(reinterpret_cast<float *>(in_tensor->Data()), fp16_input_, in_tensor->ElementsNum()); | |||
| } | |||
| return MallocTmpBuffer(); | |||
| } | |||
| @@ -93,10 +96,12 @@ int ReduceFp16CPUKernel::Run() { | |||
| MS_LOG(ERROR) << "Prepare fail!ret: " << prepare_ret; | |||
| return prepare_ret; | |||
| } | |||
| tmp_shape_ = in_tensors_.at(0)->shape(); | |||
| src_data_ = static_cast<float *>(in_tensors_.at(0)->Data()); | |||
| auto ele_num = in_tensors_.at(0)->ElementsNum(); | |||
| Float32ToFloat16(src_data_, fp16_input_, ele_num); | |||
| auto in_tensor = in_tensors_.at(0); | |||
| if (in_tensor->data_type() == kNumberTypeFloat16) { | |||
| fp16_input_ = reinterpret_cast<float16_t *>(in_tensor->Data()); | |||
| } | |||
| fp16_src_data_ = fp16_input_; | |||
| for (int i = 0; i < data_buffers_.size(); ++i) { | |||
| fp16_dst_data_ = data_buffers_[i]; | |||
| @@ -119,19 +124,36 @@ int ReduceFp16CPUKernel::Run() { | |||
| fp16_src_data_ = fp16_dst_data_; | |||
| } | |||
| dst_data_ = reinterpret_cast<float *>(out_tensors_.at(0)->Data()); | |||
| Float16ToFloat32(fp16_dst_data_, dst_data_, out_tensors_.at(0)->ElementsNum()); | |||
| auto out_tensor = out_tensors_.at(0); | |||
| if (out_tensor->data_type() == kNumberTypeFloat32 || out_tensor->data_type() == kNumberTypeFloat) { | |||
| dst_data_ = reinterpret_cast<float *>(out_tensor->Data()); | |||
| Float16ToFloat32(fp16_dst_data_, dst_data_, out_tensor->ElementsNum()); | |||
| } else { | |||
| memcpy(out_tensor->Data(), fp16_dst_data_, out_tensor->ElementsNum() * sizeof(float16_t)); | |||
| } | |||
| if (in_tensor->data_type() == kNumberTypeFloat32 || in_tensor->data_type() == kNumberTypeFloat) { | |||
| context_->allocator->Free(fp16_input_); | |||
| } | |||
| fp16_input_ = nullptr; | |||
| FreeTmpBuffer(); | |||
| return RET_OK; | |||
| } | |||
| int ReduceFp16CPUKernel::MallocTmpBuffer() { | |||
| int ReduceFp16CPUKernel::FreeTmpBuffer() { | |||
| for (auto buffer : data_buffers_) { | |||
| if (buffer != nullptr) { | |||
| free(buffer); | |||
| context_->allocator->Free(buffer); | |||
| buffer = nullptr; | |||
| } | |||
| } | |||
| data_buffers_.clear(); | |||
| return RET_OK; | |||
| } | |||
| int ReduceFp16CPUKernel::MallocTmpBuffer() { | |||
| auto ret = FreeTmpBuffer(); | |||
| auto input_shape = in_tensors_.at(0)->shape(); | |||
| for (auto i = 0; i < num_axes_; i++) { | |||
| @@ -142,7 +164,7 @@ int ReduceFp16CPUKernel::MallocTmpBuffer() { | |||
| size *= input_shape[j]; | |||
| } | |||
| } | |||
| float16_t *buffer = reinterpret_cast<float16_t *>(malloc(size * sizeof(float16_t))); | |||
| float16_t *buffer = reinterpret_cast<float16_t *>(context_->allocator->Malloc(size * sizeof(float16_t))); | |||
| if (buffer == nullptr) { | |||
| MS_LOG(ERROR) << "Malloc data failed."; | |||
| return RET_ERROR; | |||
| @@ -150,7 +172,7 @@ int ReduceFp16CPUKernel::MallocTmpBuffer() { | |||
| data_buffers_.emplace_back(buffer); | |||
| input_shape[axis] = 1; | |||
| } | |||
| return RET_OK; | |||
| return ret; | |||
| } | |||
| kernel::LiteKernel *CpuReduceFp16KernelCreator(const std::vector<lite::tensor::Tensor *> &inputs, | |||
| @@ -34,21 +34,7 @@ class ReduceFp16CPUKernel : public ReduceBaseCPUKernel { | |||
| const std::vector<lite::tensor::Tensor *> &outputs, const lite::Context *ctx, | |||
| const lite::Primitive *primitive) | |||
| : ReduceBaseCPUKernel(param, inputs, outputs, ctx, primitive) {} | |||
| ~ReduceFp16CPUKernel() { | |||
| for (auto i = 0; i < data_buffers_.size(); i++) { | |||
| float16_t *buffer = data_buffers_[i]; | |||
| if (buffer != nullptr) { | |||
| free(buffer); | |||
| buffer = nullptr; | |||
| } | |||
| } | |||
| if (fp16_input_ != nullptr) { | |||
| free(fp16_input_); | |||
| fp16_input_ = nullptr; | |||
| } | |||
| src_data_ = nullptr; | |||
| dst_data_ = nullptr; | |||
| } | |||
| ~ReduceFp16CPUKernel() = default; | |||
| int Init() override; | |||
| int ReSize() override; | |||
| @@ -66,6 +52,7 @@ class ReduceFp16CPUKernel : public ReduceBaseCPUKernel { | |||
| private: | |||
| int MallocTmpBuffer(); | |||
| int FreeTmpBuffer(); | |||
| }; | |||
| } // namespace mindspore::kernel | |||
| @@ -46,29 +46,44 @@ int TransposeFp16CPUKernel::Init() { | |||
| } | |||
| int TransposeFp16CPUKernel::ReSize() { | |||
| auto &inTensor = in_tensors_.front(); | |||
| auto &outTensor = out_tensors_.front(); | |||
| auto &in_tensor = in_tensors_.front(); | |||
| auto &out_tensor = out_tensors_.front(); | |||
| auto param = reinterpret_cast<TransposeParameter *>(op_parameter_); | |||
| auto in_shape = inTensor->shape(); | |||
| auto out_shape = outTensor->shape(); | |||
| auto in_shape = in_tensor->shape(); | |||
| auto out_shape = out_tensor->shape(); | |||
| param->strides_[param->num_axes_ - 1] = 1; | |||
| param->out_strides_[param->num_axes_ - 1] = 1; | |||
| param->data_size_ = inTensor->Size(); | |||
| param->data_size_ = in_tensor->Size(); | |||
| for (int i = param->num_axes_ - 2; i >= 0; i--) { | |||
| param->strides_[i] = in_shape[i + 1] * param->strides_[i + 1]; | |||
| param->out_strides_[i] = out_shape[i + 1] * param->out_strides_[i + 1]; | |||
| } | |||
| if (fp16_in_data_ != nullptr) { | |||
| free(fp16_in_data_); | |||
| context_->allocator->Free(fp16_in_data_); | |||
| fp16_in_data_ = nullptr; | |||
| } | |||
| fp16_in_data_ = reinterpret_cast<float16_t *>(malloc(sizeof(float16_t) * inTensor->ElementsNum())); | |||
| if (in_tensor->data_type() == kNumberTypeFloat || in_tensor->data_type() == kNumberTypeFloat32) { | |||
| fp16_in_data_ = | |||
| reinterpret_cast<float16_t *>(context_->allocator->Malloc(sizeof(float16_t) * in_tensor->ElementsNum())); | |||
| if (fp16_in_data_ == nullptr) { | |||
| MS_LOG(ERROR) << "Malloc data failed"; | |||
| return RET_ERROR; | |||
| } | |||
| } | |||
| if (fp16_out_data_ != nullptr) { | |||
| free(fp16_out_data_); | |||
| context_->allocator->Free(fp16_out_data_); | |||
| fp16_out_data_ = nullptr; | |||
| } | |||
| fp16_out_data_ = reinterpret_cast<float16_t *>(malloc(sizeof(float16_t) * outTensor->ElementsNum())); | |||
| if (out_tensor->data_type() == kNumberTypeFloat || out_tensor->data_type() == kNumberTypeFloat32) { | |||
| fp16_out_data_ = | |||
| reinterpret_cast<float16_t *>(context_->allocator->Malloc(sizeof(float16_t) * out_tensor->ElementsNum())); | |||
| if (fp16_out_data_ == nullptr) { | |||
| MS_LOG(ERROR) << "Malloc data failed"; | |||
| return RET_ERROR; | |||
| } | |||
| } | |||
| return RET_OK; | |||
| } | |||
| @@ -79,12 +94,27 @@ int TransposeFp16CPUKernel::TransposeParallel(int task_id) { | |||
| } | |||
| int thread_offset = task_id * thread_h_stride_; | |||
| TransposeParameter *param = reinterpret_cast<TransposeParameter *>(this->op_parameter_); | |||
| if (in_tensors_.at(0)->data_type() == kNumberTypeFloat16) { | |||
| fp16_in_data_ = reinterpret_cast<float16_t *>(in_tensors_.at(0)->Data()); | |||
| } | |||
| if (out_tensors_.at(0)->data_type() == kNumberTypeFloat16) { | |||
| fp16_out_data_ = reinterpret_cast<float16_t *>(out_tensors_.at(0)->Data()); | |||
| } | |||
| auto ret = DoTranspose(fp16_in_data_, fp16_out_data_, in_shape_, out_shape_, param, thread_offset, | |||
| thread_offset + num_unit_thread); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Transpose error task_id[" << task_id << "] error_code[" << ret << "]"; | |||
| return RET_ERROR; | |||
| } | |||
| if (in_tensors_.at(0)->data_type() == kNumberTypeFloat32 || in_tensors_.at(0)->data_type() == kNumberTypeFloat) { | |||
| context_->allocator->Free(fp16_in_data_); | |||
| } | |||
| if (out_tensors_.at(0)->data_type() == kNumberTypeFloat32 || out_tensors_.at(0)->data_type() == kNumberTypeFloat) { | |||
| context_->allocator->Free(fp16_out_data_); | |||
| } | |||
| return RET_OK; | |||
| } | |||
| @@ -112,9 +142,17 @@ int TransposeFp16CPUKernel::Run() { | |||
| MS_LOG(ERROR) << "null pointer dreferencing."; | |||
| return RET_ERROR; | |||
| } | |||
| in_data_ = reinterpret_cast<float *>(in_tensor->Data()); | |||
| out_data_ = reinterpret_cast<float *>(out_tensor->Data()); | |||
| Float32ToFloat16(in_data_, fp16_in_data_, in_tensor->ElementsNum()); | |||
| if (in_tensor->data_type() == kNumberTypeFloat || in_tensor->data_type() == kNumberTypeFloat32) { | |||
| in_data_ = reinterpret_cast<float *>(in_tensor->Data()); | |||
| Float32ToFloat16(in_data_, fp16_in_data_, in_tensor->ElementsNum()); | |||
| } else { | |||
| fp16_in_data_ = reinterpret_cast<float16_t *>(in_tensor->Data()); | |||
| } | |||
| if (out_tensor->data_type() == kNumberTypeFloat16) { | |||
| fp16_out_data_ = reinterpret_cast<float16_t *>(out_tensor->Data()); | |||
| } | |||
| in_shape_ = const_cast<int *>(in_tensor->shape().data()); | |||
| out_shape_ = const_cast<int *>(out_tensor->shape().data()); | |||
| @@ -123,9 +161,24 @@ int TransposeFp16CPUKernel::Run() { | |||
| MS_LOG(ERROR) << "Tranpose error error_code[" << ret << "]"; | |||
| return ret; | |||
| } | |||
| Float16ToFloat32(fp16_out_data_, out_data_, out_tensor->ElementsNum()); | |||
| if (in_tensor->data_type() == kNumberTypeFloat || in_tensor->data_type() == kNumberTypeFloat32) { | |||
| context_->allocator->Free(fp16_in_data_); | |||
| fp16_in_data_ = nullptr; | |||
| } | |||
| if (out_tensor->data_type() == kNumberTypeFloat || out_tensor->data_type() == kNumberTypeFloat32) { | |||
| out_data_ = reinterpret_cast<float *>(out_tensor->Data()); | |||
| if (out_data_ == nullptr) { | |||
| return RET_ERROR; | |||
| } | |||
| Float16ToFloat32(fp16_out_data_, out_data_, out_tensor->ElementsNum()); | |||
| context_->allocator->Free(fp16_out_data_); | |||
| fp16_out_data_ = nullptr; | |||
| } | |||
| return ret; | |||
| } // namespace mindspore::kernel | |||
| } | |||
| kernel::LiteKernel *CpuTransposeFp16KernelCreator(const std::vector<lite::tensor::Tensor *> &inputs, | |||
| const std::vector<lite::tensor::Tensor *> &outputs, | |||
| @@ -30,16 +30,7 @@ class TransposeFp16CPUKernel : public LiteKernel { | |||
| const std::vector<lite::tensor::Tensor *> &outputs, const lite::Context *ctx, | |||
| const lite::Primitive *primitive) | |||
| : LiteKernel(param, inputs, outputs, ctx, primitive), thread_num_(ctx->thread_num_) {} | |||
| ~TransposeFp16CPUKernel() { | |||
| if (fp16_in_data_ != nullptr) { | |||
| free(fp16_in_data_); | |||
| fp16_in_data_ = nullptr; | |||
| } | |||
| if (fp16_out_data_ != nullptr) { | |||
| free(fp16_out_data_); | |||
| fp16_out_data_ = nullptr; | |||
| } | |||
| } | |||
| ~TransposeFp16CPUKernel() = default; | |||
| int Init() override; | |||
| int ReSize() override; | |||