@@ -42,35 +42,47 @@ int ConcatFp16CPUKernel::Init() {
 }
 
 int ConcatFp16CPUKernel::ReSize() {
-  for (auto ptr : fp16_inputs_) {
-    if (ptr != nullptr) {
-      free(ptr);
-      ptr = nullptr;
+  FreeTmpBuffer();
+  for (const auto &in_tensor : in_tensors_) {
+    float16_t *ptr = nullptr;
+    if (in_tensor->data_type() == kNumberTypeFloat32 || in_tensor->data_type() == kNumberTypeFloat) {
+      ptr = reinterpret_cast<float16_t *>(context_->allocator->Malloc(sizeof(float16_t) * in_tensor->ElementsNum()));
+      if (ptr == nullptr) {
+        MS_LOG(ERROR) << "malloc failed";
+        return RET_ERROR;
+      }
     }
+    fp16_inputs_.push_back(ptr);
   }
-  fp16_inputs_.clear();
-  for (size_t i = 0; i < in_tensors_.size(); ++i) {
-    float16_t *ptr = nullptr;
-    ptr = reinterpret_cast<float16_t *>(malloc(sizeof(float16_t) * in_tensors_[i]->ElementsNum()));
-    if (ptr == nullptr) {
+
+  auto &out_tensor = out_tensors_.at(0);
+  if (out_tensor->data_type() == kNumberTypeFloat32 || out_tensor->data_type() == kNumberTypeFloat) {
+    if (fp16_output_ != nullptr) {
+      context_->allocator->Free(fp16_output_);
+      fp16_output_ = nullptr;
+    }
+    fp16_output_ =
+      reinterpret_cast<float16_t *>(context_->allocator->Malloc(sizeof(float16_t) * out_tensors_[0]->ElementsNum()));
+    if (fp16_output_ == nullptr) {
       MS_LOG(ERROR) << "malloc failed";
       return RET_ERROR;
     }
-    fp16_inputs_.push_back(ptr);
   }
-  if (fp16_output_ != nullptr) {
-    free(fp16_output_);
-    fp16_output_ = nullptr;
-  }
-  fp16_output_ = reinterpret_cast<float16_t *>(malloc(sizeof(float16_t) * out_tensors_[0]->ElementsNum()));
-  if (fp16_output_ == nullptr) {
-    MS_LOG(ERROR) << "malloc failed";
-    return RET_ERROR;
-  }
   return ConcatBaseCPUKernel::ReSize();
 }
 
+void ConcatFp16CPUKernel::FreeTmpBuffer() {
+  for (auto ptr : fp16_inputs_) {
+    if (ptr != nullptr) {
+      context_->allocator->Free(ptr);
+      ptr = nullptr;
+    }
+  }
+  fp16_inputs_.clear();
+}
+
 int ConcatFp16CPUKernel::Run() {
   auto prepare_ret = Prepare();
   if (prepare_ret != RET_OK) {
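A note on the new ReSize(): fp16_inputs_ stays index-aligned with in_tensors_. Inputs that arrive as fp32 get a scratch buffer from the context allocator here; inputs that are already fp16 keep a nullptr placeholder, which Run() later replaces with the tensor's own data pointer. A minimal sketch of that slot-building logic, assuming the lite::Allocator Malloc/Free interface used in the hunk above (BuildFp16Slots is a hypothetical name, not part of this patch):

    std::vector<float16_t *> BuildFp16Slots(const std::vector<lite::tensor::Tensor *> &inputs,
                                            mindspore::lite::Allocator *allocator) {
      std::vector<float16_t *> slots;
      for (auto *t : inputs) {
        float16_t *slot = nullptr;
        if (t->data_type() == kNumberTypeFloat32 || t->data_type() == kNumberTypeFloat) {
          // fp32 input: reserve fp16 scratch now (the patch bails out with RET_ERROR
          // on a null return); Run() fills it via Float32ToFloat16.
          slot = reinterpret_cast<float16_t *>(allocator->Malloc(sizeof(float16_t) * t->ElementsNum()));
        }
        slots.push_back(slot);  // nullptr placeholder keeps slot i paired with input i
      }
      return slots;
    }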
@@ -78,28 +90,53 @@ int ConcatFp16CPUKernel::Run() {
     return prepare_ret;
   }
   auto input_num = in_tensors_.size();
-  std::vector<float *> inputs_addr(input_num, nullptr);
   std::vector<int *> inputs_output_shape(input_num + 1, nullptr);
   std::vector<std::vector<int>> shapes;
   for (size_t i = 0; i < input_num; ++i) {
-    inputs_addr[i] = reinterpret_cast<float *>(in_tensors_[i]->Data());
-    if (inputs_addr[i] == nullptr) {
-      MS_LOG(ERROR) << "got nullptr when cast in_tensor to float ptr";
-      return RET_ERROR;
+    const auto in_tensor = in_tensors_[i];
+    if (in_tensor->data_type() == kNumberTypeFloat || in_tensor->data_type() == kNumberTypeFloat32) {
+      auto in_tensor_data = reinterpret_cast<float *>(in_tensor->Data());
+      if (in_tensor_data == nullptr) {
+        MS_LOG(ERROR) << "got nullptr when cast in_tensor to float ptr";
+        return RET_ERROR;
+      }
+      Float32ToFloat16(in_tensor_data, fp16_inputs_[i], in_tensor->ElementsNum());
+    } else {
+      fp16_inputs_[i] = reinterpret_cast<float16_t *>(in_tensor->Data());
     }
-    Float32ToFloat16(inputs_addr[i], fp16_inputs_[i], in_tensors_[i]->ElementsNum());
     shapes.push_back(in_tensors_[i]->shape());
     inputs_output_shape[i] = shapes[i].data();
   }
   auto output_shape = out_tensors_.at(0)->shape();
   inputs_output_shape[input_num] = output_shape.data();
   auto output_addr = out_tensors_.at(0)->Data();
+  if (out_tensors_.at(0)->data_type() == kNumberTypeFloat16) {
+    fp16_output_ = reinterpret_cast<float16_t *>(out_tensors_.at(0)->Data());
+  }
   ConcatFp16(reinterpret_cast<void **>(fp16_inputs_.data()), input_num, axis_, inputs_output_shape.data(),
              output_shape.size(), reinterpret_cast<void *>(fp16_output_));
-  Float16ToFloat32(fp16_output_, reinterpret_cast<float *>(output_addr), out_tensors_.at(0)->ElementsNum());
+  // free fp16 in out buffer
+  if (out_tensors_.at(0)->data_type() == kNumberTypeFloat32 || out_tensors_.at(0)->data_type() == kNumberTypeFloat) {
+    Float16ToFloat32(fp16_output_, reinterpret_cast<float *>(output_addr), out_tensors_.at(0)->ElementsNum());
+    context_->allocator->Free(fp16_output_);
+    fp16_output_ = nullptr;
+  }
+  for (auto i = 0; i < fp16_inputs_.size(); i++) {
+    const auto in_tensor = in_tensors_[i];
+    if (in_tensor->data_type() == kNumberTypeFloat || in_tensor->data_type() == kNumberTypeFloat32) {
+      auto ptr = fp16_inputs_[i];
+      if (ptr != nullptr) {
+        context_->allocator->Free(ptr);
+        ptr = nullptr;
+      }
+    }
+  }
+  fp16_inputs_.clear();
   return RET_OK;
 }
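Both sides of Run() now follow a single convert-or-alias rule: go through scratch memory only when the tensor holds fp32 data, otherwise use the tensor's fp16 buffer directly and skip the copy. Sketched as a free-standing helper under the same assumptions as above (GetFp16Input is hypothetical; Float32ToFloat16 is the conversion routine from the patch):

    // Returns a pointer usable as fp16 kernel input. *owned tells the caller
    // whether the buffer came from the allocator and must be freed after use.
    float16_t *GetFp16Input(lite::tensor::Tensor *t, mindspore::lite::Allocator *allocator, bool *owned) {
      if (t->data_type() == kNumberTypeFloat32 || t->data_type() == kNumberTypeFloat) {
        auto *scratch = reinterpret_cast<float16_t *>(allocator->Malloc(sizeof(float16_t) * t->ElementsNum()));
        if (scratch != nullptr) {
          Float32ToFloat16(reinterpret_cast<float *>(t->Data()), scratch, t->ElementsNum());
        }
        *owned = true;
        return scratch;
      }
      *owned = false;  // aliases tensor memory; freeing it would corrupt the tensor
      return reinterpret_cast<float16_t *>(t->Data());
    }

The owned flag plays the role of the data-type checks Run() repeats before each Free in the hunk above: only scratch conversions go back to the allocator.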
@@ -32,13 +32,7 @@ class ConcatFp16CPUKernel : public ConcatBaseCPUKernel {
                      const lite::Primitive *primitive)
       : ConcatBaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {}
-  ~ConcatFp16CPUKernel() {
-    for (auto ptr : fp16_inputs_) {
-      if (ptr != nullptr) {
-        free(ptr);
-      }
-    }
-  }
+  ~ConcatFp16CPUKernel() = default;
 
   int Init() override;
@@ -46,6 +40,9 @@ class ConcatFp16CPUKernel : public ConcatBaseCPUKernel {
   int Run() override;
 
+ private:
+  void FreeTmpBuffer();
+
  private:
   std::vector<float16_t *> fp16_inputs_;
   float16_t *fp16_output_ = nullptr;
@@ -59,14 +59,17 @@ int ReduceFp16CPUKernel::Init() {
 
 int ReduceFp16CPUKernel::ReSize() {
   if (fp16_input_ != nullptr) {
-    free(fp16_input_);
+    context_->allocator->Free(fp16_input_);
     fp16_input_ = nullptr;
   }
-  auto ele_num = in_tensors_.at(0)->ElementsNum();
-  fp16_input_ = reinterpret_cast<float16_t *>(malloc(sizeof(float16_t) * ele_num));
-  if (fp16_input_ == nullptr) {
-    MS_LOG(ERROR) << "malloc fp16_src_data_ falied";
-    return RET_ERROR;
+  auto in_tensor = in_tensors_.front();
+  if (in_tensor->data_type() == kNumberTypeFloat32 || in_tensor->data_type() == kNumberTypeFloat) {
+    fp16_input_ =
+      reinterpret_cast<float16_t *>(context_->allocator->Malloc(in_tensor->ElementsNum() * sizeof(float16_t)));
+    if (fp16_input_ == nullptr) {
+      return RET_ERROR;
+    }
+    Float32ToFloat16(reinterpret_cast<float *>(in_tensor->Data()), fp16_input_, in_tensor->ElementsNum());
   }
   return MallocTmpBuffer();
 }
@@ -93,10 +96,12 @@ int ReduceFp16CPUKernel::Run() {
     MS_LOG(ERROR) << "Prepare fail!ret: " << prepare_ret;
     return prepare_ret;
   }
 
   tmp_shape_ = in_tensors_.at(0)->shape();
-  src_data_ = static_cast<float *>(in_tensors_.at(0)->Data());
-  auto ele_num = in_tensors_.at(0)->ElementsNum();
-  Float32ToFloat16(src_data_, fp16_input_, ele_num);
+  auto in_tensor = in_tensors_.at(0);
+  if (in_tensor->data_type() == kNumberTypeFloat16) {
+    fp16_input_ = reinterpret_cast<float16_t *>(in_tensor->Data());
+  }
   fp16_src_data_ = fp16_input_;
   for (int i = 0; i < data_buffers_.size(); ++i) {
     fp16_dst_data_ = data_buffers_[i];
@@ -119,19 +124,36 @@ int ReduceFp16CPUKernel::Run() {
     fp16_src_data_ = fp16_dst_data_;
   }
-  dst_data_ = reinterpret_cast<float *>(out_tensors_.at(0)->Data());
-  Float16ToFloat32(fp16_dst_data_, dst_data_, out_tensors_.at(0)->ElementsNum());
+  auto out_tensor = out_tensors_.at(0);
+  if (out_tensor->data_type() == kNumberTypeFloat32 || out_tensor->data_type() == kNumberTypeFloat) {
+    dst_data_ = reinterpret_cast<float *>(out_tensor->Data());
+    Float16ToFloat32(fp16_dst_data_, dst_data_, out_tensor->ElementsNum());
+  } else {
+    memcpy(out_tensor->Data(), fp16_dst_data_, out_tensor->ElementsNum() * sizeof(float16_t));
+  }
+  if (in_tensor->data_type() == kNumberTypeFloat32 || in_tensor->data_type() == kNumberTypeFloat) {
+    context_->allocator->Free(fp16_input_);
+  }
+  fp16_input_ = nullptr;
+  FreeTmpBuffer();
   return RET_OK;
 }
 
-int ReduceFp16CPUKernel::MallocTmpBuffer() {
+int ReduceFp16CPUKernel::FreeTmpBuffer() {
   for (auto buffer : data_buffers_) {
     if (buffer != nullptr) {
-      free(buffer);
+      context_->allocator->Free(buffer);
       buffer = nullptr;
     }
   }
   data_buffers_.clear();
+  return RET_OK;
+}
+
+int ReduceFp16CPUKernel::MallocTmpBuffer() {
+  auto ret = FreeTmpBuffer();
   auto input_shape = in_tensors_.at(0)->shape();
   for (auto i = 0; i < num_axes_; i++) {
@@ -142,7 +164,7 @@ int ReduceFp16CPUKernel::MallocTmpBuffer() {
         size *= input_shape[j];
       }
     }
-    float16_t *buffer = reinterpret_cast<float16_t *>(malloc(size * sizeof(float16_t)));
+    float16_t *buffer = reinterpret_cast<float16_t *>(context_->allocator->Malloc(size * sizeof(float16_t)));
     if (buffer == nullptr) {
       MS_LOG(ERROR) << "Malloc data failed.";
       return RET_ERROR;
@@ -150,7 +172,7 @@ int ReduceFp16CPUKernel::MallocTmpBuffer() {
     data_buffers_.emplace_back(buffer);
     input_shape[axis] = 1;
   }
-  return RET_OK;
+  return ret;
 }
 
 kernel::LiteKernel *CpuReduceFp16KernelCreator(const std::vector<lite::tensor::Tensor *> &inputs,
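The malloc/free to context_->allocator Malloc/Free switch runs through every hunk in this patch; the context allocator can hand back recycled blocks across inferences instead of hitting the heap on each Run(). The reduce kernel's loop is the clearest consumer of those scratch blocks: each axis reduction writes into the next data_buffers_ entry, and the result of one step becomes the source of the next. A condensed sketch of that chain (per-axis kernel call elided, names hypothetical):

    // Walks the precomputed scratch buffers, one per reduced axis, so only the
    // final fp16_dst needs converting or copying into the output tensor.
    float16_t *ReduceChainSketch(const std::vector<float16_t *> &buffers, float16_t *fp16_src) {
      float16_t *fp16_dst = fp16_src;
      for (auto *buffer : buffers) {
        fp16_dst = buffer;
        // ... per-axis fp16 reduce kernel reads fp16_src, writes fp16_dst ...
        fp16_src = fp16_dst;  // this step's output feeds the next axis
      }
      return fp16_dst;  // caller does Float16ToFloat32 or memcpy, as in Run() above
    }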
@@ -34,21 +34,7 @@ class ReduceFp16CPUKernel : public ReduceBaseCPUKernel {
                      const std::vector<lite::tensor::Tensor *> &outputs, const lite::Context *ctx,
                      const lite::Primitive *primitive)
       : ReduceBaseCPUKernel(param, inputs, outputs, ctx, primitive) {}
-  ~ReduceFp16CPUKernel() {
-    for (auto i = 0; i < data_buffers_.size(); i++) {
-      float16_t *buffer = data_buffers_[i];
-      if (buffer != nullptr) {
-        free(buffer);
-        buffer = nullptr;
-      }
-    }
-    if (fp16_input_ != nullptr) {
-      free(fp16_input_);
-      fp16_input_ = nullptr;
-    }
-    src_data_ = nullptr;
-    dst_data_ = nullptr;
-  }
+  ~ReduceFp16CPUKernel() = default;
 
   int Init() override;
   int ReSize() override;
@@ -66,6 +52,7 @@ class ReduceFp16CPUKernel : public ReduceBaseCPUKernel {
 
  private:
   int MallocTmpBuffer();
+  int FreeTmpBuffer();
 };
 }  // namespace mindspore::kernel
@@ -46,29 +46,44 @@ int TransposeFp16CPUKernel::Init() {
 }
 
 int TransposeFp16CPUKernel::ReSize() {
-  auto &inTensor = in_tensors_.front();
-  auto &outTensor = out_tensors_.front();
+  auto &in_tensor = in_tensors_.front();
+  auto &out_tensor = out_tensors_.front();
   auto param = reinterpret_cast<TransposeParameter *>(op_parameter_);
-  auto in_shape = inTensor->shape();
-  auto out_shape = outTensor->shape();
+  auto in_shape = in_tensor->shape();
+  auto out_shape = out_tensor->shape();
   param->strides_[param->num_axes_ - 1] = 1;
   param->out_strides_[param->num_axes_ - 1] = 1;
-  param->data_size_ = inTensor->Size();
+  param->data_size_ = in_tensor->Size();
   for (int i = param->num_axes_ - 2; i >= 0; i--) {
     param->strides_[i] = in_shape[i + 1] * param->strides_[i + 1];
     param->out_strides_[i] = out_shape[i + 1] * param->out_strides_[i + 1];
   }
 
   if (fp16_in_data_ != nullptr) {
-    free(fp16_in_data_);
+    context_->allocator->Free(fp16_in_data_);
     fp16_in_data_ = nullptr;
   }
-  fp16_in_data_ = reinterpret_cast<float16_t *>(malloc(sizeof(float16_t) * inTensor->ElementsNum()));
+  if (in_tensor->data_type() == kNumberTypeFloat || in_tensor->data_type() == kNumberTypeFloat32) {
+    fp16_in_data_ =
+      reinterpret_cast<float16_t *>(context_->allocator->Malloc(sizeof(float16_t) * in_tensor->ElementsNum()));
+    if (fp16_in_data_ == nullptr) {
+      MS_LOG(ERROR) << "Malloc data failed";
+      return RET_ERROR;
+    }
+  }
   if (fp16_out_data_ != nullptr) {
-    free(fp16_out_data_);
+    context_->allocator->Free(fp16_out_data_);
     fp16_out_data_ = nullptr;
   }
-  fp16_out_data_ = reinterpret_cast<float16_t *>(malloc(sizeof(float16_t) * outTensor->ElementsNum()));
+  if (out_tensor->data_type() == kNumberTypeFloat || out_tensor->data_type() == kNumberTypeFloat32) {
+    fp16_out_data_ =
+      reinterpret_cast<float16_t *>(context_->allocator->Malloc(sizeof(float16_t) * out_tensor->ElementsNum()));
+    if (fp16_out_data_ == nullptr) {
+      MS_LOG(ERROR) << "Malloc data failed";
+      return RET_ERROR;
+    }
+  }
   return RET_OK;
 }
@@ -79,12 +94,27 @@ int TransposeFp16CPUKernel::TransposeParallel(int task_id) {
   }
   int thread_offset = task_id * thread_h_stride_;
   TransposeParameter *param = reinterpret_cast<TransposeParameter *>(this->op_parameter_);
+  if (in_tensors_.at(0)->data_type() == kNumberTypeFloat16) {
+    fp16_in_data_ = reinterpret_cast<float16_t *>(in_tensors_.at(0)->Data());
+  }
+  if (out_tensors_.at(0)->data_type() == kNumberTypeFloat16) {
+    fp16_out_data_ = reinterpret_cast<float16_t *>(out_tensors_.at(0)->Data());
+  }
   auto ret = DoTranspose(fp16_in_data_, fp16_out_data_, in_shape_, out_shape_, param, thread_offset,
                          thread_offset + num_unit_thread);
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "Transpose error task_id[" << task_id << "] error_code[" << ret << "]";
     return RET_ERROR;
   }
+  if (in_tensors_.at(0)->data_type() == kNumberTypeFloat32 || in_tensors_.at(0)->data_type() == kNumberTypeFloat) {
+    context_->allocator->Free(fp16_in_data_);
+  }
+  if (out_tensors_.at(0)->data_type() == kNumberTypeFloat32 || out_tensors_.at(0)->data_type() == kNumberTypeFloat) {
+    context_->allocator->Free(fp16_out_data_);
+  }
   return RET_OK;
 }
@@ -112,9 +142,17 @@ int TransposeFp16CPUKernel::Run() {
     MS_LOG(ERROR) << "null pointer dreferencing.";
     return RET_ERROR;
   }
-  in_data_ = reinterpret_cast<float *>(in_tensor->Data());
-  out_data_ = reinterpret_cast<float *>(out_tensor->Data());
-  Float32ToFloat16(in_data_, fp16_in_data_, in_tensor->ElementsNum());
+  if (in_tensor->data_type() == kNumberTypeFloat || in_tensor->data_type() == kNumberTypeFloat32) {
+    in_data_ = reinterpret_cast<float *>(in_tensor->Data());
+    Float32ToFloat16(in_data_, fp16_in_data_, in_tensor->ElementsNum());
+  } else {
+    fp16_in_data_ = reinterpret_cast<float16_t *>(in_tensor->Data());
+  }
+  if (out_tensor->data_type() == kNumberTypeFloat16) {
+    fp16_out_data_ = reinterpret_cast<float16_t *>(out_tensor->Data());
+  }
 
   in_shape_ = const_cast<int *>(in_tensor->shape().data());
   out_shape_ = const_cast<int *>(out_tensor->shape().data());
@@ -123,9 +161,24 @@ int TransposeFp16CPUKernel::Run() {
     MS_LOG(ERROR) << "Tranpose error error_code[" << ret << "]";
     return ret;
   }
-  Float16ToFloat32(fp16_out_data_, out_data_, out_tensor->ElementsNum());
+  if (in_tensor->data_type() == kNumberTypeFloat || in_tensor->data_type() == kNumberTypeFloat32) {
+    context_->allocator->Free(fp16_in_data_);
+    fp16_in_data_ = nullptr;
+  }
+  if (out_tensor->data_type() == kNumberTypeFloat || out_tensor->data_type() == kNumberTypeFloat32) {
+    out_data_ = reinterpret_cast<float *>(out_tensor->Data());
+    if (out_data_ == nullptr) {
+      return RET_ERROR;
+    }
+    Float16ToFloat32(fp16_out_data_, out_data_, out_tensor->ElementsNum());
+    context_->allocator->Free(fp16_out_data_);
+    fp16_out_data_ = nullptr;
+  }
   return ret;
-}  // namespace mindspore::kernel
+}
 
 kernel::LiteKernel *CpuTransposeFp16KernelCreator(const std::vector<lite::tensor::Tensor *> &inputs,
                                                   const std::vector<lite::tensor::Tensor *> &outputs,
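All three kernels end Run() with the same epilogue, the output-side mirror of the convert-or-alias rule: an fp16 output was written in place, so only the fp32 case converts back and returns the scratch block to the allocator. A minimal sketch under the same assumptions as the earlier sketches (FinishFp16Output is hypothetical):

    // Convert back for fp32 outputs, then drop the pointer either way so a later
    // ReSize() cannot free tensor memory or double-free the scratch block.
    int FinishFp16Output(lite::tensor::Tensor *out, float16_t *&fp16_out, mindspore::lite::Allocator *allocator) {
      if (out->data_type() == kNumberTypeFloat32 || out->data_type() == kNumberTypeFloat) {
        auto *out_data = reinterpret_cast<float *>(out->Data());
        if (out_data == nullptr) {
          return RET_ERROR;
        }
        Float16ToFloat32(fp16_out, out_data, out->ElementsNum());
        allocator->Free(fp16_out);
      }
      fp16_out = nullptr;  // was either freed scratch or aliased tensor memory
      return RET_OK;
    }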
@@ -30,16 +30,7 @@ class TransposeFp16CPUKernel : public LiteKernel {
                         const std::vector<lite::tensor::Tensor *> &outputs, const lite::Context *ctx,
                         const lite::Primitive *primitive)
       : LiteKernel(param, inputs, outputs, ctx, primitive), thread_num_(ctx->thread_num_) {}
-  ~TransposeFp16CPUKernel() {
-    if (fp16_in_data_ != nullptr) {
-      free(fp16_in_data_);
-      fp16_in_data_ = nullptr;
-    }
-    if (fp16_out_data_ != nullptr) {
-      free(fp16_out_data_);
-      fp16_out_data_ = nullptr;
-    }
-  }
+  ~TransposeFp16CPUKernel() = default;
 
   int Init() override;
   int ReSize() override;