From: @hangangqiang Reviewed-by: @zhang_xue_tong,@zhanghaibo5 Signed-off-by: @zhang_xue_tong pull/14152/MERGE
| @@ -474,6 +474,25 @@ void PackNCHWFp32ToNC8HW8Fp16(float *src, float16_t *dst, int batch, int plane, | |||||
| } | } | ||||
| } | } | ||||
| void PackNCHWFp16ToNC8HW8Fp16(float16_t *src, float16_t *dst, int batch, int plane, int channel) { | |||||
| int c8 = UP_DIV(channel, C8NUM); | |||||
| for (int b = 0; b < batch; b++) { | |||||
| int src_offset = b * plane * channel; | |||||
| int dst_offset = b * plane * c8 * C8NUM; | |||||
| for (int c = 0; c < channel; c++) { | |||||
| int c8_block_num = c / C8NUM; | |||||
| int c8_block_rem = c % C8NUM; | |||||
| int src_c_offset = src_offset + c * plane; | |||||
| int dst_c_offset = dst_offset + c8_block_num * plane * C8NUM; | |||||
| for (int k = 0; k < plane; k++) { | |||||
| int src_kernel_offset = src_c_offset + k; | |||||
| int dst_kernel_offset = dst_c_offset + C8NUM * k + c8_block_rem; | |||||
| (dst + dst_kernel_offset)[0] = (float16_t)(src + src_kernel_offset)[0]; | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| void PackNHWCFp32ToNHWC8Fp16(float *src, float16_t *dst, int batch, int plane, int channel) { | void PackNHWCFp32ToNHWC8Fp16(float *src, float16_t *dst, int batch, int plane, int channel) { | ||||
| int c8_channel = UP_DIV(channel, C8NUM) * C8NUM; | int c8_channel = UP_DIV(channel, C8NUM) * C8NUM; | ||||
| for (int b = 0; b < batch; b++) { | for (int b = 0; b < batch; b++) { | ||||
| @@ -504,6 +523,21 @@ void PackNHWCFp32ToC8HWN8Fp16(float *src, float16_t *dst, int batch, int plane, | |||||
| return; | return; | ||||
| } | } | ||||
| void PackNHWCFp16ToC8HWN8Fp16(float16_t *src, float16_t *dst, int batch, int plane, int channel) { | |||||
| for (int n = 0; n < batch; n++) { | |||||
| for (int hw = 0; hw < plane; hw++) { | |||||
| for (int c = 0; c < channel; c++) { | |||||
| int c8div = c / C8NUM; | |||||
| int c8mod = c % C8NUM; | |||||
| int src_index = n * plane * channel + hw * channel + c; | |||||
| int dst_index = c8div * batch * plane * C8NUM + hw * batch * C8NUM + n * C8NUM + c8mod; | |||||
| dst[dst_index] = src[src_index]; | |||||
| } | |||||
| } | |||||
| } | |||||
| return; | |||||
| } | |||||
| void PackNHWC8Fp16ToNHWCFp32(float16_t *src, float *dst, int batch, int plane, int channel) { | void PackNHWC8Fp16ToNHWCFp32(float16_t *src, float *dst, int batch, int plane, int channel) { | ||||
| int c8_channel = UP_DIV(channel, C8NUM) * C8NUM; | int c8_channel = UP_DIV(channel, C8NUM) * C8NUM; | ||||
| for (int b = 0; b < batch; b++) { | for (int b = 0; b < batch; b++) { | ||||
| @@ -61,10 +61,14 @@ void PackNC8HW8ToNHWCFp16(const void *src, void *dst, int batch, int plane, int | |||||
| void PackNCHWFp32ToNC8HW8Fp16(float *src, float16_t *dst, int batch, int plane, int channel); | void PackNCHWFp32ToNC8HW8Fp16(float *src, float16_t *dst, int batch, int plane, int channel); | ||||
| void PackNCHWFp16ToNC8HW8Fp16(float16_t *src, float16_t *dst, int batch, int plane, int channel); | |||||
| void PackNHWCFp32ToNHWC8Fp16(float *src, float16_t *dst, int batch, int plane, int channel); | void PackNHWCFp32ToNHWC8Fp16(float *src, float16_t *dst, int batch, int plane, int channel); | ||||
| void PackNHWCFp32ToC8HWN8Fp16(float *src, float16_t *dst, int batch, int plane, int channel); | void PackNHWCFp32ToC8HWN8Fp16(float *src, float16_t *dst, int batch, int plane, int channel); | ||||
| void PackNHWCFp16ToC8HWN8Fp16(float16_t *src, float16_t *dst, int batch, int plane, int channel); | |||||
| void PackNHWC8Fp16ToNHWCFp32(float16_t *src, float *dst, int batch, int plane, int channel); | void PackNHWC8Fp16ToNHWCFp32(float16_t *src, float *dst, int batch, int plane, int channel); | ||||
| void PackNHWC8ToNHWCFp16(float16_t *src, float16_t *dst, int batch, int plane, int channel); | void PackNHWC8ToNHWCFp16(float16_t *src, float16_t *dst, int batch, int plane, int channel); | ||||
| @@ -30,9 +30,6 @@ int QuantDtypeCastInferShape(const TensorC *const *inputs, size_t inputs_size, T | |||||
| TensorC *output = outputs[0]; | TensorC *output = outputs[0]; | ||||
| QuantDtypeCastParameter *param = (QuantDtypeCastParameter *)parameter; | QuantDtypeCastParameter *param = (QuantDtypeCastParameter *)parameter; | ||||
| if (input->data_type_ != param->srcT_) { | |||||
| return NNACL_ERR; | |||||
| } | |||||
| output->data_type_ = param->dstT_; | output->data_type_ = param->dstT_; | ||||
| output->format_ = input->format_; | output->format_ = input->format_; | ||||
| if (!parameter->infer_flag_) { | if (!parameter->infer_flag_) { | ||||
| @@ -24,7 +24,7 @@ extern "C" { | |||||
| typedef struct QuantDtypeCastParameter { | typedef struct QuantDtypeCastParameter { | ||||
| OpParameter op_parameter_; | OpParameter op_parameter_; | ||||
| int srcT_; | |||||
| int srcT_; // deprecated | |||||
| int dstT_; | int dstT_; | ||||
| } QuantDtypeCastParameter; | } QuantDtypeCastParameter; | ||||
| @@ -17,6 +17,7 @@ | |||||
| #include "src/inner_context.h" | #include "src/inner_context.h" | ||||
| #include "include/errorcode.h" | #include "include/errorcode.h" | ||||
| #include "src/common/log_adapter.h" | #include "src/common/log_adapter.h" | ||||
| #include "src/common/utils.h" | |||||
| #ifdef SUPPORT_NPU | #ifdef SUPPORT_NPU | ||||
| #include "src/runtime/agent/npu/npu_manager.h" | #include "src/runtime/agent/npu/npu_manager.h" | ||||
| #endif | #endif | ||||
| @@ -85,18 +86,18 @@ int InnerContext::IsValid() const { | |||||
| MS_LOG(ERROR) << "Device list is empty."; | MS_LOG(ERROR) << "Device list is empty."; | ||||
| return RET_NOT_SUPPORT; | return RET_NOT_SUPPORT; | ||||
| } | } | ||||
| if (!IsCpuEnabled()) { | |||||
| MS_LOG(ERROR) << "CPU is not supported."; | |||||
| if (!IsUserSetCpu()) { | |||||
| MS_LOG(ERROR) << "CPU context should be set."; | |||||
| return RET_NOT_SUPPORT; | return RET_NOT_SUPPORT; | ||||
| } | } | ||||
| #ifndef SUPPORT_GPU | #ifndef SUPPORT_GPU | ||||
| if (IsGpuEnabled()) { | |||||
| if (IsUserSetGpu()) { | |||||
| MS_LOG(ERROR) << "GPU is not supported."; | MS_LOG(ERROR) << "GPU is not supported."; | ||||
| return RET_NOT_SUPPORT; | return RET_NOT_SUPPORT; | ||||
| } | } | ||||
| #endif | #endif | ||||
| #ifndef SUPPORT_NPU | #ifndef SUPPORT_NPU | ||||
| if (IsNpuEnabled()) { | |||||
| if (IsUserSetNpu()) { | |||||
| MS_LOG(ERROR) << "NPU is not supported."; | MS_LOG(ERROR) << "NPU is not supported."; | ||||
| return RET_NOT_SUPPORT; | return RET_NOT_SUPPORT; | ||||
| } | } | ||||
| @@ -108,6 +109,9 @@ bool InnerContext::IsCpuFloat16Enabled() const { | |||||
| if (!IsCpuEnabled()) { | if (!IsCpuEnabled()) { | ||||
| return false; | return false; | ||||
| } | } | ||||
| if (!IsSupportFloat16()) { | |||||
| return false; | |||||
| } | |||||
| return GetCpuInfo().enable_float16_; | return GetCpuInfo().enable_float16_; | ||||
| } | } | ||||
| @@ -115,31 +119,47 @@ bool InnerContext::IsGpuFloat16Enabled() const { | |||||
| if (!IsGpuEnabled()) { | if (!IsGpuEnabled()) { | ||||
| return false; | return false; | ||||
| } | } | ||||
| if (!IsSupportFloat16()) { | |||||
| return false; | |||||
| } | |||||
| return GetGpuInfo().enable_float16_; | return GetGpuInfo().enable_float16_; | ||||
| } | } | ||||
| bool InnerContext::IsCpuEnabled() const { | |||||
| bool InnerContext::IsCpuEnabled() const { return IsUserSetCpu(); } | |||||
| bool InnerContext::IsGpuEnabled() const { | |||||
| #ifdef SUPPORT_GPU | |||||
| return IsUserSetGpu(); | |||||
| #else | |||||
| return false; | |||||
| #endif | |||||
| } | |||||
| bool InnerContext::IsNpuEnabled() const { | |||||
| #ifdef SUPPORT_NPU | |||||
| MS_ASSERT(npu_manager_ != nullptr); | |||||
| return IsUserSetNpu() && npu_manager_->IsSupportNPU(); | |||||
| #else | |||||
| return false; | |||||
| #endif | |||||
| } | |||||
| bool InnerContext::IsUserSetCpu() const { | |||||
| return this->device_list_.end() != | return this->device_list_.end() != | ||||
| std::find_if(this->device_list_.begin(), this->device_list_.end(), | std::find_if(this->device_list_.begin(), this->device_list_.end(), | ||||
| [](const DeviceContext &device) { return device.device_type_ == DT_CPU; }); | [](const DeviceContext &device) { return device.device_type_ == DT_CPU; }); | ||||
| } | } | ||||
| bool InnerContext::IsGpuEnabled() const { | |||||
| bool InnerContext::IsUserSetGpu() const { | |||||
| return this->device_list_.end() != | return this->device_list_.end() != | ||||
| std::find_if(this->device_list_.begin(), this->device_list_.end(), | std::find_if(this->device_list_.begin(), this->device_list_.end(), | ||||
| [](const DeviceContext &device) { return device.device_type_ == DT_GPU; }); | [](const DeviceContext &device) { return device.device_type_ == DT_GPU; }); | ||||
| } | } | ||||
| bool InnerContext::IsNpuEnabled() const { | |||||
| #ifdef SUPPORT_NPU | |||||
| MS_ASSERT(npu_manager_ != nullptr); | |||||
| bool InnerContext::IsUserSetNpu() const { | |||||
| return this->device_list_.end() != | return this->device_list_.end() != | ||||
| std::find_if(this->device_list_.begin(), this->device_list_.end(), | |||||
| [](const DeviceContext &device) { return device.device_type_ == DT_NPU; }) && | |||||
| npu_manager_->IsSupportNPU(); | |||||
| #else | |||||
| return false; | |||||
| #endif | |||||
| std::find_if(this->device_list_.begin(), this->device_list_.end(), | |||||
| [](const DeviceContext &device) { return device.device_type_ == DT_NPU; }); | |||||
| } | } | ||||
| CpuDeviceInfo InnerContext::GetCpuInfo() const { | CpuDeviceInfo InnerContext::GetCpuInfo() const { | ||||
| @@ -58,6 +58,13 @@ struct InnerContext : public Context { | |||||
| virtual ~InnerContext(); | virtual ~InnerContext(); | ||||
| private: | |||||
| bool IsUserSetCpu() const; | |||||
| bool IsUserSetGpu() const; | |||||
| bool IsUserSetNpu() const; | |||||
| #if SUPPORT_NPU | #if SUPPORT_NPU | ||||
| private: | private: | ||||
| @@ -44,48 +44,12 @@ int QuantDTypeCastCPUKernel::Init() { | |||||
| MS_ASSERT(out_tensor); | MS_ASSERT(out_tensor); | ||||
| auto param = reinterpret_cast<QuantDTypeCastParameter *>(op_parameter_); | auto param = reinterpret_cast<QuantDTypeCastParameter *>(op_parameter_); | ||||
| MS_ASSERT(param); | MS_ASSERT(param); | ||||
| if (param->srcT == kNumberTypeFloat32 && param->dstT == kNumberTypeInt8) { | |||||
| if (in_tensor->data_type() != kNumberTypeFloat32 || out_tensor->data_type() != kNumberTypeInt8) { | |||||
| MS_LOG(ERROR) << "param data type and tensor data type do not match."; | |||||
| return RET_ERROR; | |||||
| } | |||||
| } else if (param->srcT == kNumberTypeInt8 && param->dstT == kNumberTypeFloat32) { | |||||
| if (in_tensor->data_type() != kNumberTypeInt8 || out_tensor->data_type() != kNumberTypeFloat32) { | |||||
| MS_LOG(ERROR) << "param data type and tensor data type do not match."; | |||||
| return RET_ERROR; | |||||
| } | |||||
| } else if (param->srcT == kNumberTypeUInt8 && param->dstT == kNumberTypeInt8) { | |||||
| if (in_tensor->data_type() != kNumberTypeUInt8 || out_tensor->data_type() != kNumberTypeInt8) { | |||||
| MS_LOG(ERROR) << "param data type and tensor data type do not match."; | |||||
| return RET_ERROR; | |||||
| } | |||||
| } else if (param->srcT == kNumberTypeInt8 && param->dstT == kNumberTypeInt8) { | |||||
| if (in_tensor->data_type() != kNumberTypeInt8 || out_tensor->data_type() != kNumberTypeInt8) { | |||||
| MS_LOG(ERROR) << "param data type and tensor data type do not match."; | |||||
| return RET_ERROR; | |||||
| } | |||||
| } else if (param->srcT == kNumberTypeInt8 && param->dstT == kNumberTypeUInt8) { | |||||
| if (in_tensor->data_type() != kNumberTypeInt8 || out_tensor->data_type() != kNumberTypeUInt8) { | |||||
| MS_LOG(ERROR) << "param data type and tensor data type do not match."; | |||||
| return RET_ERROR; | |||||
| } | |||||
| } else if (param->srcT == kNumberTypeUInt8 && param->dstT == kNumberTypeFloat32) { | |||||
| if (in_tensor->data_type() != kNumberTypeUInt8 || out_tensor->data_type() != kNumberTypeFloat32) { | |||||
| MS_LOG(ERROR) << "param data type and tensor data type do not match."; | |||||
| return RET_ERROR; | |||||
| } | |||||
| } else if (param->srcT == kNumberTypeFloat32 && param->dstT == kNumberTypeUInt8) { | |||||
| if (in_tensor->data_type() != kNumberTypeFloat32 || out_tensor->data_type() != kNumberTypeUInt8) { | |||||
| MS_LOG(ERROR) << "param data type and tensor data type do not match."; | |||||
| return RET_ERROR; | |||||
| } | |||||
| } else { | |||||
| MS_LOG(ERROR) << "param data type not supported:" | |||||
| << " src: " << param->srcT << " dst: " << param->dstT; | |||||
| return RET_PARAM_INVALID; | |||||
| } | |||||
| src_dtype = param->srcT; | |||||
| src_dtype = in_tensor->data_type(); | |||||
| dst_dtype = param->dstT; | dst_dtype = param->dstT; | ||||
| if (out_tensor->data_type() != dst_dtype) { | |||||
| MS_LOG(ERROR) << "param data type and tensor data type do not match."; | |||||
| return RET_ERROR; | |||||
| } | |||||
| if (!InferShapeDone()) { | if (!InferShapeDone()) { | ||||
| return RET_OK; | return RET_OK; | ||||
| @@ -149,6 +113,10 @@ int QuantDTypeCastCPUKernel::QuantDTypeCast(int task_id) { | |||||
| ret = DoQuantizeFp32ToInt8(float32_ptr_ + thread_offset, int8_out_ptr_ + thread_offset, output_quant_arg.scale, | ret = DoQuantizeFp32ToInt8(float32_ptr_ + thread_offset, int8_out_ptr_ + thread_offset, output_quant_arg.scale, | ||||
| output_quant_arg.zeroPoint, num_unit_thread, from_uint8_src); | output_quant_arg.zeroPoint, num_unit_thread, from_uint8_src); | ||||
| } | } | ||||
| } else { | |||||
| MS_LOG(ERROR) << "param data type not supported:" | |||||
| << " src: " << src_dtype << " dst: " << dst_dtype; | |||||
| return RET_PARAM_INVALID; | |||||
| } | } | ||||
| if (ret != RET_OK) { | if (ret != RET_OK) { | ||||
| @@ -47,7 +47,7 @@ int ConvolutionDepthwiseFp16CPUKernel::InitWeightBias() { | |||||
| MS_LOG(ERROR) << "get execute filter data failed."; | MS_LOG(ERROR) << "get execute filter data failed."; | ||||
| return ret; | return ret; | ||||
| } | } | ||||
| PackNCHWToNHWCFp16(fp16_weight_, packed_weight_, 1, weight_tensor->Height() * weight_tensor->Width(), | |||||
| PackNCHWToNHWCFp16(execute_weight_, packed_weight_, 1, weight_tensor->Height() * weight_tensor->Width(), | |||||
| weight_tensor->Batch()); | weight_tensor->Batch()); | ||||
| if (fp16_weight_ != nullptr) { | if (fp16_weight_ != nullptr) { | ||||
| free(fp16_weight_); | free(fp16_weight_); | ||||
| @@ -64,7 +64,7 @@ int ConvolutionDepthwiseFp16CPUKernel::InitWeightBias() { | |||||
| if (in_tensors_.size() == kInputSize2) { | if (in_tensors_.size() == kInputSize2) { | ||||
| auto bias_tensor = in_tensors_.at(kBiasIndex); | auto bias_tensor = in_tensors_.at(kBiasIndex); | ||||
| MS_ASSERT(origin_bias_); | MS_ASSERT(origin_bias_); | ||||
| auto ori_bias = reinterpret_cast<float *>(origin_bias_); | |||||
| auto ori_bias = reinterpret_cast<float16_t *>(origin_bias_); | |||||
| for (int i = 0; i < bias_tensor->ElementsNum(); i++) { | for (int i = 0; i < bias_tensor->ElementsNum(); i++) { | ||||
| bias_fp16[i] = (float16_t)ori_bias[i]; | bias_fp16[i] = (float16_t)ori_bias[i]; | ||||
| } | } | ||||
| @@ -68,7 +68,7 @@ int ConvolutionDepthwiseSWFp16CPUKernel::InitWeightBias() { | |||||
| MS_LOG(ERROR) << "Malloc buffer failed."; | MS_LOG(ERROR) << "Malloc buffer failed."; | ||||
| return RET_ERROR; | return RET_ERROR; | ||||
| } | } | ||||
| PackNCHWFp32ToNC8HW8Fp16(reinterpret_cast<float *>(origin_weight_), packed_weight_, 1, | |||||
| PackNCHWFp16ToNC8HW8Fp16(reinterpret_cast<float16_t *>(origin_weight_), packed_weight_, 1, | |||||
| weight_tensor->Height() * weight_tensor->Width(), weight_tensor->Batch()); | weight_tensor->Height() * weight_tensor->Width(), weight_tensor->Batch()); | ||||
| bias_data_ = reinterpret_cast<float16_t *>(malloc(C8NUM * OC8 * sizeof(float16_t))); | bias_data_ = reinterpret_cast<float16_t *>(malloc(C8NUM * OC8 * sizeof(float16_t))); | ||||
| @@ -81,7 +81,7 @@ int ConvolutionDepthwiseSWFp16CPUKernel::InitWeightBias() { | |||||
| if (in_tensors_.size() == kInputSize2) { | if (in_tensors_.size() == kInputSize2) { | ||||
| auto bias_tensor = in_tensors_.at(kBiasIndex); | auto bias_tensor = in_tensors_.at(kBiasIndex); | ||||
| MS_ASSERT(origin_bias_); | MS_ASSERT(origin_bias_); | ||||
| auto ori_bias = reinterpret_cast<float *>(origin_bias_); | |||||
| auto ori_bias = reinterpret_cast<float16_t *>(origin_bias_); | |||||
| for (int i = 0; i < bias_tensor->ElementsNum(); i++) { | for (int i = 0; i < bias_tensor->ElementsNum(); i++) { | ||||
| bias_fp16[i] = (float16_t)ori_bias[i]; | bias_fp16[i] = (float16_t)ori_bias[i]; | ||||
| } | } | ||||
| @@ -73,7 +73,7 @@ int DeconvolutionDepthwiseFp16CPUKernel::InitWeightBias() { | |||||
| // init weight: o, h, w, i; o == group, i == 1 | // init weight: o, h, w, i; o == group, i == 1 | ||||
| auto weight_tensor = in_tensors_.at(kWeightIndex); | auto weight_tensor = in_tensors_.at(kWeightIndex); | ||||
| int OC8 = UP_DIV(weight_tensor->Batch(), C8NUM); | int OC8 = UP_DIV(weight_tensor->Batch(), C8NUM); | ||||
| auto origin_weight = reinterpret_cast<float *>(weight_tensor->MutableData()); | |||||
| auto origin_weight = reinterpret_cast<float16_t *>(weight_tensor->MutableData()); | |||||
| int pack_weight_size = C8NUM * OC8 * weight_tensor->Height() * weight_tensor->Width(); | int pack_weight_size = C8NUM * OC8 * weight_tensor->Height() * weight_tensor->Width(); | ||||
| packed_weight_ = reinterpret_cast<float16_t *>(malloc(pack_weight_size * sizeof(float16_t))); | packed_weight_ = reinterpret_cast<float16_t *>(malloc(pack_weight_size * sizeof(float16_t))); | ||||
| @@ -81,7 +81,7 @@ int DeconvolutionDepthwiseFp16CPUKernel::InitWeightBias() { | |||||
| MS_LOG(ERROR) << "Malloc buffer failed."; | MS_LOG(ERROR) << "Malloc buffer failed."; | ||||
| return RET_ERROR; | return RET_ERROR; | ||||
| } | } | ||||
| PackNCHWFp32ToNC8HW8Fp16(origin_weight, packed_weight_, 1, weight_tensor->Height() * weight_tensor->Width(), | |||||
| PackNCHWFp16ToNC8HW8Fp16(origin_weight, packed_weight_, 1, weight_tensor->Height() * weight_tensor->Width(), | |||||
| weight_tensor->Batch()); | weight_tensor->Batch()); | ||||
| bias_data_ = reinterpret_cast<float16_t *>(malloc(C8NUM * OC8 * sizeof(float16_t))); | bias_data_ = reinterpret_cast<float16_t *>(malloc(C8NUM * OC8 * sizeof(float16_t))); | ||||
| @@ -92,9 +92,9 @@ int DeconvolutionDepthwiseFp16CPUKernel::InitWeightBias() { | |||||
| memset(bias_data_, 0, C8NUM * OC8 * sizeof(float16_t)); | memset(bias_data_, 0, C8NUM * OC8 * sizeof(float16_t)); | ||||
| if (in_tensors_.size() == kInputSize2) { | if (in_tensors_.size() == kInputSize2) { | ||||
| auto bias_tensor = in_tensors_.at(kBiasIndex); | auto bias_tensor = in_tensors_.at(kBiasIndex); | ||||
| auto ori_bias = reinterpret_cast<float *>(bias_tensor->MutableData()); | |||||
| auto ori_bias = reinterpret_cast<float16_t *>(bias_tensor->MutableData()); | |||||
| for (int i = 0; i < bias_tensor->ElementsNum(); i++) { | for (int i = 0; i < bias_tensor->ElementsNum(); i++) { | ||||
| reinterpret_cast<float *>(bias_data_)[i] = (float16_t)ori_bias[i]; | |||||
| reinterpret_cast<float16_t *>(bias_data_)[i] = ori_bias[i]; | |||||
| } | } | ||||
| } | } | ||||
| @@ -57,7 +57,8 @@ int DeConvolutionFp16CPUKernel::InitWeightBias() { | |||||
| auto kernel_h = weight_tensor->Height(); | auto kernel_h = weight_tensor->Height(); | ||||
| auto kernel_w = weight_tensor->Width(); | auto kernel_w = weight_tensor->Width(); | ||||
| bias_data_ = malloc(UP_ROUND(output_channel, C4NUM) * sizeof(float16_t)); | |||||
| auto bias_size = UP_ROUND(output_channel, C4NUM) * sizeof(float16_t); | |||||
| bias_data_ = malloc(bias_size); | |||||
| if (bias_data_ == nullptr) { | if (bias_data_ == nullptr) { | ||||
| MS_LOG(ERROR) << "deconv malloc bias_data_ error!"; | MS_LOG(ERROR) << "deconv malloc bias_data_ error!"; | ||||
| return RET_ERROR; | return RET_ERROR; | ||||
| @@ -65,8 +66,15 @@ int DeConvolutionFp16CPUKernel::InitWeightBias() { | |||||
| memset(bias_data_, 0, UP_ROUND(output_channel, C4NUM) * sizeof(float16_t)); | memset(bias_data_, 0, UP_ROUND(output_channel, C4NUM) * sizeof(float16_t)); | ||||
| if (in_tensors_.size() == 3 && in_tensors_.at(kBiasIndex)->shape().size() == 1 && | if (in_tensors_.size() == 3 && in_tensors_.at(kBiasIndex)->shape().size() == 1 && | ||||
| in_tensors_.at(kBiasIndex)->DimensionSize(0) == output_channel) { | in_tensors_.at(kBiasIndex)->DimensionSize(0) == output_channel) { | ||||
| Float32ToFloat16(reinterpret_cast<float *>(in_tensors_.at(2)->MutableData()), | |||||
| reinterpret_cast<float16_t *>(bias_data_), output_channel); | |||||
| if (in_tensors_.at(2)->data_type() != kNumberTypeFloat16) { | |||||
| MS_LOG(ERROR) << "deconv fp16 kernel require fp16 bias"; | |||||
| return RET_ERROR; | |||||
| } | |||||
| if (bias_size != in_tensors_.at(2)->Size()) { | |||||
| MS_LOG(ERROR) << "input bias size not match : " << bias_size << " vs " << in_tensors_.at(2)->Size(); | |||||
| return RET_ERROR; | |||||
| } | |||||
| memcpy(bias_data_, in_tensors_.at(2)->data_c(), bias_size); | |||||
| } | } | ||||
| size_t weight_pack_size = input_channel * kernel_w * kernel_h * UP_ROUND(output_channel, C8NUM) * sizeof(float16_t); | size_t weight_pack_size = input_channel * kernel_w * kernel_h * UP_ROUND(output_channel, C8NUM) * sizeof(float16_t); | ||||
| @@ -76,7 +84,11 @@ int DeConvolutionFp16CPUKernel::InitWeightBias() { | |||||
| return RET_ERROR; | return RET_ERROR; | ||||
| } | } | ||||
| memset(execute_weight_, 0, weight_pack_size); | memset(execute_weight_, 0, weight_pack_size); | ||||
| PackNHWCFp32ToC8HWN8Fp16(reinterpret_cast<float *>(in_tensors_.at(1)->MutableData()), execute_weight_, input_channel, | |||||
| if (in_tensors_.at(1)->data_type() != kNumberTypeFloat16) { | |||||
| MS_LOG(ERROR) << "deconv fp16 kernel require fp16 weight"; | |||||
| return RET_ERROR; | |||||
| } | |||||
| PackNHWCFp16ToC8HWN8Fp16(reinterpret_cast<float16_t *>(in_tensors_.at(1)->data_c()), execute_weight_, input_channel, | |||||
| kernel_w * kernel_h, output_channel); | kernel_w * kernel_h, output_channel); | ||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| @@ -341,7 +341,7 @@ int DeConvWinogradFp16CPUKernel::InitDataParam() { | |||||
| auto fp16_bias_data = reinterpret_cast<float16_t *>(bias_data_); | auto fp16_bias_data = reinterpret_cast<float16_t *>(bias_data_); | ||||
| if (in_tensors_.size() == 3 && in_tensors_.at(kBiasIndex)->shape().size() == 1 && | if (in_tensors_.size() == 3 && in_tensors_.at(kBiasIndex)->shape().size() == 1 && | ||||
| in_tensors_.at(kBiasIndex)->DimensionSize(0) == conv_param_->output_channel_) { | in_tensors_.at(kBiasIndex)->DimensionSize(0) == conv_param_->output_channel_) { | ||||
| auto src_bias = reinterpret_cast<float *>(in_tensors_.at(kBiasIndex)->MutableData()); | |||||
| auto src_bias = reinterpret_cast<float16_t *>(in_tensors_.at(kBiasIndex)->MutableData()); | |||||
| MS_ASSERT(src_bias); | MS_ASSERT(src_bias); | ||||
| for (int i = 0; i < conv_param_->output_channel_; ++i) { | for (int i = 0; i < conv_param_->output_channel_; ++i) { | ||||
| fp16_bias_data[i] = (float16_t)src_bias[i]; | fp16_bias_data[i] = (float16_t)src_bias[i]; | ||||
| @@ -250,6 +250,9 @@ int CopyConstTensor(Tensor *tensor, std::map<Tensor *, Tensor *> *restored_origi | |||||
| return RET_ERROR; | return RET_ERROR; | ||||
| #endif | #endif | ||||
| } else { | } else { | ||||
| if (tensor->own_data()) { | |||||
| return RET_OK; | |||||
| } | |||||
| tensor->set_data(nullptr); | tensor->set_data(nullptr); | ||||
| auto ret = tensor->MallocData(); | auto ret = tensor->MallocData(); | ||||
| if (RET_OK != ret) { | if (RET_OK != ret) { | ||||
| @@ -264,8 +267,18 @@ int CopyConstTensor(Tensor *tensor, std::map<Tensor *, Tensor *> *restored_origi | |||||
| } | } | ||||
| #endif | #endif | ||||
| inline void RestoreTensorData(const std::map<Tensor *, Tensor *> &restored_origin_tensors) { | |||||
| for (auto &restored_origin_tensor : restored_origin_tensors) { | |||||
| inline void FreeRestoreTensors(std::map<Tensor *, Tensor *> *restored_origin_tensors) { | |||||
| MS_ASSERT(restored_origin_tensors != nullptr); | |||||
| for (auto &restored_origin_tensor : *restored_origin_tensors) { | |||||
| restored_origin_tensor.second->set_data(nullptr); | |||||
| delete (restored_origin_tensor.second); | |||||
| } | |||||
| restored_origin_tensors->clear(); | |||||
| } | |||||
| inline void RestoreTensorData(std::map<Tensor *, Tensor *> *restored_origin_tensors) { | |||||
| MS_ASSERT(restored_origin_tensors != nullptr); | |||||
| for (auto &restored_origin_tensor : *restored_origin_tensors) { | |||||
| auto *origin_tensor = restored_origin_tensor.first; | auto *origin_tensor = restored_origin_tensor.first; | ||||
| auto *restored_tensor = restored_origin_tensor.second; | auto *restored_tensor = restored_origin_tensor.second; | ||||
| MS_ASSERT(origin_tensor != nullptr); | MS_ASSERT(origin_tensor != nullptr); | ||||
| @@ -275,15 +288,7 @@ inline void RestoreTensorData(const std::map<Tensor *, Tensor *> &restored_origi | |||||
| origin_tensor->set_data(restored_tensor->data_c()); | origin_tensor->set_data(restored_tensor->data_c()); | ||||
| origin_tensor->set_own_data(restored_tensor->own_data()); | origin_tensor->set_own_data(restored_tensor->own_data()); | ||||
| } | } | ||||
| } | |||||
| inline void FreeRestoreTensors(std::map<Tensor *, Tensor *> *restored_origin_tensors) { | |||||
| MS_ASSERT(restored_origin_tensors != nullptr); | |||||
| for (auto &restored_origin_tensor : *restored_origin_tensors) { | |||||
| restored_origin_tensor.second->set_data(nullptr); | |||||
| delete (restored_origin_tensor.second); | |||||
| } | |||||
| restored_origin_tensors->clear(); | |||||
| FreeRestoreTensors(restored_origin_tensors); | |||||
| } | } | ||||
| inline bool IsChannelFirst(int index, OpParameter *op_parameter) { | inline bool IsChannelFirst(int index, OpParameter *op_parameter) { | ||||
| @@ -308,54 +313,54 @@ kernel::LiteKernel *Scheduler::FindCpuKernel(const std::vector<Tensor *> &in_ten | |||||
| if (!KernelRegistry::GetInstance()->SupportKernel(desc)) { | if (!KernelRegistry::GetInstance()->SupportKernel(desc)) { | ||||
| return nullptr; | return nullptr; | ||||
| } | } | ||||
| kernel::KernelKey cpu_desc = desc; | |||||
| if (kernel_data_type == kNumberTypeFloat16) { | |||||
| if (!context_->IsCpuFloat16Enabled() || | |||||
| (cpu_desc.data_type != kNumberTypeFloat32 && cpu_desc.data_type != kNumberTypeFloat16)) { | |||||
| return nullptr; | |||||
| } | |||||
| cpu_desc.data_type = kNumberTypeFloat16; | |||||
| } | |||||
| std::map<Tensor *, Tensor *> restored_origin_tensors; | std::map<Tensor *, Tensor *> restored_origin_tensors; | ||||
| int index = 0; | int index = 0; | ||||
| for (auto &tensor : in_tensors) { | for (auto &tensor : in_tensors) { | ||||
| auto channel_first = IsChannelFirst(index++, op_parameter); | auto channel_first = IsChannelFirst(index++, op_parameter); | ||||
| auto *restore_tensor = DequantUtil::DequantTensor(tensor, desc.data_type, channel_first, kernel_data_type); | |||||
| auto *restore_tensor = DequantUtil::DequantTensor(tensor, cpu_desc.data_type, channel_first, kernel_data_type); | |||||
| if (restore_tensor != nullptr) { | if (restore_tensor != nullptr) { | ||||
| restored_origin_tensors[tensor] = restore_tensor; | restored_origin_tensors[tensor] = restore_tensor; | ||||
| } else { | } else { | ||||
| #ifndef SUPPORT_TRAIN | #ifndef SUPPORT_TRAIN | ||||
| if (!IsPackedOp(op_type) && !tensor->own_data()) { // && op_type != schema::PrimitiveType_LSTM | |||||
| auto ret = CopyConstTensor(tensor, &restored_origin_tensors, kernel_data_type); | |||||
| if (ret != RET_OK) { | |||||
| MS_LOG(DEBUG) << "CopyConstTensor failed: " << ret; | |||||
| return nullptr; | |||||
| } | |||||
| auto ret = CopyConstTensor(tensor, &restored_origin_tensors, kernel_data_type); | |||||
| if (ret != RET_OK) { | |||||
| MS_LOG(DEBUG) << "CopyConstTensor failed: " << ret; | |||||
| return nullptr; | |||||
| } | } | ||||
| #endif | #endif | ||||
| } | } | ||||
| } | } | ||||
| auto *kernel = KernelRegistry::GetInstance()->GetKernel(in_tensors, out_tensors, context_, desc, op_parameter); | |||||
| auto *kernel = KernelRegistry::GetInstance()->GetKernel(in_tensors, out_tensors, context_, cpu_desc, op_parameter); | |||||
| if (kernel != nullptr) { | if (kernel != nullptr) { | ||||
| MS_LOG(DEBUG) << "Get TypeId(" << kernel_data_type << ") op success: " << PrimitiveTypeName(op_type); | |||||
| MS_LOG(DEBUG) << "Get TypeId(" << kernel_data_type << ") op success: " << PrimitiveCurVersionTypeName(op_type); | |||||
| FreeRestoreTensors(&restored_origin_tensors); | FreeRestoreTensors(&restored_origin_tensors); | ||||
| } else { | } else { | ||||
| RestoreTensorData(restored_origin_tensors); | |||||
| RestoreTensorData(&restored_origin_tensors); | |||||
| } | } | ||||
| return kernel; | return kernel; | ||||
| } | |||||
| } // namespace mindspore::lite | |||||
| kernel::LiteKernel *Scheduler::FindBackendKernel(const std::vector<Tensor *> &in_tensors, | |||||
| const std::vector<Tensor *> &out_tensors, const Model::Node *node, | |||||
| TypeId prefer_data_type) { | |||||
| MS_ASSERT(node != nullptr); | |||||
| bool need_dequant = node->quant_type_ == schema::QuantType_WeightQuant; | |||||
| TypeId data_type = need_dequant ? kNumberTypeFloat32 : GetFirstFp32Fp16OrInt8Type(in_tensors); | |||||
| OpParameter *op_parameter = op_parameters_[node->output_indices_.at(0)]; | |||||
| if (op_parameter == nullptr) { | |||||
| MS_LOG(ERROR) << "Can not find OpParameter!type: " << PrimitiveTypeName(GetPrimitiveType(node->primitive_)); | |||||
| return nullptr; | |||||
| } | |||||
| bool infer_shape_interrupt = !op_parameter->infer_flag_; | |||||
| kernel::KernelKey desc{kCPU, data_type, static_cast<schema::PrimitiveType>(op_parameter->type_)}; | |||||
| #if SUPPORT_GPU | |||||
| kernel::LiteKernel *Scheduler::FindGpuKernel(const std::vector<Tensor *> &in_tensors, | |||||
| const std::vector<Tensor *> &out_tensors, OpParameter *op_parameter, | |||||
| const kernel::KernelKey &desc) { | |||||
| MS_ASSERT(op_parameter != nullptr); | |||||
| if (context_->IsGpuEnabled()) { | if (context_->IsGpuEnabled()) { | ||||
| // support more data type like int32 | // support more data type like int32 | ||||
| kernel::KernelKey gpu_desc{kGPU, kNumberTypeFloat32, desc.type}; | kernel::KernelKey gpu_desc{kGPU, kNumberTypeFloat32, desc.type}; | ||||
| if (context_->IsGpuFloat16Enabled()) gpu_desc.data_type = kNumberTypeFloat16; | |||||
| if (in_tensors.front()->data_type() == kNumberTypeInt8) gpu_desc.data_type = kNumberTypeInt8; | |||||
| if (context_->IsGpuFloat16Enabled()) { | |||||
| gpu_desc.data_type = kNumberTypeFloat16; | |||||
| } | |||||
| if (in_tensors.front()->data_type() == kNumberTypeInt8) { | |||||
| gpu_desc.data_type = kNumberTypeInt8; | |||||
| } | |||||
| // weight quant | // weight quant | ||||
| std::map<Tensor *, Tensor *> restored_origin_tensors; | std::map<Tensor *, Tensor *> restored_origin_tensors; | ||||
| @@ -370,36 +375,32 @@ kernel::LiteKernel *Scheduler::FindBackendKernel(const std::vector<Tensor *> &in | |||||
| auto *kernel = KernelRegistry::GetInstance()->GetKernel(in_tensors, out_tensors, context_, gpu_desc, op_parameter); | auto *kernel = KernelRegistry::GetInstance()->GetKernel(in_tensors, out_tensors, context_, gpu_desc, op_parameter); | ||||
| if (kernel != nullptr) { | if (kernel != nullptr) { | ||||
| MS_LOG(DEBUG) << "Get gpu op success: " << PrimitiveCurVersionTypeName(gpu_desc.type) << " " << node->name_; | |||||
| MS_LOG(DEBUG) << "Get gpu op success: " << PrimitiveCurVersionTypeName(gpu_desc.type); | |||||
| FreeRestoreTensors(&restored_origin_tensors); | FreeRestoreTensors(&restored_origin_tensors); | ||||
| return kernel; | |||||
| } else { | } else { | ||||
| MS_LOG(DEBUG) << "Get gpu op failed, scheduler to cpu: " << PrimitiveCurVersionTypeName(gpu_desc.type) << " " | |||||
| << node->name_; | |||||
| auto ret = InferNodeShape(node, &infer_shape_interrupt); | |||||
| if (ret == RET_INFER_INVALID || ret == RET_OK) { | |||||
| op_parameter = op_parameters_[node->output_indices_.at(0)]; | |||||
| } else { | |||||
| RestoreTensorData(restored_origin_tensors); | |||||
| MS_LOG(ERROR) << "Try repeat infer fail: " << node->name_; | |||||
| return nullptr; | |||||
| } | |||||
| MS_LOG(DEBUG) << "Get gpu op failed, scheduler to cpu: " << PrimitiveCurVersionTypeName(gpu_desc.type); | |||||
| RestoreTensorData(&restored_origin_tensors); | |||||
| } | } | ||||
| return kernel; | |||||
| } else { | |||||
| return nullptr; | |||||
| } | } | ||||
| #endif | |||||
| #if SUPPORT_NPU | |||||
| } | |||||
| kernel::LiteKernel *Scheduler::FindNpuKernel(const std::vector<Tensor *> &in_tensors, | |||||
| const std::vector<Tensor *> &out_tensors, OpParameter *op_parameter, | |||||
| const kernel::KernelKey &desc) { | |||||
| MS_ASSERT(op_parameter != nullptr); | |||||
| kernel::KernelKey npu_desc{kNPU, desc.data_type, desc.type}; | |||||
| if (context_->IsNpuEnabled()) { | if (context_->IsNpuEnabled()) { | ||||
| if (desc.data_type == kNumberTypeFloat16) { | |||||
| desc.data_type = kNumberTypeFloat32; | |||||
| if (npu_desc.data_type == kNumberTypeFloat16) { | |||||
| npu_desc.data_type = kNumberTypeFloat32; | |||||
| } | } | ||||
| for (auto tensor : in_tensors) { | for (auto tensor : in_tensors) { | ||||
| if (tensor->data_type() == kNumberTypeFloat16) { | if (tensor->data_type() == kNumberTypeFloat16) { | ||||
| tensor->set_data_type(kNumberTypeFloat32); | tensor->set_data_type(kNumberTypeFloat32); | ||||
| } | } | ||||
| } | } | ||||
| kernel::KernelKey npu_desc{kNPU, desc.data_type, desc.type}; | |||||
| // weight quant | |||||
| std::map<Tensor *, Tensor *> restored_origin_tensors; | std::map<Tensor *, Tensor *> restored_origin_tensors; | ||||
| for (auto &tensor : in_tensors) { | for (auto &tensor : in_tensors) { | ||||
| int index = 0; | int index = 0; | ||||
| @@ -411,33 +412,72 @@ kernel::LiteKernel *Scheduler::FindBackendKernel(const std::vector<Tensor *> &in | |||||
| } | } | ||||
| auto *kernel = KernelRegistry::GetInstance()->GetKernel(in_tensors, out_tensors, context_, npu_desc, op_parameter); | auto *kernel = KernelRegistry::GetInstance()->GetKernel(in_tensors, out_tensors, context_, npu_desc, op_parameter); | ||||
| if (kernel != nullptr) { | if (kernel != nullptr) { | ||||
| MS_LOG(DEBUG) << "Get npu op success: " << PrimitiveCurVersionTypeName(npu_desc.type) << " " << node->name_; | |||||
| FreeRestoreTensors(&restored_origin_tensors); | FreeRestoreTensors(&restored_origin_tensors); | ||||
| return kernel; | |||||
| MS_LOG(DEBUG) << "Get npu op success: " << PrimitiveCurVersionTypeName(npu_desc.type); | |||||
| } else { | } else { | ||||
| MS_LOG(DEBUG) << "Get npu op failed, scheduler to cpu: " << PrimitiveCurVersionTypeName(npu_desc.type) << " " | |||||
| << node->name_; | |||||
| RestoreTensorData(restored_origin_tensors); | |||||
| auto ret = InferNodeShape(node, &infer_shape_interrupt); | |||||
| if (ret == RET_INFER_INVALID || ret == RET_OK) { | |||||
| op_parameter = op_parameters_[node->output_indices_.at(0)]; | |||||
| } else { | |||||
| MS_LOG(ERROR) << "Try repeat infer fail: " << node->name_; | |||||
| return nullptr; | |||||
| } | |||||
| RestoreTensorData(&restored_origin_tensors); | |||||
| MS_LOG(DEBUG) << "Get npu op failed, scheduler to cpu: " << PrimitiveCurVersionTypeName(npu_desc.type); | |||||
| } | |||||
| return kernel; | |||||
| } else { | |||||
| return nullptr; | |||||
| } | |||||
| } | |||||
| kernel::LiteKernel *Scheduler::FindBackendKernel(const std::vector<Tensor *> &in_tensors, | |||||
| const std::vector<Tensor *> &out_tensors, const Model::Node *node, | |||||
| TypeId prefer_data_type) { | |||||
| MS_ASSERT(node != nullptr); | |||||
| // why we need this | |||||
| TypeId data_type = | |||||
| (node->quant_type_ == schema::QuantType_WeightQuant) ? kNumberTypeFloat32 : GetFirstFp32Fp16OrInt8Type(in_tensors); | |||||
| OpParameter *op_parameter = op_parameters_[node->output_indices_.at(0)]; | |||||
| if (op_parameter == nullptr) { | |||||
| MS_LOG(ERROR) << "Can not find OpParameter!type: " << PrimitiveTypeName(GetPrimitiveType(node->primitive_)); | |||||
| return nullptr; | |||||
| } | |||||
| bool infer_shape_interrupt = !op_parameter->infer_flag_; | |||||
| kernel::KernelKey desc{kCPU, data_type, static_cast<schema::PrimitiveType>(op_parameter->type_)}; | |||||
| kernel::LiteKernel *kernel = nullptr; | |||||
| #ifdef SUPPORT_GPU | |||||
| kernel = FindGpuKernel(in_tensors, out_tensors, op_parameter, desc); | |||||
| if (kernel != nullptr) { | |||||
| return kernel; | |||||
| } else { | |||||
| MS_LOG(DEBUG) << "Get gpu op failed, scheduler to cpu: " << PrimitiveCurVersionTypeName(desc.type) << " " | |||||
| << node->name_; | |||||
| auto ret = InferNodeShape(node, &infer_shape_interrupt); | |||||
| if (ret == RET_INFER_INVALID || ret == RET_OK) { | |||||
| op_parameter = op_parameters_[node->output_indices_.at(0)]; | |||||
| } else { | |||||
| MS_LOG(ERROR) << "Try repeat infer fail: " << node->name_; | |||||
| return nullptr; | |||||
| } | } | ||||
| } | } | ||||
| #endif | #endif | ||||
| if ((prefer_data_type == kNumberTypeFloat16 || prefer_data_type == kTypeUnknown) && | |||||
| mindspore::lite::IsSupportFloat16() && | |||||
| ((context_->IsCpuFloat16Enabled() && data_type == kNumberTypeFloat32) || data_type == kNumberTypeFloat16)) { | |||||
| kernel::KernelKey fp16_cpu_desc{desc.arch, kNumberTypeFloat16, desc.type}; | |||||
| auto kernel = FindCpuKernel(in_tensors, out_tensors, op_parameter, fp16_cpu_desc, kNumberTypeFloat16); | |||||
| #ifdef SUPPORT_NPU | |||||
| kernel = FindNpuKernel(in_tensors, out_tensors, op_parameter, desc); | |||||
| if (kernel != nullptr) { | |||||
| return kernel; | |||||
| } else { | |||||
| MS_LOG(DEBUG) << "Get npu op failed, scheduler to cpu: " << PrimitiveCurVersionTypeName(desc.type) << " " | |||||
| << node->name_; | |||||
| auto ret = InferNodeShape(node, &infer_shape_interrupt); | |||||
| if (ret == RET_INFER_INVALID || ret == RET_OK) { | |||||
| op_parameter = op_parameters_[node->output_indices_.at(0)]; | |||||
| } else { | |||||
| MS_LOG(ERROR) << "Try repeat infer fail: " << node->name_; | |||||
| return nullptr; | |||||
| } | |||||
| } | |||||
| #endif | |||||
| if (prefer_data_type == kNumberTypeFloat16 || prefer_data_type == kTypeUnknown) { | |||||
| kernel = FindCpuKernel(in_tensors, out_tensors, op_parameter, desc, kNumberTypeFloat16); | |||||
| if (kernel != nullptr) { | if (kernel != nullptr) { | ||||
| return kernel; | return kernel; | ||||
| } else { | } else { | ||||
| MS_LOG(DEBUG) << "Get fp16 op failed, scheduler to cpu: " << PrimitiveCurVersionTypeName(fp16_cpu_desc.type) | |||||
| << " " << node->name_; | |||||
| MS_LOG(DEBUG) << "Get fp16 op failed, scheduler to cpu: " << PrimitiveCurVersionTypeName(desc.type) << " " | |||||
| << node->name_; | |||||
| auto ret = InferNodeShape(node, &infer_shape_interrupt); | auto ret = InferNodeShape(node, &infer_shape_interrupt); | ||||
| if (ret == RET_INFER_INVALID || ret == RET_OK) { | if (ret == RET_INFER_INVALID || ret == RET_OK) { | ||||
| op_parameter = op_parameters_[node->output_indices_.at(0)]; | op_parameter = op_parameters_[node->output_indices_.at(0)]; | ||||
| @@ -452,20 +492,18 @@ kernel::LiteKernel *Scheduler::FindBackendKernel(const std::vector<Tensor *> &in | |||||
| desc.data_type = kNumberTypeFloat32; | desc.data_type = kNumberTypeFloat32; | ||||
| } | } | ||||
| if (prefer_data_type == kNumberTypeFloat32 || prefer_data_type == kTypeUnknown) { | if (prefer_data_type == kNumberTypeFloat32 || prefer_data_type == kTypeUnknown) { | ||||
| auto kernel = FindCpuKernel(in_tensors, out_tensors, op_parameter, desc, kNumberTypeFloat32); | |||||
| kernel = FindCpuKernel(in_tensors, out_tensors, op_parameter, desc, kNumberTypeFloat32); | |||||
| if (kernel != nullptr) { | if (kernel != nullptr) { | ||||
| return kernel; | return kernel; | ||||
| } else { | } else { | ||||
| auto ret = InferNodeShape(node, &infer_shape_interrupt); | auto ret = InferNodeShape(node, &infer_shape_interrupt); | ||||
| if (!(ret == RET_INFER_INVALID || ret == RET_OK)) { | if (!(ret == RET_INFER_INVALID || ret == RET_OK)) { | ||||
| MS_LOG(ERROR) | |||||
| << "Try repeat infer fail: " << node->name_; | |||||
| MS_LOG(ERROR) << "Try repeat infer fail: " << node->name_; | |||||
| } | } | ||||
| } | } | ||||
| } | } | ||||
| return nullptr; | return nullptr; | ||||
| } // namespace mindspore::lite | |||||
| } | |||||
| kernel::LiteKernel *Scheduler::SchedulePartialToKernel(const lite::Model::Node *src_node) { | kernel::LiteKernel *Scheduler::SchedulePartialToKernel(const lite::Model::Node *src_node) { | ||||
| MS_ASSERT(src_model_ != nullptr); | MS_ASSERT(src_model_ != nullptr); | ||||
| @@ -61,6 +61,10 @@ class Scheduler { | |||||
| TypeId prefer_data_type = kTypeUnknown); | TypeId prefer_data_type = kTypeUnknown); | ||||
| kernel::LiteKernel *FindCpuKernel(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors, | kernel::LiteKernel *FindCpuKernel(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors, | ||||
| OpParameter *op_parameter, const kernel::KernelKey &desc, TypeId kernel_data_type); | OpParameter *op_parameter, const kernel::KernelKey &desc, TypeId kernel_data_type); | ||||
| kernel::LiteKernel *FindGpuKernel(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors, | |||||
| OpParameter *op_parameter, const kernel::KernelKey &desc); | |||||
| kernel::LiteKernel *FindNpuKernel(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors, | |||||
| OpParameter *op_parameter, const kernel::KernelKey &desc); | |||||
| // schedule a partial node to a subgraph_kernel | // schedule a partial node to a subgraph_kernel | ||||
| kernel::LiteKernel *SchedulePartialToKernel(const lite::Model::Node *src_node); | kernel::LiteKernel *SchedulePartialToKernel(const lite::Model::Node *src_node); | ||||
| // schedule a node to a kernel | // schedule a node to a kernel | ||||
| @@ -412,9 +412,7 @@ int Benchmark::MarkPerformance() { | |||||
| for (int i = 0; i < flags_->loop_count_; i++) { | for (int i = 0; i < flags_->loop_count_; i++) { | ||||
| session_->BindThread(true); | session_->BindThread(true); | ||||
| auto start = GetTimeUs(); | auto start = GetTimeUs(); | ||||
| auto status = (flags_->time_profiling_ || flags_->perf_profiling_) | |||||
| ? session_->RunGraph(before_call_back_, after_call_back_) | |||||
| : session_->RunGraph(); | |||||
| auto status = session_->RunGraph(before_call_back_, after_call_back_); | |||||
| if (status != 0) { | if (status != 0) { | ||||
| MS_LOG(ERROR) << "Inference error " << status; | MS_LOG(ERROR) << "Inference error " << status; | ||||
| std::cerr << "Inference error " << status; | std::cerr << "Inference error " << status; | ||||
| @@ -479,7 +477,7 @@ int Benchmark::MarkAccuracy() { | |||||
| std::cerr << "PrintInputData error " << status << std::endl; | std::cerr << "PrintInputData error " << status << std::endl; | ||||
| return status; | return status; | ||||
| } | } | ||||
| status = session_->RunGraph(); | |||||
| status = session_->RunGraph(before_call_back_, after_call_back_); | |||||
| if (status != RET_OK) { | if (status != RET_OK) { | ||||
| MS_LOG(ERROR) << "Inference error " << status; | MS_LOG(ERROR) << "Inference error " << status; | ||||
| std::cerr << "Inference error " << status << std::endl; | std::cerr << "Inference error " << status << std::endl; | ||||
| @@ -615,7 +613,9 @@ int Benchmark::RunBenchmark() { | |||||
| return ret; | return ret; | ||||
| } | } | ||||
| } | } | ||||
| if (model != nullptr) model->Free(); | |||||
| if (model != nullptr) { | |||||
| model->Free(); | |||||
| } | |||||
| ms_inputs_ = session_->GetInputs(); | ms_inputs_ = session_->GetInputs(); | ||||
| auto end_prepare_time = GetTimeUs(); | auto end_prepare_time = GetTimeUs(); | ||||
| @@ -689,18 +689,18 @@ int Benchmark::InitTimeProfilingCallbackParameter() { | |||||
| // before callback | // before callback | ||||
| before_call_back_ = [&](const std::vector<mindspore::tensor::MSTensor *> &before_inputs, | before_call_back_ = [&](const std::vector<mindspore::tensor::MSTensor *> &before_inputs, | ||||
| const std::vector<mindspore::tensor::MSTensor *> &before_outputs, | const std::vector<mindspore::tensor::MSTensor *> &before_outputs, | ||||
| const CallBackParam &callParam) { | |||||
| const CallBackParam &call_param) { | |||||
| if (before_inputs.empty()) { | if (before_inputs.empty()) { | ||||
| MS_LOG(INFO) << "The num of beforeInputs is empty"; | MS_LOG(INFO) << "The num of beforeInputs is empty"; | ||||
| } | } | ||||
| if (before_outputs.empty()) { | if (before_outputs.empty()) { | ||||
| MS_LOG(INFO) << "The num of beforeOutputs is empty"; | MS_LOG(INFO) << "The num of beforeOutputs is empty"; | ||||
| } | } | ||||
| if (op_times_by_type_.find(callParam.node_type) == op_times_by_type_.end()) { | |||||
| op_times_by_type_.insert(std::make_pair(callParam.node_type, std::make_pair(0, 0.0f))); | |||||
| if (op_times_by_type_.find(call_param.node_type) == op_times_by_type_.end()) { | |||||
| op_times_by_type_.insert(std::make_pair(call_param.node_type, std::make_pair(0, 0.0f))); | |||||
| } | } | ||||
| if (op_times_by_name_.find(callParam.node_name) == op_times_by_name_.end()) { | |||||
| op_times_by_name_.insert(std::make_pair(callParam.node_name, std::make_pair(0, 0.0f))); | |||||
| if (op_times_by_name_.find(call_param.node_name) == op_times_by_name_.end()) { | |||||
| op_times_by_name_.insert(std::make_pair(call_param.node_name, std::make_pair(0, 0.0f))); | |||||
| } | } | ||||
| op_call_times_total_++; | op_call_times_total_++; | ||||
| @@ -735,6 +735,7 @@ int Benchmark::InitTimeProfilingCallbackParameter() { | |||||
| }; | }; | ||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| int Benchmark::InitPerfProfilingCallbackParameter() { | int Benchmark::InitPerfProfilingCallbackParameter() { | ||||
| #ifndef ENABLE_ARM64 | #ifndef ENABLE_ARM64 | ||||
| MS_LOG(ERROR) << "Only support perf_profiling on arm64."; | MS_LOG(ERROR) << "Only support perf_profiling on arm64."; | ||||
| @@ -781,18 +782,18 @@ int Benchmark::InitPerfProfilingCallbackParameter() { | |||||
| // before callback | // before callback | ||||
| before_call_back_ = [&](const std::vector<mindspore::tensor::MSTensor *> &before_inputs, | before_call_back_ = [&](const std::vector<mindspore::tensor::MSTensor *> &before_inputs, | ||||
| const std::vector<mindspore::tensor::MSTensor *> &before_outputs, | const std::vector<mindspore::tensor::MSTensor *> &before_outputs, | ||||
| const CallBackParam &callParam) { | |||||
| const CallBackParam &call_param) { | |||||
| if (before_inputs.empty()) { | if (before_inputs.empty()) { | ||||
| MS_LOG(INFO) << "The num of beforeInputs is empty"; | MS_LOG(INFO) << "The num of beforeInputs is empty"; | ||||
| } | } | ||||
| if (before_outputs.empty()) { | if (before_outputs.empty()) { | ||||
| MS_LOG(INFO) << "The num of beforeOutputs is empty"; | MS_LOG(INFO) << "The num of beforeOutputs is empty"; | ||||
| } | } | ||||
| if (op_perf_by_type_.find(callParam.node_type) == op_perf_by_type_.end()) { | |||||
| op_perf_by_type_.insert(std::make_pair(callParam.node_type, std::make_pair(0, zero))); | |||||
| if (op_perf_by_type_.find(call_param.node_type) == op_perf_by_type_.end()) { | |||||
| op_perf_by_type_.insert(std::make_pair(call_param.node_type, std::make_pair(0, zero))); | |||||
| } | } | ||||
| if (op_perf_by_name_.find(callParam.node_name) == op_perf_by_name_.end()) { | |||||
| op_perf_by_name_.insert(std::make_pair(callParam.node_name, std::make_pair(0, zero))); | |||||
| if (op_perf_by_name_.find(call_param.node_name) == op_perf_by_name_.end()) { | |||||
| op_perf_by_name_.insert(std::make_pair(call_param.node_name, std::make_pair(0, zero))); | |||||
| } | } | ||||
| op_call_times_total_++; | op_call_times_total_++; | ||||
| @@ -831,12 +832,89 @@ int Benchmark::InitPerfProfilingCallbackParameter() { | |||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| namespace { | |||||
// Renders the leading elements of a raw buffer as text, each one preceded by a single space.
//
// T            element type the buffer is interpreted as (an arithmetic type).
// data         start of the buffer; a null pointer yields a fixed diagnostic message instead.
// data_number  number of T elements available in the buffer.
//
// Returns the formatted elements (at most the first 40), or "Data of tensor is nullptr"
// when data is null.
template <typename T>
std::string DataToString(void *data, size_t data_number) {
  if (data == nullptr) {
    return "Data of tensor is nullptr";
  }
  constexpr size_t kMaxPrintCount = 40;  // cap the dump so large buffers stay readable
  const size_t print_count = data_number < kMaxPrintCount ? data_number : kMaxPrintCount;
  const auto *elements = static_cast<const T *>(data);
  std::ostringstream oss;
  for (size_t i = 0; i < print_count; i++) {
    // Unary plus applies integral promotion, so int8_t/uint8_t elements are written as
    // numbers. Streaming them directly would select the character overload of operator<<
    // and emit raw bytes. Wider integer and floating-point types print exactly as before.
    oss << " " << +elements[i];
  }
  return oss.str();
}
| std::string DumpMSTensor(tensor::MSTensor *tensor) { | |||||
| if (tensor == nullptr) { | |||||
| return "Tensor is nullptr"; | |||||
| } | |||||
| std::ostringstream oss; | |||||
| oss << " DataType: " << tensor->data_type(); | |||||
| oss << " Shape:"; | |||||
| for (auto &dim : tensor->shape()) { | |||||
| oss << " " << dim; | |||||
| } | |||||
| oss << std::endl << "Data:"; | |||||
| switch (tensor->data_type()) { | |||||
| case kNumberTypeFloat32: { | |||||
| oss << DataToString<float>(tensor->data(), tensor->ElementsNum()); | |||||
| } break; | |||||
| case kNumberTypeFloat16: { | |||||
| oss << DataToString<int16_t>(tensor->data(), tensor->ElementsNum()); | |||||
| } break; | |||||
| case kNumberTypeInt32: { | |||||
| oss << DataToString<int32_t>(tensor->data(), tensor->ElementsNum()); | |||||
| } break; | |||||
| case kNumberTypeInt16: { | |||||
| oss << DataToString<int16_t>(tensor->data(), tensor->ElementsNum()); | |||||
| } break; | |||||
| case kNumberTypeInt8: { | |||||
| oss << DataToString<int8_t>(tensor->data(), tensor->ElementsNum()); | |||||
| } break; | |||||
| default: | |||||
| oss << "Unsupported data type to print"; | |||||
| break; | |||||
| } | |||||
| return oss.str(); | |||||
| } | |||||
| } // namespace | |||||
| int Benchmark::InitDumpProfilingCallbackParameter() { | |||||
| // before callback | |||||
| before_call_back_ = [&](const std::vector<mindspore::tensor::MSTensor *> &before_inputs, | |||||
| const std::vector<mindspore::tensor::MSTensor *> &before_outputs, | |||||
| const CallBackParam &call_param) { return true; }; | |||||
| // after callback | |||||
| after_call_back_ = [&](const std::vector<mindspore::tensor::MSTensor *> &after_inputs, | |||||
| const std::vector<mindspore::tensor::MSTensor *> &after_outputs, | |||||
| const CallBackParam &call_param) { | |||||
| std::cout << "================================================================" << std::endl; | |||||
| std::cout << call_param.node_name << " inputs : " << std::endl; | |||||
| for (auto ms_tensor : after_inputs) { | |||||
| std::cout << DumpMSTensor(ms_tensor) << std::endl; | |||||
| } | |||||
| std::cout << "----------------------------------------------------------------" << std::endl; | |||||
| std::cout << call_param.node_name << " outputs : " << std::endl; | |||||
| for (const auto ms_tensor : after_outputs) { | |||||
| std::cout << DumpMSTensor(ms_tensor) << std::endl; | |||||
| } | |||||
| std::cout << "================================================================" << std::endl; | |||||
| return true; | |||||
| }; | |||||
| return RET_OK; | |||||
| } | |||||
| int Benchmark::InitCallbackParameter() { | int Benchmark::InitCallbackParameter() { | ||||
| int ret = RET_OK; | int ret = RET_OK; | ||||
| if (flags_->time_profiling_) { | if (flags_->time_profiling_) { | ||||
| ret = InitTimeProfilingCallbackParameter(); | ret = InitTimeProfilingCallbackParameter(); | ||||
| } else if (flags_->perf_profiling_) { | } else if (flags_->perf_profiling_) { | ||||
| ret = InitPerfProfilingCallbackParameter(); | ret = InitPerfProfilingCallbackParameter(); | ||||
| } else if (flags_->dump_profiling_) { | |||||
| ret = InitDumpProfilingCallbackParameter(); | |||||
| } | } | ||||
| return ret; | return ret; | ||||
| } | } | ||||
| @@ -917,16 +995,14 @@ int Benchmark::Init() { | |||||
| return RET_ERROR; | return RET_ERROR; | ||||
| } | } | ||||
| if (flags_->time_profiling_ || flags_->perf_profiling_) { | |||||
| if (flags_->time_profiling_ && flags_->perf_profiling_) { | |||||
| MS_LOG(INFO) << "time_profiling is enabled, will not run perf_profiling."; | |||||
| } | |||||
| auto status = InitCallbackParameter(); | |||||
| if (status != RET_OK) { | |||||
| MS_LOG(ERROR) << "Init callback Parameter failed."; | |||||
| std::cerr << "Init callback Parameter failed." << std::endl; | |||||
| return RET_ERROR; | |||||
| } | |||||
| if (flags_->time_profiling_ && flags_->perf_profiling_) { | |||||
| MS_LOG(INFO) << "time_profiling is enabled, will not run perf_profiling."; | |||||
| } | |||||
| auto status = InitCallbackParameter(); | |||||
| if (status != RET_OK) { | |||||
| MS_LOG(ERROR) << "Init callback Parameter failed."; | |||||
| std::cerr << "Init callback Parameter failed." << std::endl; | |||||
| return RET_ERROR; | |||||
| } | } | ||||
| return RET_OK; | return RET_OK; | ||||
| @@ -113,9 +113,6 @@ class MS_API BenchmarkFlags : public virtual FlagParser { | |||||
| int num_threads_ = 2; | int num_threads_ = 2; | ||||
| bool enable_fp16_ = false; | bool enable_fp16_ = false; | ||||
| int warm_up_loop_count_ = 3; | int warm_up_loop_count_ = 3; | ||||
| bool time_profiling_ = false; | |||||
| bool perf_profiling_ = false; | |||||
| std::string perf_event_ = "CYCLE"; | |||||
| // MarkAccuracy | // MarkAccuracy | ||||
| std::string benchmark_data_file_; | std::string benchmark_data_file_; | ||||
| std::string benchmark_data_type_ = "FLOAT"; | std::string benchmark_data_type_ = "FLOAT"; | ||||
| @@ -125,6 +122,10 @@ class MS_API BenchmarkFlags : public virtual FlagParser { | |||||
| std::vector<std::vector<int>> resize_dims_; | std::vector<std::vector<int>> resize_dims_; | ||||
| std::string device_ = "CPU"; | std::string device_ = "CPU"; | ||||
| bool time_profiling_ = false; | |||||
| bool perf_profiling_ = false; | |||||
| std::string perf_event_ = "CYCLE"; | |||||
| bool dump_profiling_ = false; | |||||
| }; | }; | ||||
| class MS_API Benchmark { | class MS_API Benchmark { | ||||
| @@ -163,9 +164,13 @@ class MS_API Benchmark { | |||||
| int *total_size); | int *total_size); | ||||
| int InitCallbackParameter(); | int InitCallbackParameter(); | ||||
| int InitTimeProfilingCallbackParameter(); | int InitTimeProfilingCallbackParameter(); | ||||
| int InitPerfProfilingCallbackParameter(); | int InitPerfProfilingCallbackParameter(); | ||||
| int InitDumpProfilingCallbackParameter(); | |||||
| int PrintResult(const std::vector<std::string> &title, const std::map<std::string, std::pair<int, float>> &result); | int PrintResult(const std::vector<std::string> &title, const std::map<std::string, std::pair<int, float>> &result); | ||||
| #ifdef ENABLE_ARM64 | #ifdef ENABLE_ARM64 | ||||
| @@ -289,8 +294,8 @@ class MS_API Benchmark { | |||||
| std::map<std::string, std::pair<int, struct PerfCount>> op_perf_by_type_; | std::map<std::string, std::pair<int, struct PerfCount>> op_perf_by_type_; | ||||
| std::map<std::string, std::pair<int, struct PerfCount>> op_perf_by_name_; | std::map<std::string, std::pair<int, struct PerfCount>> op_perf_by_name_; | ||||
| #endif | #endif | ||||
| KernelCallBack before_call_back_; | |||||
| KernelCallBack after_call_back_; | |||||
| KernelCallBack before_call_back_ = nullptr; | |||||
| KernelCallBack after_call_back_ = nullptr; | |||||
| std::mt19937 random_engine_; | std::mt19937 random_engine_; | ||||
| }; | }; | ||||
| @@ -193,7 +193,7 @@ STATUS InferShapePass::GetCNodeInputTensors(const CNodePtr &cnode, std::vector<l | |||||
| tensor::TensorPtr tensor_info; | tensor::TensorPtr tensor_info; | ||||
| auto status = GetTensorInfoFromAbstract(&tensor_info, cnode, i); | auto status = GetTensorInfoFromAbstract(&tensor_info, cnode, i); | ||||
| if (status != RET_OK) { | if (status != RET_OK) { | ||||
| MS_LOG(ERROR) << "get tensor info failed."; | |||||
| MS_LOG(DEBUG) << "get tensor info failed."; | |||||
| return RET_ERROR; | return RET_ERROR; | ||||
| } | } | ||||
| std::unique_ptr<lite::Tensor> tensor = nullptr; | std::unique_ptr<lite::Tensor> tensor = nullptr; | ||||