@@ -347,11 +347,16 @@ void LiteSession::InitGraphInOutTensors(const lite::Model *model) {
   }
 }
-void LiteSession::FreePackOpWeight() {
-  for (auto *kernel : kernels_) {
+void LiteSession::FreePackOpWeight(const std::vector<kernel::LiteKernel *> &kernels) {
+  for (auto *kernel : kernels) {
     MS_ASSERT(kernel != nullptr);
-    if (!IsPackedOp(kernel->Type())) {
-      continue;
+    if (kernel->subgraph_type() == kernel::kNotSubGraph) {
+      if (!IsPackedOp(kernel->Type())) {
+        continue;
+      }
+    } else {
+      auto subgraph = reinterpret_cast<kernel::SubGraphKernel *>(kernel);
+      FreePackOpWeight(subgraph->nodes());
     }
     auto inputs = kernel->in_tensors();
     for (auto *tensor : inputs) {
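The new overload recurses into subgraph kernels instead of assuming the flat kernels_ list. A minimal sketch of the same traversal with the freeing logic factored out (the helper name and output vector are illustrative, not part of this patch); note that in the hunk above, subgraph kernels fall through after the recursion, so their own const input tensors are released as well:

    // Sketch only: mirrors the recursion in FreePackOpWeight above.
    void CollectPackedOps(const std::vector<kernel::LiteKernel *> &kernels,
                          std::vector<kernel::LiteKernel *> *packed) {
      for (auto *kernel : kernels) {
        MS_ASSERT(kernel != nullptr);
        if (kernel->subgraph_type() == kernel::kNotSubGraph) {
          if (IsPackedOp(kernel->Type())) {
            packed->push_back(kernel);  // leaf kernel whose weight has been repacked
          }
        } else {
          // subgraph kernels expose their inner kernels via nodes()
          auto *subgraph = reinterpret_cast<kernel::SubGraphKernel *>(kernel);
          CollectPackedOps(subgraph->nodes(), packed);
        }
      }
    }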
@@ -444,8 +449,10 @@ int LiteSession::CompileGraph(Model *model) {
     is_running_.store(false);
     return ret;
   }
+#ifndef SUPPORT_TRAIN
   // To reduce runtime RAM, free pack-op weights: a pack op packs its weight once and never accesses the origin weight again
-  FreePackOpWeight();
+  FreePackOpWeight(kernels_);
+#endif
   is_running_.store(false);
   return RET_OK;
 }  // namespace lite
@@ -106,7 +106,7 @@ class LiteSession : public session::LiteSession {
   static int ReSizeKernels(const std::vector<kernel::LiteKernel *> &kernels);
-  void FreePackOpWeight();
+  static void FreePackOpWeight(const std::vector<kernel::LiteKernel *> &kernels);
  private:
   void ResetInputsShape(const std::vector<std::vector<int>> &dims);
@@ -368,8 +368,6 @@ bool OpenCLRuntime::SetFp16Enable(bool enable) {
   return fp16_enable_ == enable;
 }
-bool OpenCLRuntime::IsSupportFloat16() { return support_fp16_; }
 int OpenCLRuntime::BuildKernel(const cl::Kernel &kernel, const std::string &program_name,
                                const std::string &kernel_name, const std::vector<std::string> &build_options_ext) {
   std::string build_option = default_build_option_;
@@ -70,7 +70,6 @@ class OpenCLRuntime {
   GpuInfo GetGpuInfo();
   bool GetFp16Enable() const;
   bool SetFp16Enable(bool enable);
-  bool IsSupportFloat16();
   bool GetSVMEnable() const { return svm_enable_; }
   void SetSVMEnable(bool enable) { svm_enable_ = enable; }
   const std::vector<size_t> &GetWorkItemSize() const { return max_work_item_sizes_; }
@@ -260,7 +260,7 @@ void Conv2DOpenCLKernel::InitFilter() {
   // rearrange filter
   auto filter_tensor = in_tensors_.at(1);
-  void *src_data = filter_tensor->data_c();
+  void *src_data = stored_filter_ == nullptr ? filter_tensor->data_c() : stored_filter_;
   auto src_dtype = filter_tensor->data_type();
   auto dst_dtype = use_fp16_ ? kNumberTypeFloat16 : kNumberTypeFloat32;
   std::vector<char> tmp(size, 0);
@@ -279,7 +279,7 @@ void Conv2DOpenCLKernel::InitFilter() {
     allocator->UnmapBuffer(packed_filter_);
   }
-  FreeTmpWeight(in_tensors_.at(kWeightIndex));
+  FreeStoredData(stored_filter_);
 }
 void Conv2DOpenCLKernel::InitBias() {
@@ -287,6 +287,7 @@ void Conv2DOpenCLKernel::InitBias() {
   // align bias from C to C4
   auto bias_tensor = in_tensors_.at(2);
+  void *src_data = stored_bias_ == nullptr ? bias_tensor->data_c() : stored_bias_;
   size_t packed_bias_size = UP_ROUND(CO_SLICES_, block_size_.C) * CO_TILE * sizeof_FLT_;
   packed_bias_ = allocator->Malloc(packed_bias_size, lite::opencl::MemType::BUF);
@@ -294,10 +295,10 @@ void Conv2DOpenCLKernel::InitBias() {
   memset(packed_bias_, 0x00, packed_bias_size);
   if (bias_tensor->data_type() == kNumberTypeFloat16) {
     if (use_fp16_) {
-      memcpy(packed_bias_, bias_tensor->data_c(), CO_ * sizeof_FLT_);
+      memcpy(packed_bias_, src_data, CO_ * sizeof_FLT_);
     } else {
       auto packed_bias_fp32 = reinterpret_cast<float *>(packed_bias_);
-      auto origin_bias_fp16 = reinterpret_cast<float16_t *>(bias_tensor->data_c());
+      auto origin_bias_fp16 = reinterpret_cast<float16_t *>(src_data);
       MS_ASSERT(origin_bias_fp16);
       for (int i = 0; i < CO_; ++i) {
         packed_bias_fp32[i] = static_cast<float>(origin_bias_fp16[i]);
@@ -306,17 +307,17 @@ void Conv2DOpenCLKernel::InitBias() {
   } else {
     if (use_fp16_) {
       auto packed_bias_fp16 = reinterpret_cast<float16_t *>(packed_bias_);
-      auto origin_bias_fp32 = reinterpret_cast<float *>(bias_tensor->data_c());
+      auto origin_bias_fp32 = reinterpret_cast<float *>(src_data);
       MS_ASSERT(origin_bias_fp32);
       for (int i = 0; i < CO_; ++i) {
         packed_bias_fp16[i] = static_cast<float16_t>(origin_bias_fp32[i]);
       }
     } else {
-      memcpy(packed_bias_, bias_tensor->data_c(), CO_ * sizeof_FLT_);
+      memcpy(packed_bias_, src_data, CO_ * sizeof_FLT_);
     }
   }
   allocator->UnmapBuffer(packed_bias_);
-  FreeTmpWeight(in_tensors_.at(kBiasIndex));
+  FreeStoredData(stored_bias_);
 }
 void Conv2DOpenCLKernel::SetConstArgs() {
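The four branches of InitBias reduce to one rule: copy bytes when the tensor dtype matches the runtime dtype, cast element-wise otherwise. A condensed sketch of that rule (hypothetical helper, assuming `count` contiguous elements and the same float16_t type the kernels above use):

    // Sketch of the dtype dispatch performed by InitBias; not part of the patch.
    void PackBiasSketch(const void *src, void *dst, int count, bool src_fp16, bool dst_fp16) {
      if (src_fp16 == dst_fp16) {
        memcpy(dst, src, count * (dst_fp16 ? sizeof(float16_t) : sizeof(float)));  // dtypes match
      } else if (src_fp16) {
        for (int i = 0; i < count; ++i) {  // fp16 weights on an fp32 runtime: widen
          static_cast<float *>(dst)[i] = static_cast<float>(static_cast<const float16_t *>(src)[i]);
        }
      } else {
        for (int i = 0; i < count; ++i) {  // fp32 weights on an fp16 runtime: narrow
          static_cast<float16_t *>(dst)[i] = static_cast<float16_t>(static_cast<const float *>(src)[i]);
        }
      }
    }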
@@ -403,6 +404,24 @@ std::vector<BaseTuningParameter> Conv2DOpenCLKernel::GenerateTuningParam() {
   return tuning_params;
 }
+int Conv2DOpenCLKernel::StoreConstData() {
+  if (!op_parameter_->infer_flag_) {
+    stored_filter_ = StoreTensorData(in_tensors_.at(kWeightIndex));
+    if (stored_filter_ == nullptr) {
+      MS_LOG(ERROR) << "Store weight failed.";
+      return RET_ERROR;
+    }
+    if (in_tensors_.size() > kBiasIndex) {
+      stored_bias_ = StoreTensorData(in_tensors_.at(kBiasIndex));
+      if (stored_bias_ == nullptr) {
+        MS_LOG(ERROR) << "Store bias failed.";
+        return RET_ERROR;
+      }
+    }
+  }
+  return RET_OK;
+}
 bool UseFcReplaceConv(const std::vector<lite::Tensor *> &inputs, const std::vector<lite::Tensor *> &outputs,
                       ConvParameter *param) {
   MS_ASSERT(param);
@@ -528,11 +547,12 @@ kernel::LiteKernel *OpenCLConv2DCreator(const std::vector<lite::Tensor *> &input
     }
   }
   if (!infer_shape_done) {
-    StoreTmpWeight(inputs.at(kWeightIndex));
-    if (inputs.size() > kBiasIndex) {
-      StoreTmpWeight(inputs.at(kBiasIndex));
+    auto ret = reinterpret_cast<Conv2DOpenCLKernel *>(kernel)->StoreConstData();
+    if (ret != mindspore::lite::RET_OK) {
+      MS_LOG(ERROR) << "Store " << opParameter->name_ << " const data failed!";
+      delete kernel;
+      return nullptr;
     }
     MS_LOG(WARNING) << "kernel don't infer shape yet!";
     return kernel;
   }
   if (kernel->CheckSpecs() != RET_OK || kernel->OpenCLKernel::CheckSpecs() != RET_OK) {
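The same store-or-bail sequence appears again in OpenCLMatMulKernelCreator and in the generic OpenCLKernelCreator further down; a hypothetical helper that captures it (sketch only, not in the patch):

    // Possible factoring of the repeated early-return path in the creators.
    template <typename KernelT>
    kernel::LiteKernel *ReturnKernelBeforeInferShape(KernelT *kernel, OpParameter *op_parameter) {
      if (kernel->StoreConstData() != mindspore::lite::RET_OK) {
        MS_LOG(ERROR) << "Store " << op_parameter->name_ << " const data failed!";
        delete kernel;
        return nullptr;
      }
      MS_LOG(WARNING) << "kernel don't infer shape yet!";
      return kernel;
    }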
@@ -58,6 +58,8 @@ class Conv2DOpenCLKernel : public OpenCLKernel {
   void SetGlobalLocal() override;
   int Run() override;
+  int StoreConstData() override;
   std::string Key() override {
     auto key = OpenCLKernel::Key();
     key += "_" + std::to_string(KH_) + "_" + std::to_string(KW_) + "_" + std::to_string(param_->stride_h_) + "_" +
@@ -94,7 +96,9 @@ class Conv2DOpenCLKernel : public OpenCLKernel {
   int KH_{};
   int KW_{};
   void *packed_filter_{nullptr};
+  void *stored_filter_{nullptr};
   void *packed_bias_{nullptr};
+  void *stored_bias_{nullptr};
   MemType filter_type_{MemType::BUF};
   bool has_bias_{false};
   int TILE_HW_{};
@@ -148,7 +148,7 @@ int Conv2dTransposeOpenCLKernel::InitFilter() {
   padWeight_ = allocator->Malloc(div_ci * div_co * C4NUM * C4NUM * kh * kw * data_size, lite::opencl::MemType::BUF);
   padWeight_ = allocator->MapBuffer(padWeight_, CL_MAP_WRITE, nullptr, true);
   memset(padWeight_, 0x00, div_ci * div_co * C4NUM * C4NUM * kh * kw * data_size);
-  auto origin_weight = in_tensors_.at(kWeightIndex)->data_c();
+  auto origin_weight = stored_weight_ == nullptr ? in_tensors_.at(kWeightIndex)->data_c() : stored_weight_;
   auto weight_dtype = in_tensors_.at(kWeightIndex)->data_type();
   int index = 0;
   for (int co_i = 0; co_i < div_co; co_i++) {
@@ -188,6 +188,7 @@ int Conv2dTransposeOpenCLKernel::InitFilter() {
     }
   }
   allocator->UnmapBuffer(padWeight_);
+  FreeStoredData(stored_weight_);
   return RET_OK;
 }
@@ -209,20 +210,22 @@ int Conv2dTransposeOpenCLKernel::InitBias() {
   bias_ = allocator->MapBuffer(bias_, CL_MAP_WRITE, nullptr, true);
   memset(bias_, 0x00, div_co * C4NUM * data_size);
   if (in_tensors_.size() == 3) {
+    void *src_data = stored_bias_ == nullptr ? in_tensors_.at(kBiasIndex)->data_c() : stored_bias_;
     auto bias_dtype = in_tensors_[2]->data_type();
     if (bias_dtype == kNumberTypeFloat32 && enable_fp16_) {
       for (int i = 0; i < co; i++) {
-        reinterpret_cast<float16_t *>(bias_)[i] = reinterpret_cast<float *>(in_tensors_[2]->data_c())[i];
+        reinterpret_cast<float16_t *>(bias_)[i] = reinterpret_cast<float *>(src_data)[i];
       }
     } else if (bias_dtype == kNumberTypeFloat16 && !enable_fp16_) {
       for (int i = 0; i < co; i++) {
-        reinterpret_cast<float *>(bias_)[i] = reinterpret_cast<float16_t *>(in_tensors_[2]->data_c())[i];
+        reinterpret_cast<float *>(bias_)[i] = reinterpret_cast<float16_t *>(src_data)[i];
       }
     } else {
-      memcpy(bias_, in_tensors_[2]->data_c(), co * data_size);
+      memcpy(bias_, src_data, co * data_size);
     }
   }
   allocator->UnmapBuffer(bias_);
+  FreeStoredData(stored_bias_);
   return RET_OK;
 }
@@ -243,6 +246,24 @@ int Conv2dTransposeOpenCLKernel::InferShape() {
   return RET_OK;
 }
+int Conv2dTransposeOpenCLKernel::StoreConstData() {
+  if (!op_parameter_->infer_flag_) {
+    stored_weight_ = StoreTensorData(in_tensors_.at(kWeightIndex));
+    if (stored_weight_ == nullptr) {
+      MS_LOG(ERROR) << "Store weight failed.";
+      return RET_ERROR;
+    }
+    if (in_tensors_.size() > kBiasIndex) {
+      stored_bias_ = StoreTensorData(in_tensors_.at(kBiasIndex));
+      if (stored_bias_ == nullptr) {
+        MS_LOG(ERROR) << "Store bias failed.";
+        return RET_ERROR;
+      }
+    }
+  }
+  return RET_OK;
+}
 kernel::LiteKernel *OpenCLConv2dTransposeCreator(const std::vector<lite::Tensor *> &inputs,
                                                  const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
                                                  const lite::Context *ctx, const kernel::KernelKey &desc) {
@@ -39,10 +39,13 @@ class Conv2dTransposeOpenCLKernel : public OpenCLKernel {
   void SetConstArgs() override;
   void SetGlobalLocal() override;
   int InferShape() override;
+  int StoreConstData() override;
  private:
   void *padWeight_{nullptr};
   void *bias_{nullptr};
+  void *stored_weight_{nullptr};
+  void *stored_bias_{nullptr};
   bool enable_fp16_{false};
 };
 }  // namespace mindspore::kernel
@@ -110,7 +110,7 @@ int DepthwiseConv2dOpenCLKernel::InitWeights() {
   size_t dtype_size = is_fp16 ? sizeof(int16_t) : sizeof(float);
   auto out_info = GpuTensorInfo(out_tensors_[0]);
   // weight: o, h, w, i; o == group, i == 1
-  void *origin_weight = in_tensors_.at(kWeightIndex)->data_c();
+  void *origin_weight = stored_weight_ == nullptr ? in_tensors_.at(kWeightIndex)->data_c() : stored_weight_;
   int CO4 = UP_DIV(out_info.C, C4NUM * block_size_.C);
   int pack_weight_size = C4NUM * CO4 * parameter->kernel_h_ * parameter->kernel_w_;
@@ -162,6 +162,7 @@ int DepthwiseConv2dOpenCLKernel::InitWeights() {
   if (packed_weight_ == nullptr) {
     return RET_ERROR;
   }
+  FreeStoredData(stored_weight_);
   return mindspore::lite::RET_OK;
 }
@@ -196,12 +197,14 @@ int DepthwiseConv2dOpenCLKernel::InitBias() {
     src_type = in_tensors_.at(kBiasIndex)->data_type();
     dst_type = is_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32;
     auto element_size = in_tensors_.at(kBiasIndex)->ElementsNum();
-    ConvertBias(in_tensors_.at(kBiasIndex)->data_c(), temp_bias.data(), element_size, dtype_size, src_type, dst_type);
+    void *src_data = stored_bias_ == nullptr ? in_tensors_.at(kBiasIndex)->data_c() : stored_bias_;
+    ConvertBias(src_data, temp_bias.data(), element_size, dtype_size, src_type, dst_type);
   }
   bias_data_ = allocator->Malloc(bias_size, temp_bias.data());
   if (bias_data_ == nullptr) {
     return RET_ERROR;
   }
+  FreeStoredData(stored_bias_);
   return mindspore::lite::RET_OK;
 }
@@ -250,6 +253,24 @@ void DepthwiseConv2dOpenCLKernel::SetGlobalLocal() {
   OpenCLKernel::AlignGlobalLocal(global_size_, local_size_);
 }
+int DepthwiseConv2dOpenCLKernel::StoreConstData() {
+  if (!op_parameter_->infer_flag_) {
+    stored_weight_ = StoreTensorData(in_tensors_.at(kWeightIndex));
+    if (stored_weight_ == nullptr) {
+      MS_LOG(ERROR) << "Store weight failed.";
+      return RET_ERROR;
+    }
+    if (in_tensors_.size() > kBiasIndex) {
+      stored_bias_ = StoreTensorData(in_tensors_.at(kBiasIndex));
+      if (stored_bias_ == nullptr) {
+        MS_LOG(ERROR) << "Store bias failed.";
+        return RET_ERROR;
+      }
+    }
+  }
+  return RET_OK;
+}
 int DepthwiseConv2dOpenCLKernel::Run() {
   MS_LOG(DEBUG) << this->name() << " Running!";
   ocl_runtime_->SetKernelArg(kernel_, 0, out_tensors_[0]->data_c());
@@ -44,10 +44,13 @@ class DepthwiseConv2dOpenCLKernel : public OpenCLKernel {
   int InitBias();
   void SetConstArgs() override;
   void SetGlobalLocal() override;
+  int StoreConstData() override;
  private:
   void *packed_weight_{nullptr};
+  void *stored_weight_{nullptr};
   void *bias_data_{nullptr};
+  void *stored_bias_{nullptr};
   struct {
     int H{2};
     int W{2};
@@ -140,8 +140,9 @@ int FullConnectionOpenCLKernel::InitFilter() {
   auto padWeightFp32 = reinterpret_cast<float *>(padWeight_);
   auto padWeightFp16 = reinterpret_cast<float16_t *>(padWeight_);
   memset(padWeight_, 0x00, nhw_remainder * intensor_shape.Slice * co4 * C4NUM * C4NUM * dtype_size);
-  auto originWeightFp32 = reinterpret_cast<float *>(in_tensors_.at(kWeightIndex)->data_c());
-  auto originWeightFp16 = reinterpret_cast<float16_t *>(in_tensors_.at(kWeightIndex)->data_c());
+  void *src_data = stored_weight_ == nullptr ? in_tensors_.at(kWeightIndex)->data_c() : stored_weight_;
+  auto originWeightFp32 = reinterpret_cast<float *>(src_data);
+  auto originWeightFp16 = reinterpret_cast<float16_t *>(src_data);
   bool isModelFp16 = in_tensors_.at(kWeightIndex)->data_type() == kNumberTypeFloat16;
   // pad weight
@@ -182,6 +183,7 @@ int FullConnectionOpenCLKernel::InitFilter() {
     }
   }
   allocator->UnmapBuffer(padWeight_);
+  FreeStoredData(stored_weight_);
   return RET_OK;
 }
@@ -202,19 +204,21 @@ int FullConnectionOpenCLKernel::InitBias() {
   bias_ = allocator->MapBuffer(bias_, CL_MAP_WRITE, nullptr, true);
   memset(bias_, 0x00, co4 * C4NUM * dtype_size);
   if (in_tensors_.size() == 3) {
-    if (in_tensors_[2]->data_type() == kNumberTypeFloat32 && enable_fp16_) {
+    void *src_data = stored_bias_ == nullptr ? in_tensors_.at(kBiasIndex)->data_c() : stored_bias_;
+    if (in_tensors_[kBiasIndex]->data_type() == kNumberTypeFloat32 && enable_fp16_) {
       for (int i = 0; i < CO_; i++) {
-        reinterpret_cast<float16_t *>(bias_)[i] = reinterpret_cast<float *>(in_tensors_[2]->data_c())[i];
+        reinterpret_cast<float16_t *>(bias_)[i] = reinterpret_cast<float *>(src_data)[i];
       }
-    } else if (in_tensors_[2]->data_type() == kNumberTypeFloat16 && !enable_fp16_) {
+    } else if (in_tensors_[kBiasIndex]->data_type() == kNumberTypeFloat16 && !enable_fp16_) {
       for (int i = 0; i < CO_; i++) {
-        reinterpret_cast<float *>(bias_)[i] = reinterpret_cast<float16_t *>(in_tensors_[2]->data_c())[i];
+        reinterpret_cast<float *>(bias_)[i] = reinterpret_cast<float16_t *>(src_data)[i];
       }
     } else {
-      memcpy(bias_, in_tensors_[2]->data_c(), CO_ * dtype_size);
+      memcpy(bias_, src_data, CO_ * dtype_size);
     }
   }
   allocator->UnmapBuffer(bias_);
+  FreeStoredData(stored_bias_);
   return RET_OK;
 }
@@ -244,6 +248,24 @@ void FullConnectionOpenCLKernel::SetConstArgs() {
   ocl_runtime_->SetKernelArg(kernel_, arg_count, static_cast<cl_int>(param->act_type_));
 }
+int FullConnectionOpenCLKernel::StoreConstData() {
+  if (!op_parameter_->infer_flag_) {
+    stored_weight_ = StoreTensorData(in_tensors_.at(kWeightIndex));
+    if (stored_weight_ == nullptr) {
+      MS_LOG(ERROR) << "Store weight failed.";
+      return RET_ERROR;
+    }
+    if (in_tensors_.size() > kBiasIndex) {
+      stored_bias_ = StoreTensorData(in_tensors_.at(kBiasIndex));
+      if (stored_bias_ == nullptr) {
+        MS_LOG(ERROR) << "Store bias failed.";
+        return RET_ERROR;
+      }
+    }
+  }
+  return RET_OK;
+}
 int FullConnectionOpenCLKernel::Run() {
   MS_LOG(DEBUG) << this->name() << " Running!";
   int arg_count = 0;
@@ -36,12 +36,15 @@ class FullConnectionOpenCLKernel : public OpenCLKernel {
   void SetConstArgs() override;
   void SetGlobalLocal() override;
   int Tune() override { return lite::RET_OK; }
+  int StoreConstData() override;
  private:
   int InitFilter();
   int InitBias();
   void *padWeight_{nullptr};
   void *bias_{nullptr};
+  void *stored_weight_{nullptr};
+  void *stored_bias_{nullptr};
   bool enable_fp16_{false};
   bool transposeA{false};
   bool transposeB{true};
@@ -136,8 +136,9 @@ int MatMulOpenCLKernel::InitWeights() {
   auto padWeightFp32 = reinterpret_cast<float *>(padWeight_);
   auto padWeightFp16 = reinterpret_cast<float16_t *>(padWeight_);
   memset(padWeight_, 0x00, a * b * ci4 * co4 * C4NUM * C4NUM * dtype_size);
-  auto originWeightFp32 = reinterpret_cast<float *>(in_tensors_.at(kWeightIndex)->data_c());
-  auto originWeightFp16 = reinterpret_cast<float16_t *>(in_tensors_.at(kWeightIndex)->data_c());
+  void *src_data = stored_weight_ == nullptr ? in_tensors_.at(kWeightIndex)->data_c() : stored_weight_;
+  auto originWeightFp32 = reinterpret_cast<float *>(src_data);
+  auto originWeightFp16 = reinterpret_cast<float16_t *>(src_data);
   bool isModelFp16 = in_tensors_.at(kWeightIndex)->data_type() == kNumberTypeFloat16;
   // pad weight
   // ABCICO -> AB(CI4)(CO4)(4 from CO)(4 from CI)
@@ -181,6 +182,7 @@ int MatMulOpenCLKernel::InitWeights() {
   }
   allocator->UnmapBuffer(padWeight_);
+  FreeStoredData(stored_weight_);
   return InitBias();
 }
@@ -202,19 +204,21 @@ int MatMulOpenCLKernel::InitBias() {
   bias_ = allocator->MapBuffer(bias_, CL_MAP_WRITE, nullptr, true);
   memset(bias_, 0x00, co4 * C4NUM * dtype_size);
   if (in_tensors_.size() == 3) {
-    if (in_tensors_[2]->data_type() == kNumberTypeFloat32 && enable_fp16_) {
+    void *src_data = stored_bias_ == nullptr ? in_tensors_.at(kBiasIndex)->data_c() : stored_bias_;
+    if (in_tensors_[kBiasIndex]->data_type() == kNumberTypeFloat32 && enable_fp16_) {
      for (int i = 0; i < CO_; i++) {
-        reinterpret_cast<float16_t *>(bias_)[i] = reinterpret_cast<float *>(in_tensors_[2]->data_c())[i];
+        reinterpret_cast<float16_t *>(bias_)[i] = reinterpret_cast<float *>(src_data)[i];
       }
-    } else if (in_tensors_[2]->data_type() == kNumberTypeFloat16 && !enable_fp16_) {
+    } else if (in_tensors_[kBiasIndex]->data_type() == kNumberTypeFloat16 && !enable_fp16_) {
       for (int i = 0; i < CO_; i++) {
-        reinterpret_cast<float *>(bias_)[i] = reinterpret_cast<float16_t *>(in_tensors_[2]->data_c())[i];
+        reinterpret_cast<float *>(bias_)[i] = reinterpret_cast<float16_t *>(src_data)[i];
       }
     } else {
-      memcpy(bias_, in_tensors_[2]->data_c(), CO_ * dtype_size);
+      memcpy(bias_, src_data, CO_ * dtype_size);
     }
   }
   allocator->UnmapBuffer(bias_);
+  FreeStoredData(stored_bias_);
   return RET_OK;
 }
@@ -254,6 +258,24 @@ int MatMulOpenCLKernel::Run() {
   return RET_OK;
 }
+int MatMulOpenCLKernel::StoreConstData() {
+  if (!op_parameter_->infer_flag_) {
+    stored_weight_ = StoreTensorData(in_tensors_.at(kWeightIndex));
+    if (stored_weight_ == nullptr) {
+      MS_LOG(ERROR) << "Store weight failed.";
+      return RET_ERROR;
+    }
+    if (in_tensors_.size() > kBiasIndex) {
+      stored_bias_ = StoreTensorData(in_tensors_.at(kBiasIndex));
+      if (stored_bias_ == nullptr) {
+        MS_LOG(ERROR) << "Store bias failed.";
+        return RET_ERROR;
+      }
+    }
+  }
+  return RET_OK;
+}
 kernel::LiteKernel *OpenCLMatMulKernelCreator(const std::vector<lite::Tensor *> &inputs,
                                               const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
                                               const lite::Context *ctx, const kernel::KernelKey &desc) {
@@ -274,6 +296,12 @@ kernel::LiteKernel *OpenCLMatMulKernelCreator(const std::vector<lite::Tensor *>
   }
   if (!infer_shape_done) {
     MS_LOG(WARNING) << "kernel don't infer shape yet!";
+    auto ret = reinterpret_cast<MatMulOpenCLKernel *>(kernel)->StoreConstData();
+    if (ret != mindspore::lite::RET_OK) {
+      MS_LOG(ERROR) << "Store " << opParameter->name_ << " const data failed!";
+      delete kernel;
+      return nullptr;
+    }
     return kernel;
   }
   if (kernel->CheckSpecs() != RET_OK || kernel->OpenCLKernel::CheckSpecs() != RET_OK) {
@@ -38,6 +38,7 @@ class MatMulOpenCLKernel : public OpenCLKernel {
   void SetGlobalLocal() override;
   int Tune() override { return lite::RET_OK; }
   int InitBias();
+  int StoreConstData() override;
  protected:
   void *padWeight_{nullptr};
@@ -47,6 +48,8 @@ class MatMulOpenCLKernel : public OpenCLKernel {
   int dims{};
   void *bias_{nullptr};
   int CO_{1};
+  void *stored_weight_{nullptr};
+  void *stored_bias_{nullptr};
   static constexpr int MAX_DIMS{4};  // max supported matmul dims
   bool act_weight_{false};
   std::vector<int> inShape{std::vector<int>(MAX_DIMS, 1)};
@@ -108,12 +108,13 @@ void WinogradOpenCLKernel::InitFilter() {
   // rearrange filter
   auto filter_tensor = in_tensors_.at(1);
+  void *src_filter_data = stored_filter_ == nullptr ? filter_tensor->data_c() : stored_filter_;
 #ifndef ENABLE_ARM64
-  auto winograd_filter = GenerateWinogradFilter(filter_tensor->data_c(), filter_tensor->data_type(), CO_, CI_);
+  auto winograd_filter = GenerateWinogradFilter(src_filter_data, filter_tensor->data_type(), CO_, CI_);
   void *src_data = winograd_filter.data();
 #else
   std::unique_ptr<float[]> winograd_filter(new float[CO_ * 6 * 6 * CI_]);
-  WinogradWeightTransform(reinterpret_cast<const float *>(filter_tensor->data_c()),
+  WinogradWeightTransform(reinterpret_cast<const float *>(src_filter_data),
                           reinterpret_cast<float *>(winograd_filter.get()), nullptr, Gt, 1, 6, 3, CI_, CO_, false);
   void *src_data = winograd_filter.get();
@@ -136,6 +137,7 @@ void WinogradOpenCLKernel::InitFilter() {
     memcpy(packed_filter_, tmp.data(), size);
     allocator->UnmapBuffer(packed_filter_);
   }
+  FreeStoredData(stored_filter_);
 }
 void WinogradOpenCLKernel::AllocateMemory() {
@@ -195,6 +195,7 @@ class OpenCLKernel : public LiteKernel {
   virtual std::vector<BaseTuningParameter> GenerateTuningParam();
   virtual int AssignTuningParam(const BaseTuningParameter &param);
   virtual int Tune();
+  virtual int StoreConstData() { return RET_OK; }
   int GetImageSize(size_t idx, lite::opencl::ImageSize *img_size);
   void PrintOutput(int print_num = 10, const std::string &out_file = "");
@@ -259,6 +260,12 @@ kernel::LiteKernel *OpenCLKernelCreator(const std::vector<lite::Tensor *> &input
     delete kernel;
     return nullptr;
   }
+  ret = reinterpret_cast<OpenCLKernel *>(kernel)->StoreConstData();
+  if (ret != mindspore::lite::RET_OK) {
+    MS_LOG(ERROR) << "Store " << opParameter->name_ << " const data failed!";
+    delete kernel;
+    return nullptr;
+  }
   return kernel;
 }
 }  // namespace mindspore::kernel
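With the RET_OK default above, OpenCLKernelCreator can call StoreConstData() unconditionally: kernels without const inputs inherit the no-op, and kernels that pack weights override it. A minimal override following the pattern repeated by the kernels in this patch (the class name is hypothetical):

    class SomeWeightedOpenCLKernel : public OpenCLKernel {  // hypothetical kernel with one const weight
     public:
      int StoreConstData() override {
        if (!op_parameter_->infer_flag_) {  // shape inference deferred: origin data may be freed early
          stored_weight_ = StoreTensorData(in_tensors_.at(kWeightIndex));
          if (stored_weight_ == nullptr) {
            MS_LOG(ERROR) << "Store weight failed.";
            return RET_ERROR;
          }
        }
        return RET_OK;
      }

     private:
      void *stored_weight_{nullptr};  // released via FreeStoredData() once packing is done
    };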
@@ -301,29 +301,22 @@ int CheckParamLikeTensor(const std::string &kernel_name, const std::string &tensor_name, lite::Tensor *tensor,
   return RET_OK;
 }
-static std::set<void *> tmp_weights;
-void StoreTmpWeight(lite::Tensor *tensor) {
-  MS_LOG(WARNING) << "store weight when kernel don't infer shape!";
+void *StoreTensorData(lite::Tensor *tensor) {
   if ((tensor != nullptr) && (tensor->data_c() != nullptr) && (tensor->Size() > 0)) {
-    void *new_data = malloc(tensor->Size());
-    MS_ASSERT(new_data);
-    if (new_data == nullptr) {
-      return;
+    void *stored_data = malloc(tensor->Size());
+    if (stored_data == nullptr) {
+      MS_LOG(ERROR) << "StoreTensorData Malloc Failed.";
+      return nullptr;
     }
-    memcpy(new_data, tensor->data_c(), tensor->Size());
-    tensor->set_data(new_data);
-    tmp_weights.insert(new_data);
+    memcpy(stored_data, tensor->data_c(), tensor->Size());
+    return stored_data;
   }
+  return nullptr;
 }
-void FreeTmpWeight(lite::Tensor *tensor) {
-  MS_ASSERT(tensor != nullptr);
-  auto data = tensor->data_c();
-  if (tmp_weights.count(data)) {
-    tmp_weights.erase(data);
+void FreeStoredData(void *data) {
+  if (data != nullptr) {
     free(data);
-    tensor->set_data(nullptr);
   }
 }
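Together the pair replaces StoreTmpWeight/FreeTmpWeight, which rewrote the tensor via set_data() and tracked ownership in the global tmp_weights set; ownership of the copy is now explicit in each kernel, so no global registry is needed. A hedged sketch of the intended lifecycle (the driver function is illustrative, not part of the patch):

    // Illustrative only: how a kernel's deferred Init*() consumes the stored copy.
    int PackWeightSketch(lite::Tensor *weight_tensor, void *stored /* from StoreTensorData() */) {
      // Prefer the private copy: the origin model buffer may already be freed.
      void *src = stored == nullptr ? weight_tensor->data_c() : stored;
      if (src == nullptr) {
        return mindspore::lite::RET_ERROR;
      }
      // ... repack `src` into the OpenCL buffer here ...
      FreeStoredData(stored);  // copy is no longer needed once packed; no-op on nullptr
      return mindspore::lite::RET_OK;
    }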
@@ -64,8 +64,9 @@ void PackNHWCToNHWC4(void *src, void *dst, bool src_is_fp16, bool dst_is_fp16, c
 int CheckParamLikeTensor(const std::string &kernel_name, const std::string &tensor_name, lite::Tensor *tensor,
                          TypeId expect_data_type, const std::vector<int> &expect_shape);
-void StoreTmpWeight(lite::Tensor *tensor);
-void FreeTmpWeight(lite::Tensor *tensor);
+void *StoreTensorData(lite::Tensor *tensor);
+void FreeStoredData(void *data);
 std::vector<std::string> CreateBuildOptionsExtByDType(TypeId type_id);
@@ -271,13 +271,13 @@ int CastConstTensorsData(const std::vector<Tensor *> &tensors, std::map<Tensor *
     if (tensor->data_type() == kNumberTypeFloat32 && dst_data_type == kNumberTypeFloat16) {
       auto ret = CastConstTensorData(tensor, restored_origin_tensors, kNumberTypeFloat16);
       if (ret != RET_OK) {
-        MS_LOG(ERROR) << "Cast const tensor from fp32 to fp16 failed, tensor name : " << tensor->tensor_name();
+        MS_LOG(DEBUG) << "Cast const tensor from fp32 to fp16 failed, tensor name : " << tensor->tensor_name();
         return ret;
       }
     } else if (tensor->data_type() == kNumberTypeFloat16 && dst_data_type == kNumberTypeFloat32) {
       auto ret = CastConstTensorData(tensor, restored_origin_tensors, kNumberTypeFloat32);
       if (ret != RET_OK) {
-        MS_LOG(ERROR) << "Cast const tensor from fp16 to fp32 failed, tensor name : " << tensor->tensor_name();
+        MS_LOG(DEBUG) << "Cast const tensor from fp16 to fp32 failed, tensor name : " << tensor->tensor_name();
         return ret;
       }
     } else {