From 6b05157ed681856958755487cf71aadc0b97838c Mon Sep 17 00:00:00 2001
From: yeyunpeng2020
Date: Thu, 29 Apr 2021 09:26:03 +0800
Subject: [PATCH] store weight and bias to conv etc.

---
 mindspore/lite/src/lite_session.cc                 | 17 +++++---
 mindspore/lite/src/lite_session.h                  |  2 +-
 .../src/runtime/gpu/opencl/opencl_runtime.cc       |  2 -
 .../src/runtime/gpu/opencl/opencl_runtime.h        |  1 -
 .../runtime/kernel/opencl/kernel/conv2d.cc         | 42 ++++++++++++++-----
 .../src/runtime/kernel/opencl/kernel/conv2d.h      |  4 ++
 .../kernel/opencl/kernel/conv2d_transpose.cc       | 29 +++++++++++--
 .../kernel/opencl/kernel/conv2d_transpose.h        |  3 ++
 .../kernel/opencl/kernel/depthwise_conv2d.cc       | 25 ++++++++++-
 .../kernel/opencl/kernel/depthwise_conv2d.h        |  3 ++
 .../kernel/opencl/kernel/fullconnection.cc         | 36 ++++++++++++----
 .../kernel/opencl/kernel/fullconnection.h          |  3 ++
 .../runtime/kernel/opencl/kernel/matmul.cc         | 42 +++++++++++++++----
 .../src/runtime/kernel/opencl/kernel/matmul.h      |  3 ++
 .../runtime/kernel/opencl/kernel/winograd.cc       |  6 ++-
 .../src/runtime/kernel/opencl/opencl_kernel.h      |  7 ++++
 .../lite/src/runtime/kernel/opencl/utils.cc        | 27 +++++-------
 .../lite/src/runtime/kernel/opencl/utils.h         |  5 ++-
 mindspore/lite/src/scheduler.cc                    |  4 +-
 19 files changed, 198 insertions(+), 63 deletions(-)

diff --git a/mindspore/lite/src/lite_session.cc b/mindspore/lite/src/lite_session.cc
index ab061e1fa1..05439069b6 100644
--- a/mindspore/lite/src/lite_session.cc
+++ b/mindspore/lite/src/lite_session.cc
@@ -347,11 +347,16 @@ void LiteSession::InitGraphInOutTensors(const lite::Model *model) {
   }
 }
 
-void LiteSession::FreePackOpWeight() {
-  for (auto *kernel : kernels_) {
+void LiteSession::FreePackOpWeight(const std::vector<kernel::LiteKernel *> &kernels) {
+  for (auto *kernel : kernels) {
     MS_ASSERT(kernel != nullptr);
-    if (!IsPackedOp(kernel->Type())) {
-      continue;
+    if (kernel->subgraph_type() == kernel::kNotSubGraph) {
+      if (!IsPackedOp(kernel->Type())) {
+        continue;
+      }
+    } else {
+      auto subgraph = reinterpret_cast<kernel::SubGraphKernel *>(kernel);
+      FreePackOpWeight(subgraph->nodes());
     }
     auto inputs = kernel->in_tensors();
     for (auto *tensor : inputs) {
@@ -444,8 +449,10 @@ int LiteSession::CompileGraph(Model *model) {
     is_running_.store(false);
     return ret;
   }
+#ifndef SUPPORT_TRAIN
   // For reducing runtime RAM, free packop weight because packop will pack weight and will not access to origin weight
-  FreePackOpWeight();
+  FreePackOpWeight(kernels_);
+#endif
   is_running_.store(false);
   return RET_OK;
 }  // namespace lite
diff --git a/mindspore/lite/src/lite_session.h b/mindspore/lite/src/lite_session.h
index 43d8bf8cb1..ff4b75a0e2 100644
--- a/mindspore/lite/src/lite_session.h
+++ b/mindspore/lite/src/lite_session.h
@@ -106,7 +106,7 @@ class LiteSession : public session::LiteSession {
 
   static int ReSizeKernels(const std::vector<kernel::LiteKernel *> &kernels);
 
-  void FreePackOpWeight();
+  static void FreePackOpWeight(const std::vector<kernel::LiteKernel *> &kernels);
 
  private:
   void ResetInputsShape(const std::vector<std::vector<int>> &dims);
diff --git a/mindspore/lite/src/runtime/gpu/opencl/opencl_runtime.cc b/mindspore/lite/src/runtime/gpu/opencl/opencl_runtime.cc
index 71bcf22f43..90f8068fdd 100644
--- a/mindspore/lite/src/runtime/gpu/opencl/opencl_runtime.cc
+++ b/mindspore/lite/src/runtime/gpu/opencl/opencl_runtime.cc
@@ -368,8 +368,6 @@ bool OpenCLRuntime::SetFp16Enable(bool enable) {
   return fp16_enable_ == enable;
 }
 
-bool OpenCLRuntime::IsSupportFloat16() { return support_fp16_; }
-
 int OpenCLRuntime::BuildKernel(const cl::Kernel &kernel, const std::string &program_name,
                                const std::string &kernel_name, const std::vector<std::string> &build_options_ext) {
   std::string build_option = default_build_option_;
diff --git a/mindspore/lite/src/runtime/gpu/opencl/opencl_runtime.h b/mindspore/lite/src/runtime/gpu/opencl/opencl_runtime.h
index c791fda7ab..d67378c025 100644
--- a/mindspore/lite/src/runtime/gpu/opencl/opencl_runtime.h
+++ b/mindspore/lite/src/runtime/gpu/opencl/opencl_runtime.h
@@ -70,7 +70,6 @@ class OpenCLRuntime {
   GpuInfo GetGpuInfo();
   bool GetFp16Enable() const;
   bool SetFp16Enable(bool enable);
-  bool IsSupportFloat16();
   bool GetSVMEnable() const { return svm_enable_; }
   void SetSVMEnable(bool enable) { svm_enable_ = enable; }
   const std::vector<size_t> &GetWorkItemSize() const { return max_work_item_sizes_; }
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.cc
index 78270c3a09..274767d269 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.cc
@@ -260,7 +260,7 @@ void Conv2DOpenCLKernel::InitFilter() {
 
   // rearrange filter
   auto filter_tensor = in_tensors_.at(1);
-  void *src_data = filter_tensor->data_c();
+  void *src_data = stored_filter_ == nullptr ? filter_tensor->data_c() : stored_filter_;
   auto src_dtype = filter_tensor->data_type();
   auto dst_dtype = use_fp16_ ? kNumberTypeFloat16 : kNumberTypeFloat32;
   std::vector<char> tmp(size, 0);
@@ -279,7 +279,7 @@ void Conv2DOpenCLKernel::InitFilter() {
     allocator->UnmapBuffer(packed_filter_);
   }
 
-  FreeTmpWeight(in_tensors_.at(kWeightIndex));
+  FreeStoredData(stored_filter_);
 }
 
 void Conv2DOpenCLKernel::InitBias() {
@@ -287,6 +287,7 @@ void Conv2DOpenCLKernel::InitBias() {
 
   // align bias from C to C4
   auto bias_tensor = in_tensors_.at(2);
+  void *src_data = stored_bias_ == nullptr ? bias_tensor->data_c() : stored_bias_;
   size_t packed_bias_size = UP_ROUND(CO_SLICES_, block_size_.C) * CO_TILE * sizeof_FLT_;
   packed_bias_ = allocator->Malloc(packed_bias_size, lite::opencl::MemType::BUF);
 
@@ -294,10 +295,10 @@ void Conv2DOpenCLKernel::InitBias() {
   memset(packed_bias_, 0x00, packed_bias_size);
   if (bias_tensor->data_type() == kNumberTypeFloat16) {
     if (use_fp16_) {
-      memcpy(packed_bias_, bias_tensor->data_c(), CO_ * sizeof_FLT_);
+      memcpy(packed_bias_, src_data, CO_ * sizeof_FLT_);
     } else {
       auto packed_bias_fp32 = reinterpret_cast<float *>(packed_bias_);
-      auto origin_bias_fp16 = reinterpret_cast<float16_t *>(bias_tensor->data_c());
+      auto origin_bias_fp16 = reinterpret_cast<float16_t *>(src_data);
       MS_ASSERT(origin_bias_fp16);
       for (int i = 0; i < CO_; ++i) {
         packed_bias_fp32[i] = static_cast<float>(origin_bias_fp16[i]);
@@ -306,17 +307,17 @@ void Conv2DOpenCLKernel::InitBias() {
   } else {
     if (use_fp16_) {
       auto packed_bias_fp16 = reinterpret_cast<float16_t *>(packed_bias_);
-      auto origin_bias_fp32 = reinterpret_cast<float *>(bias_tensor->data_c());
+      auto origin_bias_fp32 = reinterpret_cast<float *>(src_data);
       MS_ASSERT(origin_bias_fp32);
       for (int i = 0; i < CO_; ++i) {
         packed_bias_fp16[i] = static_cast<float16_t>(origin_bias_fp32[i]);
       }
     } else {
-      memcpy(packed_bias_, bias_tensor->data_c(), CO_ * sizeof_FLT_);
+      memcpy(packed_bias_, src_data, CO_ * sizeof_FLT_);
     }
   }
   allocator->UnmapBuffer(packed_bias_);
-  FreeTmpWeight(in_tensors_.at(kBiasIndex));
+  FreeStoredData(stored_bias_);
 }
 
 void Conv2DOpenCLKernel::SetConstArgs() {
@@ -403,6 +404,24 @@ std::vector<BaseTuningParameter> Conv2DOpenCLKernel::GenerateTuningParam() {
   return tuning_params;
 }
 
+int Conv2DOpenCLKernel::StoreConstData() {
+  if (!op_parameter_->infer_flag_) {
+    stored_filter_ = StoreTensorData(in_tensors_.at(kWeightIndex));
+    if (stored_filter_ == nullptr) {
+      MS_LOG(ERROR) << "Store weight failed.";
+      return RET_ERROR;
+    }
+    if (in_tensors_.size() > kBiasIndex) {
+      stored_bias_ = StoreTensorData(in_tensors_.at(kBiasIndex));
+      if (stored_bias_ == nullptr) {
+        MS_LOG(ERROR) << "Store bias failed.";
+        return RET_ERROR;
+      }
+    }
+  }
+  return RET_OK;
+}
+
 bool UseFcReplaceConv(const std::vector<lite::Tensor *> &inputs, const std::vector<lite::Tensor *> &outputs,
                       ConvParameter *param) {
   MS_ASSERT(param);
@@ -528,11 +547,12 @@ kernel::LiteKernel *OpenCLConv2DCreator(const std::vector<lite::Tensor *> &inputs,
     }
   }
   if (!infer_shape_done) {
-    StoreTmpWeight(inputs.at(kWeightIndex));
-    if (inputs.size() > kBiasIndex) {
-      StoreTmpWeight(inputs.at(kBiasIndex));
+    auto ret = reinterpret_cast<Conv2DOpenCLKernel *>(kernel)->StoreConstData();
+    if (ret != mindspore::lite::RET_OK) {
+      MS_LOG(ERROR) << "Store " << opParameter->name_ << " const data failed!";
+      delete kernel;
+      return nullptr;
     }
-    MS_LOG(WARNING) << "kernel don't infer shape yet!";
     return kernel;
   }
   if (kernel->CheckSpecs() != RET_OK || kernel->OpenCLKernel::CheckSpecs() != RET_OK) {
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.h
index 7f20d14932..63fd3942ab 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.h
@@ -58,6 +58,8 @@ class Conv2DOpenCLKernel : public OpenCLKernel {
   void SetGlobalLocal() override;
   int Run() override;
 
+  int StoreConstData() override;
+
   std::string Key() override {
     auto key = OpenCLKernel::Key();
     key += "_" + std::to_string(KH_) + "_" + std::to_string(KW_) + "_" + std::to_string(param_->stride_h_) + "_" +
@@ -94,7 +96,9 @@ class Conv2DOpenCLKernel : public OpenCLKernel {
   int KH_{};
   int KW_{};
   void *packed_filter_{nullptr};
+  void *stored_filter_{nullptr};
   void *packed_bias_{nullptr};
+  void *stored_bias_{nullptr};
   MemType filter_type_{MemType::BUF};
   bool has_bias_{false};
   int TILE_HW_{};
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc
index ab1c55658f..a4acdc1d9a 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc
@@ -148,7 +148,7 @@ int Conv2dTransposeOpenCLKernel::InitFilter() {
   padWeight_ = allocator->Malloc(div_ci * div_co * C4NUM * C4NUM * kh * kw * data_size, lite::opencl::MemType::BUF);
   padWeight_ = allocator->MapBuffer(padWeight_, CL_MAP_WRITE, nullptr, true);
   memset(padWeight_, 0x00, div_ci * div_co * C4NUM * C4NUM * kh * kw * data_size);
-  auto origin_weight = in_tensors_.at(kWeightIndex)->data_c();
+  auto origin_weight = stored_weight_ == nullptr ? in_tensors_.at(kWeightIndex)->data_c() : stored_weight_;
   auto weight_dtype = in_tensors_.at(kWeightIndex)->data_type();
   int index = 0;
   for (int co_i = 0; co_i < div_co; co_i++) {
@@ -188,6 +188,7 @@ int Conv2dTransposeOpenCLKernel::InitFilter() {
     }
   }
   allocator->UnmapBuffer(padWeight_);
+  FreeStoredData(stored_weight_);
   return RET_OK;
 }
 
@@ -209,20 +210,22 @@ int Conv2dTransposeOpenCLKernel::InitBias() {
   bias_ = allocator->MapBuffer(bias_, CL_MAP_WRITE, nullptr, true);
   memset(bias_, 0x00, div_co * C4NUM * data_size);
   if (in_tensors_.size() == 3) {
+    void *src_data = stored_bias_ == nullptr ? in_tensors_.at(kBiasIndex)->data_c() : stored_bias_;
     auto bias_dtype = in_tensors_[2]->data_type();
     if (bias_dtype == kNumberTypeFloat32 && enable_fp16_) {
       for (int i = 0; i < co; i++) {
-        reinterpret_cast<float16_t *>(bias_)[i] = reinterpret_cast<float *>(in_tensors_[2]->data_c())[i];
+        reinterpret_cast<float16_t *>(bias_)[i] = reinterpret_cast<float *>(src_data)[i];
       }
     } else if (bias_dtype == kNumberTypeFloat16 && !enable_fp16_) {
       for (int i = 0; i < co; i++) {
-        reinterpret_cast<float *>(bias_)[i] = reinterpret_cast<float16_t *>(in_tensors_[2]->data_c())[i];
+        reinterpret_cast<float *>(bias_)[i] = reinterpret_cast<float16_t *>(src_data)[i];
       }
     } else {
-      memcpy(bias_, in_tensors_[2]->data_c(), co * data_size);
+      memcpy(bias_, src_data, co * data_size);
     }
   }
   allocator->UnmapBuffer(bias_);
+  FreeStoredData(stored_bias_);
   return RET_OK;
 }
 
@@ -243,6 +246,24 @@ int Conv2dTransposeOpenCLKernel::InferShape() {
   return RET_OK;
 }
 
+int Conv2dTransposeOpenCLKernel::StoreConstData() {
+  if (!op_parameter_->infer_flag_) {
+    stored_weight_ = StoreTensorData(in_tensors_.at(kWeightIndex));
+    if (stored_weight_ == nullptr) {
+      MS_LOG(ERROR) << "Store weight failed.";
+      return RET_ERROR;
+    }
+    if (in_tensors_.size() > kBiasIndex) {
+      stored_bias_ = StoreTensorData(in_tensors_.at(kBiasIndex));
+      if (stored_bias_ == nullptr) {
+        MS_LOG(ERROR) << "Store bias failed.";
+        return RET_ERROR;
+      }
+    }
+  }
+  return RET_OK;
+}
+
 kernel::LiteKernel *OpenCLConv2dTransposeCreator(const std::vector<lite::Tensor *> &inputs,
                                                  const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
                                                  const lite::Context *ctx, const kernel::KernelKey &desc) {
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.h
index b1c1c78922..d3582bd4ad 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.h
@@ -39,10 +39,13 @@ class Conv2dTransposeOpenCLKernel : public OpenCLKernel {
   void SetConstArgs() override;
   void SetGlobalLocal() override;
   int InferShape() override;
+  int StoreConstData() override;
 
  private:
   void *padWeight_{nullptr};
   void *bias_{nullptr};
+  void *stored_weight_{nullptr};
+  void *stored_bias_{nullptr};
   bool enable_fp16_{false};
 };
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc
index 936abed116..d1ba226817 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc
@@ -110,7 +110,7 @@ int DepthwiseConv2dOpenCLKernel::InitWeights() {
   size_t dtype_size = is_fp16 ? sizeof(int16_t) : sizeof(float);
   auto out_info = GpuTensorInfo(out_tensors_[0]);
   // weight: o, h, w, i; o == group, i == 1
-  void *origin_weight = in_tensors_.at(kWeightIndex)->data_c();
+  void *origin_weight = stored_weight_ == nullptr ? in_tensors_.at(kWeightIndex)->data_c() : stored_weight_;
   int CO4 = UP_DIV(out_info.C, C4NUM * block_size_.C);
   int pack_weight_size = C4NUM * CO4 * parameter->kernel_h_ * parameter->kernel_w_;
 
@@ -162,6 +162,7 @@ int DepthwiseConv2dOpenCLKernel::InitWeights() {
   if (packed_weight_ == nullptr) {
     return RET_ERROR;
   }
+  FreeStoredData(stored_weight_);
   return mindspore::lite::RET_OK;
 }
 
@@ -196,12 +197,14 @@ int DepthwiseConv2dOpenCLKernel::InitBias() {
     src_type = in_tensors_.at(kBiasIndex)->data_type();
     dst_type = is_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32;
     auto element_size = in_tensors_.at(kBiasIndex)->ElementsNum();
-    ConvertBias(in_tensors_.at(kBiasIndex)->data_c(), temp_bias.data(), element_size, dtype_size, src_type, dst_type);
+    void *src_data = stored_bias_ == nullptr ? in_tensors_.at(kBiasIndex)->data_c() : stored_bias_;
+    ConvertBias(src_data, temp_bias.data(), element_size, dtype_size, src_type, dst_type);
   }
   bias_data_ = allocator->Malloc(bias_size, temp_bias.data());
   if (bias_data_ == nullptr) {
     return RET_ERROR;
   }
+  FreeStoredData(stored_bias_);
   return mindspore::lite::RET_OK;
 }
 
@@ -250,6 +253,24 @@ void DepthwiseConv2dOpenCLKernel::SetGlobalLocal() {
   OpenCLKernel::AlignGlobalLocal(global_size_, local_size_);
 }
 
+int DepthwiseConv2dOpenCLKernel::StoreConstData() {
+  if (!op_parameter_->infer_flag_) {
+    stored_weight_ = StoreTensorData(in_tensors_.at(kWeightIndex));
+    if (stored_weight_ == nullptr) {
+      MS_LOG(ERROR) << "Store weight failed.";
+      return RET_ERROR;
+    }
+    if (in_tensors_.size() > kBiasIndex) {
+      stored_bias_ = StoreTensorData(in_tensors_.at(kBiasIndex));
+      if (stored_bias_ == nullptr) {
+        MS_LOG(ERROR) << "Store bias failed.";
+        return RET_ERROR;
+      }
+    }
+  }
+  return RET_OK;
+}
+
 int DepthwiseConv2dOpenCLKernel::Run() {
   MS_LOG(DEBUG) << this->name() << " Running!";
   ocl_runtime_->SetKernelArg(kernel_, 0, out_tensors_[0]->data_c());
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.h
index 706f755329..37ff4962d7 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.h
@@ -44,10 +44,13 @@ class DepthwiseConv2dOpenCLKernel : public OpenCLKernel {
   int InitBias();
   void SetConstArgs() override;
   void SetGlobalLocal() override;
+  int StoreConstData() override;
 
  private:
   void *packed_weight_{nullptr};
+  void *stored_weight_{nullptr};
   void *bias_data_{nullptr};
+  void *stored_bias_{nullptr};
   struct {
     int H{2};
     int W{2};
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.cc
index bd6ff8f9a2..1313f0f55b 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.cc
@@ -140,8 +140,9 @@ int FullConnectionOpenCLKernel::InitFilter() {
   auto padWeightFp32 = reinterpret_cast<float *>(padWeight_);
   auto padWeightFp16 = reinterpret_cast<float16_t *>(padWeight_);
   memset(padWeight_, 0x00, nhw_remainder * intensor_shape.Slice * co4 * C4NUM * C4NUM * dtype_size);
-  auto originWeightFp32 = reinterpret_cast<float *>(in_tensors_.at(kWeightIndex)->data_c());
-  auto originWeightFp16 = reinterpret_cast<float16_t *>(in_tensors_.at(kWeightIndex)->data_c());
+  void *src_data = stored_weight_ == nullptr ? in_tensors_.at(kWeightIndex)->data_c() : stored_weight_;
+  auto originWeightFp32 = reinterpret_cast<float *>(src_data);
+  auto originWeightFp16 = reinterpret_cast<float16_t *>(src_data);
   bool isModelFp16 = in_tensors_.at(kWeightIndex)->data_type() == kNumberTypeFloat16;
 
   // pad weight
@@ -182,6 +183,7 @@ int FullConnectionOpenCLKernel::InitFilter() {
     }
   }
   allocator->UnmapBuffer(padWeight_);
+  FreeStoredData(stored_weight_);
   return RET_OK;
 }
 
@@ -202,19 +204,21 @@ int FullConnectionOpenCLKernel::InitBias() {
   bias_ = allocator->MapBuffer(bias_, CL_MAP_WRITE, nullptr, true);
   memset(bias_, 0x00, co4 * C4NUM * dtype_size);
   if (in_tensors_.size() == 3) {
-    if (in_tensors_[2]->data_type() == kNumberTypeFloat32 && enable_fp16_) {
+    void *src_data = stored_bias_ == nullptr ? in_tensors_.at(kBiasIndex)->data_c() : stored_bias_;
+    if (in_tensors_[kBiasIndex]->data_type() == kNumberTypeFloat32 && enable_fp16_) {
       for (int i = 0; i < CO_; i++) {
-        reinterpret_cast<float16_t *>(bias_)[i] = reinterpret_cast<float *>(in_tensors_[2]->data_c())[i];
+        reinterpret_cast<float16_t *>(bias_)[i] = reinterpret_cast<float *>(src_data)[i];
       }
-    } else if (in_tensors_[2]->data_type() == kNumberTypeFloat16 && !enable_fp16_) {
+    } else if (in_tensors_[kBiasIndex]->data_type() == kNumberTypeFloat16 && !enable_fp16_) {
       for (int i = 0; i < CO_; i++) {
-        reinterpret_cast<float *>(bias_)[i] = reinterpret_cast<float16_t *>(in_tensors_[2]->data_c())[i];
+        reinterpret_cast<float *>(bias_)[i] = reinterpret_cast<float16_t *>(src_data)[i];
       }
     } else {
-      memcpy(bias_, in_tensors_[2]->data_c(), CO_ * dtype_size);
+      memcpy(bias_, src_data, CO_ * dtype_size);
     }
   }
   allocator->UnmapBuffer(bias_);
+  FreeStoredData(stored_bias_);
   return RET_OK;
 }
 
@@ -244,6 +248,24 @@ void FullConnectionOpenCLKernel::SetConstArgs() {
   ocl_runtime_->SetKernelArg(kernel_, arg_count, static_cast<cl_int>(param->act_type_));
 }
 
+int FullConnectionOpenCLKernel::StoreConstData() {
+  if (!op_parameter_->infer_flag_) {
+    stored_weight_ = StoreTensorData(in_tensors_.at(kWeightIndex));
+    if (stored_weight_ == nullptr) {
+      MS_LOG(ERROR) << "Store weight failed.";
+      return RET_ERROR;
+    }
+    if (in_tensors_.size() > kBiasIndex) {
+      stored_bias_ = StoreTensorData(in_tensors_.at(kBiasIndex));
+      if (stored_bias_ == nullptr) {
+        MS_LOG(ERROR) << "Store bias failed.";
+        return RET_ERROR;
+      }
+    }
+  }
+  return RET_OK;
+}
+
 int FullConnectionOpenCLKernel::Run() {
   MS_LOG(DEBUG) << this->name() << " Running!";
   int arg_count = 0;
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.h
index bb2a1fd294..fe24788527 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.h
@@ -36,12 +36,15 @@ class FullConnectionOpenCLKernel : public OpenCLKernel {
   void SetConstArgs() override;
   void SetGlobalLocal() override;
   int Tune() override { return lite::RET_OK; }
+  int StoreConstData() override;
 
  private:
   int InitFilter();
   int InitBias();
   void *padWeight_{nullptr};
   void *bias_{nullptr};
+  void *stored_weight_{nullptr};
+  void *stored_bias_{nullptr};
   bool enable_fp16_{false};
   bool transposeA{false};
   bool transposeB{true};
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc
index b6fdb6007f..7e25a3975e 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc
@@ -136,8 +136,9 @@ int MatMulOpenCLKernel::InitWeights() {
   auto padWeightFp32 = reinterpret_cast<float *>(padWeight_);
   auto padWeightFp16 = reinterpret_cast<float16_t *>(padWeight_);
   memset(padWeight_, 0x00, a * b * ci4 * co4 * C4NUM * C4NUM * dtype_size);
-  auto originWeightFp32 = reinterpret_cast<float *>(in_tensors_.at(kWeightIndex)->data_c());
-  auto originWeightFp16 = reinterpret_cast<float16_t *>(in_tensors_.at(kWeightIndex)->data_c());
+  void *src_data = stored_weight_ == nullptr ? in_tensors_.at(kWeightIndex)->data_c() : stored_weight_;
+  auto originWeightFp32 = reinterpret_cast<float *>(src_data);
+  auto originWeightFp16 = reinterpret_cast<float16_t *>(src_data);
   bool isModelFp16 = in_tensors_.at(kWeightIndex)->data_type() == kNumberTypeFloat16;
   // pad weight
   // ABCICO -> AB(CI4)(CO4)(4 from CO)(4 from CI)
@@ -181,6 +182,7 @@ int MatMulOpenCLKernel::InitWeights() {
   }
 
   allocator->UnmapBuffer(padWeight_);
+  FreeStoredData(stored_weight_);
   return InitBias();
 }
 
@@ -202,19 +204,21 @@ int MatMulOpenCLKernel::InitBias() {
   bias_ = allocator->MapBuffer(bias_, CL_MAP_WRITE, nullptr, true);
   memset(bias_, 0x00, co4 * C4NUM * dtype_size);
   if (in_tensors_.size() == 3) {
-    if (in_tensors_[2]->data_type() == kNumberTypeFloat32 && enable_fp16_) {
+    void *src_data = stored_bias_ == nullptr ? in_tensors_.at(kBiasIndex)->data_c() : stored_bias_;
+    if (in_tensors_[kBiasIndex]->data_type() == kNumberTypeFloat32 && enable_fp16_) {
       for (int i = 0; i < CO_; i++) {
-        reinterpret_cast<float16_t *>(bias_)[i] = reinterpret_cast<float *>(in_tensors_[2]->data_c())[i];
+        reinterpret_cast<float16_t *>(bias_)[i] = reinterpret_cast<float *>(src_data)[i];
       }
-    } else if (in_tensors_[2]->data_type() == kNumberTypeFloat16 && !enable_fp16_) {
+    } else if (in_tensors_[kBiasIndex]->data_type() == kNumberTypeFloat16 && !enable_fp16_) {
       for (int i = 0; i < CO_; i++) {
-        reinterpret_cast<float *>(bias_)[i] = reinterpret_cast<float16_t *>(in_tensors_[2]->data_c())[i];
+        reinterpret_cast<float *>(bias_)[i] = reinterpret_cast<float16_t *>(src_data)[i];
       }
     } else {
-      memcpy(bias_, in_tensors_[2]->data_c(), CO_ * dtype_size);
+      memcpy(bias_, src_data, CO_ * dtype_size);
     }
   }
   allocator->UnmapBuffer(bias_);
+  FreeStoredData(stored_bias_);
   return RET_OK;
 }
 
@@ -254,6 +258,24 @@ int MatMulOpenCLKernel::Run() {
   return RET_OK;
 }
 
+int MatMulOpenCLKernel::StoreConstData() {
+  if (!op_parameter_->infer_flag_) {
+    stored_weight_ = StoreTensorData(in_tensors_.at(kWeightIndex));
+    if (stored_weight_ == nullptr) {
+      MS_LOG(ERROR) << "Store weight failed.";
+      return RET_ERROR;
+    }
+    if (in_tensors_.size() > kBiasIndex) {
+      stored_bias_ = StoreTensorData(in_tensors_.at(kBiasIndex));
+      if (stored_bias_ == nullptr) {
+        MS_LOG(ERROR) << "Store bias failed.";
+        return RET_ERROR;
+      }
+    }
+  }
+  return RET_OK;
+}
+
 kernel::LiteKernel *OpenCLMatMulKernelCreator(const std::vector<lite::Tensor *> &inputs,
                                               const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
                                               const lite::Context *ctx, const kernel::KernelKey &desc) {
@@ -274,6 +296,12 @@ kernel::LiteKernel *OpenCLMatMulKernelCreator(const std::vector<lite::Tensor *> &inputs,
   }
   if (!infer_shape_done) {
     MS_LOG(WARNING) << "kernel don't infer shape yet!";
+    auto ret = reinterpret_cast<MatMulOpenCLKernel *>(kernel)->StoreConstData();
+    if (ret != mindspore::lite::RET_OK) {
+      MS_LOG(ERROR) << "Store " << opParameter->name_ << " const data failed!";
+      delete kernel;
+      return nullptr;
+    }
     return kernel;
   }
   if (kernel->CheckSpecs() != RET_OK || kernel->OpenCLKernel::CheckSpecs() != RET_OK) {
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.h
index 20297d98a7..e472f5aa2d 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.h
@@ -38,6 +38,7 @@ class MatMulOpenCLKernel : public OpenCLKernel {
   void SetGlobalLocal() override;
   int Tune() override { return lite::RET_OK; }
   int InitBias();
+  int StoreConstData() override;
 
  protected:
   void *padWeight_{nullptr};
@@ -47,6 +48,8 @@ class MatMulOpenCLKernel : public OpenCLKernel {
   int dims{};
   void *bias_{nullptr};
   int CO_{1};
+  void *stored_weight_{nullptr};
+  void *stored_bias_{nullptr};
   static constexpr int MAX_DIMS{4};  // max supported matmul dims
   bool act_weight_{false};
   std::vector<int> inShape{std::vector<int>(MAX_DIMS, 1)};
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/winograd.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/winograd.cc
index 7973c27822..06974127d3 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/winograd.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/winograd.cc
@@ -108,12 +108,13 @@ void WinogradOpenCLKernel::InitFilter() {
 
   // rearrange filter
   auto filter_tensor = in_tensors_.at(1);
+  void *src_filter_data = stored_filter_ == nullptr ? filter_tensor->data_c() : stored_filter_;
 #ifndef ENABLE_ARM64
-  auto winograd_filter = GenerateWinogradFilter(filter_tensor->data_c(), filter_tensor->data_type(), CO_, CI_);
+  auto winograd_filter = GenerateWinogradFilter(src_filter_data, filter_tensor->data_type(), CO_, CI_);
   void *src_data = winograd_filter.data();
 #else
   std::unique_ptr<float[]> winograd_filter(new float[CO_ * 6 * 6 * CI_]);
-  WinogradWeightTransform(reinterpret_cast<float *>(filter_tensor->data_c()),
+  WinogradWeightTransform(reinterpret_cast<float *>(src_filter_data),
                           reinterpret_cast<float *>(winograd_filter.get()), nullptr, Gt, 1, 6, 3, CI_, CO_, false);
   void *src_data = winograd_filter.get();
 #endif
@@ -136,6 +137,7 @@ void WinogradOpenCLKernel::InitFilter() {
     memcpy(packed_filter_, tmp.data(), size);
     allocator->UnmapBuffer(packed_filter_);
   }
+  FreeStoredData(stored_filter_);
 }
 
 void WinogradOpenCLKernel::AllocateMemory() {
diff --git a/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h b/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h
index 55c765e4bd..13afc3433d 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h
@@ -195,6 +195,7 @@ class OpenCLKernel : public LiteKernel {
   virtual std::vector<BaseTuningParameter> GenerateTuningParam();
   virtual int AssignTuningParam(const BaseTuningParameter &param);
   virtual int Tune();
+  virtual int StoreConstData() { return RET_OK; }
 
   int GetImageSize(size_t idx, lite::opencl::ImageSize *img_size);
   void PrintOutput(int print_num = 10, const std::string &out_file = "");
@@ -259,6 +260,12 @@ kernel::LiteKernel *OpenCLKernelCreator(const std::vector<lite::Tensor *> &inputs,
     delete kernel;
     return nullptr;
   }
+  ret = reinterpret_cast<OpenCLKernel *>(kernel)->StoreConstData();
+  if (ret != mindspore::lite::RET_OK) {
+    MS_LOG(ERROR) << "Store " << opParameter->name_ << " const data failed!";
+    delete kernel;
+    return nullptr;
+  }
   return kernel;
 }
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/opencl/utils.cc b/mindspore/lite/src/runtime/kernel/opencl/utils.cc
index 20c1d065b8..36b7fe9ccf 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/utils.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/utils.cc
@@ -301,29 +301,22 @@ int CheckParamLikeTensor(const std::string &kernel_name, const std::string &tensor_name, lite::Tensor *tensor,
   return RET_OK;
 }
 
-static std::set<void *> tmp_weights;
-
-void StoreTmpWeight(lite::Tensor *tensor) {
-  MS_LOG(WARNING) << "store weight when kernel don't infer shape!";
+void *StoreTensorData(lite::Tensor *tensor) {
   if ((tensor != nullptr) && (tensor->data_c() != nullptr) && (tensor->Size() > 0)) {
-    void *new_data = malloc(tensor->Size());
-    MS_ASSERT(new_data);
-    if (new_data == nullptr) {
-      return;
+    void *stored_data = malloc(tensor->Size());
+    if (stored_data == nullptr) {
+      MS_LOG(ERROR) << "StoreTensorData Malloc Failed.";
+      return nullptr;
     }
-    memcpy(new_data, tensor->data_c(), tensor->Size());
-    tensor->set_data(new_data);
-    tmp_weights.insert(new_data);
+    memcpy(stored_data, tensor->data_c(), tensor->Size());
+    return stored_data;
   }
+  return nullptr;
 }
 
-void FreeTmpWeight(lite::Tensor *tensor) {
-  MS_ASSERT(tensor != nullptr);
-  auto data = tensor->data_c();
-  if (tmp_weights.count(data)) {
-    tmp_weights.erase(data);
+void FreeStoredData(void *data) {
+  if (data != nullptr) {
     free(data);
-    tensor->set_data(nullptr);
   }
 }
diff --git a/mindspore/lite/src/runtime/kernel/opencl/utils.h b/mindspore/lite/src/runtime/kernel/opencl/utils.h
index b4d64e8645..ffcb75b313 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/utils.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/utils.h
@@ -64,8 +64,9 @@ void PackNHWCToNHWC4(void *src, void *dst, bool src_is_fp16, bool dst_is_fp16, c
 int CheckParamLikeTensor(const std::string &kernel_name, const std::string &tensor_name, lite::Tensor *tensor,
                          TypeId expect_data_type, const std::vector<int> &expect_shape);
 
-void StoreTmpWeight(lite::Tensor *tensor);
-void FreeTmpWeight(lite::Tensor *tensor);
+void *StoreTensorData(lite::Tensor *tensor);
+
+void FreeStoredData(void *data);
 
 std::vector<std::string> CreateBuildOptionsExtByDType(TypeId type_id);
diff --git a/mindspore/lite/src/scheduler.cc b/mindspore/lite/src/scheduler.cc
index eb621edb94..4954f03cfc 100644
--- a/mindspore/lite/src/scheduler.cc
+++ b/mindspore/lite/src/scheduler.cc
@@ -271,13 +271,13 @@ int CastConstTensorsData(const std::vector<Tensor *> &tensors, std::map<Tensor *, Tensor *> *restored_origin_tensors,
     if (tensor->data_type() == kNumberTypeFloat32 && dst_data_type == kNumberTypeFloat16) {
       auto ret = CastConstTensorData(tensor, restored_origin_tensors, kNumberTypeFloat16);
       if (ret != RET_OK) {
-        MS_LOG(ERROR) << "Cast const tensor from fp32 to fp16 failed, tensor name : " << tensor->tensor_name();
+        MS_LOG(DEBUG) << "Cast const tensor from fp32 to fp16 failed, tensor name : " << tensor->tensor_name();
         return ret;
       }
     } else if (tensor->data_type() == kNumberTypeFloat16 && dst_data_type == kNumberTypeFloat32) {
       auto ret = CastConstTensorData(tensor, restored_origin_tensors, kNumberTypeFloat32);
       if (ret != RET_OK) {
-        MS_LOG(ERROR) << "Cast const tensor from fp16 to fp32 failed, tensor name : " << tensor->tensor_name();
+        MS_LOG(DEBUG) << "Cast const tensor from fp16 to fp32 failed, tensor name : " << tensor->tensor_name();
         return ret;
       }
     } else {
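-- 

Not part of the commit above: a minimal standalone C++ sketch of the store/consume/free pattern this patch introduces, for readers who want to see it in isolation. The Tensor struct below is a hypothetical stand-in for lite::Tensor (only data_c() and Size() are modeled). The real kernels call StoreTensorData() from the creator when shape inference has not run, prefer the snapshot over the tensor in InitFilter()/InitBias(), and release it with FreeStoredData() once the data has been repacked into device memory.

#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <cstring>

// Hypothetical stand-in for lite::Tensor.
struct Tensor {
  void *data{nullptr};
  size_t size{0};
  void *data_c() const { return data; }
  size_t Size() const { return size; }
};

// Snapshot a const tensor's payload (mirrors utils.cc StoreTensorData).
void *StoreTensorData(const Tensor *tensor) {
  if (tensor != nullptr && tensor->data_c() != nullptr && tensor->Size() > 0) {
    void *stored_data = malloc(tensor->Size());
    if (stored_data == nullptr) {
      return nullptr;  // callers log an error and fail kernel creation
    }
    memcpy(stored_data, tensor->data_c(), tensor->Size());
    return stored_data;
  }
  return nullptr;
}

// Release a snapshot; safe to call with nullptr (mirrors utils.cc FreeStoredData).
void FreeStoredData(void *data) {
  if (data != nullptr) {
    free(data);
  }
}

int main() {
  float bias[4] = {1.0f, 2.0f, 3.0f, 4.0f};
  Tensor bias_tensor{bias, sizeof(bias)};

  // Creator path: shapes not inferred yet, so snapshot the const input.
  void *stored_bias = StoreTensorData(&bias_tensor);

  // InitBias path: prefer the snapshot when present, fall back to the tensor.
  const float *src = stored_bias != nullptr ? static_cast<const float *>(stored_bias)
                                            : static_cast<const float *>(bias_tensor.data_c());
  printf("bias[2] = %.1f\n", src[2]);

  // Once the data has been packed, drop the snapshot.
  FreeStoredData(stored_bias);
  return 0;
}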