Browse Source

Store copies of weight and bias data in Conv2D and other OpenCL kernels

pull/15867/head
yeyunpeng2020 4 years ago
parent
commit
6b05157ed6
19 changed files with 198 additions and 63 deletions
  1. +12
    -5
      mindspore/lite/src/lite_session.cc
  2. +1
    -1
      mindspore/lite/src/lite_session.h
  3. +0
    -2
      mindspore/lite/src/runtime/gpu/opencl/opencl_runtime.cc
  4. +0
    -1
      mindspore/lite/src/runtime/gpu/opencl/opencl_runtime.h
  5. +31
    -11
      mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.cc
  6. +4
    -0
      mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.h
  7. +25
    -4
      mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc
  8. +3
    -0
      mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.h
  9. +23
    -2
      mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc
  10. +3
    -0
      mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.h
  11. +29
    -7
      mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.cc
  12. +3
    -0
      mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.h
  13. +35
    -7
      mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc
  14. +3
    -0
      mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.h
  15. +4
    -2
      mindspore/lite/src/runtime/kernel/opencl/kernel/winograd.cc
  16. +7
    -0
      mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h
  17. +10
    -17
      mindspore/lite/src/runtime/kernel/opencl/utils.cc
  18. +3
    -2
      mindspore/lite/src/runtime/kernel/opencl/utils.h
  19. +2
    -2
      mindspore/lite/src/scheduler.cc

+ 12
- 5
mindspore/lite/src/lite_session.cc View File

@@ -347,11 +347,16 @@ void LiteSession::InitGraphInOutTensors(const lite::Model *model) {
} }
} }


void LiteSession::FreePackOpWeight() {
for (auto *kernel : kernels_) {
void LiteSession::FreePackOpWeight(const std::vector<kernel::LiteKernel *> &kernels) {
for (auto *kernel : kernels) {
MS_ASSERT(kernel != nullptr); MS_ASSERT(kernel != nullptr);
if (!IsPackedOp(kernel->Type())) {
continue;
if (kernel->subgraph_type() == kernel::kNotSubGraph) {
if (!IsPackedOp(kernel->Type())) {
continue;
}
} else {
auto subgraph = reinterpret_cast<kernel::SubGraphKernel *>(kernel);
FreePackOpWeight(subgraph->nodes());
} }
auto inputs = kernel->in_tensors(); auto inputs = kernel->in_tensors();
for (auto *tensor : inputs) { for (auto *tensor : inputs) {
@@ -444,8 +449,10 @@ int LiteSession::CompileGraph(Model *model) {
is_running_.store(false); is_running_.store(false);
return ret; return ret;
} }
#ifndef SUPPORT_TRAIN
// For reducing runtime RAM, free packop weight because packop will pack weight and will not access to origin weight // For reducing runtime RAM, free packop weight because packop will pack weight and will not access to origin weight
FreePackOpWeight();
FreePackOpWeight(kernels_);
#endif
is_running_.store(false); is_running_.store(false);
return RET_OK; return RET_OK;
} // namespace lite } // namespace lite


+ 1
- 1
mindspore/lite/src/lite_session.h View File

@@ -106,7 +106,7 @@ class LiteSession : public session::LiteSession {


static int ReSizeKernels(const std::vector<kernel::LiteKernel *> &kernels); static int ReSizeKernels(const std::vector<kernel::LiteKernel *> &kernels);


void FreePackOpWeight();
static void FreePackOpWeight(const std::vector<kernel::LiteKernel *> &kernels);


private: private:
void ResetInputsShape(const std::vector<std::vector<int>> &dims); void ResetInputsShape(const std::vector<std::vector<int>> &dims);


+ 0
- 2
mindspore/lite/src/runtime/gpu/opencl/opencl_runtime.cc View File

@@ -368,8 +368,6 @@ bool OpenCLRuntime::SetFp16Enable(bool enable) {
return fp16_enable_ == enable; return fp16_enable_ == enable;
} }


bool OpenCLRuntime::IsSupportFloat16() { return support_fp16_; }

int OpenCLRuntime::BuildKernel(const cl::Kernel &kernel, const std::string &program_name, int OpenCLRuntime::BuildKernel(const cl::Kernel &kernel, const std::string &program_name,
const std::string &kernel_name, const std::vector<std::string> &build_options_ext) { const std::string &kernel_name, const std::vector<std::string> &build_options_ext) {
std::string build_option = default_build_option_; std::string build_option = default_build_option_;


+ 0
- 1
mindspore/lite/src/runtime/gpu/opencl/opencl_runtime.h View File

@@ -70,7 +70,6 @@ class OpenCLRuntime {
GpuInfo GetGpuInfo(); GpuInfo GetGpuInfo();
bool GetFp16Enable() const; bool GetFp16Enable() const;
bool SetFp16Enable(bool enable); bool SetFp16Enable(bool enable);
bool IsSupportFloat16();
bool GetSVMEnable() const { return svm_enable_; } bool GetSVMEnable() const { return svm_enable_; }
void SetSVMEnable(bool enable) { svm_enable_ = enable; } void SetSVMEnable(bool enable) { svm_enable_ = enable; }
const std::vector<size_t> &GetWorkItemSize() const { return max_work_item_sizes_; } const std::vector<size_t> &GetWorkItemSize() const { return max_work_item_sizes_; }


+ 31
- 11
mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.cc View File

@@ -260,7 +260,7 @@ void Conv2DOpenCLKernel::InitFilter() {


// rearrange filter // rearrange filter
auto filter_tensor = in_tensors_.at(1); auto filter_tensor = in_tensors_.at(1);
void *src_data = filter_tensor->data_c();
void *src_data = stored_filter_ == nullptr ? filter_tensor->data_c() : stored_filter_;
auto src_dtype = filter_tensor->data_type(); auto src_dtype = filter_tensor->data_type();
auto dst_dtype = use_fp16_ ? kNumberTypeFloat16 : kNumberTypeFloat32; auto dst_dtype = use_fp16_ ? kNumberTypeFloat16 : kNumberTypeFloat32;
std::vector<char> tmp(size, 0); std::vector<char> tmp(size, 0);
@@ -279,7 +279,7 @@ void Conv2DOpenCLKernel::InitFilter() {
allocator->UnmapBuffer(packed_filter_); allocator->UnmapBuffer(packed_filter_);
} }


FreeTmpWeight(in_tensors_.at(kWeightIndex));
FreeStoredData(stored_filter_);
} }


void Conv2DOpenCLKernel::InitBias() { void Conv2DOpenCLKernel::InitBias() {
@@ -287,6 +287,7 @@ void Conv2DOpenCLKernel::InitBias() {


// align bias from C to C4 // align bias from C to C4
auto bias_tensor = in_tensors_.at(2); auto bias_tensor = in_tensors_.at(2);
void *src_data = stored_bias_ == nullptr ? bias_tensor->data_c() : stored_bias_;
size_t packed_bias_size = UP_ROUND(CO_SLICES_, block_size_.C) * CO_TILE * sizeof_FLT_; size_t packed_bias_size = UP_ROUND(CO_SLICES_, block_size_.C) * CO_TILE * sizeof_FLT_;
packed_bias_ = allocator->Malloc(packed_bias_size, lite::opencl::MemType::BUF); packed_bias_ = allocator->Malloc(packed_bias_size, lite::opencl::MemType::BUF);


@@ -294,10 +295,10 @@ void Conv2DOpenCLKernel::InitBias() {
memset(packed_bias_, 0x00, packed_bias_size); memset(packed_bias_, 0x00, packed_bias_size);
if (bias_tensor->data_type() == kNumberTypeFloat16) { if (bias_tensor->data_type() == kNumberTypeFloat16) {
if (use_fp16_) { if (use_fp16_) {
memcpy(packed_bias_, bias_tensor->data_c(), CO_ * sizeof_FLT_);
memcpy(packed_bias_, src_data, CO_ * sizeof_FLT_);
} else { } else {
auto packed_bias_fp32 = reinterpret_cast<float *>(packed_bias_); auto packed_bias_fp32 = reinterpret_cast<float *>(packed_bias_);
auto origin_bias_fp16 = reinterpret_cast<float16_t *>(bias_tensor->data_c());
auto origin_bias_fp16 = reinterpret_cast<float16_t *>(src_data);
MS_ASSERT(origin_bias_fp16); MS_ASSERT(origin_bias_fp16);
for (int i = 0; i < CO_; ++i) { for (int i = 0; i < CO_; ++i) {
packed_bias_fp32[i] = static_cast<float>(origin_bias_fp16[i]); packed_bias_fp32[i] = static_cast<float>(origin_bias_fp16[i]);
@@ -306,17 +307,17 @@ void Conv2DOpenCLKernel::InitBias() {
} else { } else {
if (use_fp16_) { if (use_fp16_) {
auto packed_bias_fp16 = reinterpret_cast<float16_t *>(packed_bias_); auto packed_bias_fp16 = reinterpret_cast<float16_t *>(packed_bias_);
auto origin_bias_fp32 = reinterpret_cast<float *>(bias_tensor->data_c());
auto origin_bias_fp32 = reinterpret_cast<float *>(src_data);
MS_ASSERT(origin_bias_fp32); MS_ASSERT(origin_bias_fp32);
for (int i = 0; i < CO_; ++i) { for (int i = 0; i < CO_; ++i) {
packed_bias_fp16[i] = static_cast<float16_t>(origin_bias_fp32[i]); packed_bias_fp16[i] = static_cast<float16_t>(origin_bias_fp32[i]);
} }
} else { } else {
memcpy(packed_bias_, bias_tensor->data_c(), CO_ * sizeof_FLT_);
memcpy(packed_bias_, src_data, CO_ * sizeof_FLT_);
} }
} }
allocator->UnmapBuffer(packed_bias_); allocator->UnmapBuffer(packed_bias_);
FreeTmpWeight(in_tensors_.at(kBiasIndex));
FreeStoredData(stored_bias_);
} }


void Conv2DOpenCLKernel::SetConstArgs() { void Conv2DOpenCLKernel::SetConstArgs() {
@@ -403,6 +404,24 @@ std::vector<BaseTuningParameter> Conv2DOpenCLKernel::GenerateTuningParam() {
return tuning_params; return tuning_params;
} }


int Conv2DOpenCLKernel::StoreConstData() {
if (!op_parameter_->infer_flag_) {
stored_filter_ = StoreTensorData(in_tensors_.at(kWeightIndex));
if (stored_filter_ == nullptr) {
MS_LOG(ERROR) << "Store weight failed.";
return RET_ERROR;
}
if (in_tensors_.size() > kBiasIndex) {
stored_bias_ = StoreTensorData(in_tensors_.at(kBiasIndex));
if (stored_bias_ == nullptr) {
MS_LOG(ERROR) << "Store bias failed.";
return RET_ERROR;
}
}
}
return RET_OK;
}

bool UseFcReplaceConv(const std::vector<lite::Tensor *> &inputs, const std::vector<lite::Tensor *> &outputs, bool UseFcReplaceConv(const std::vector<lite::Tensor *> &inputs, const std::vector<lite::Tensor *> &outputs,
ConvParameter *param) { ConvParameter *param) {
MS_ASSERT(param); MS_ASSERT(param);
@@ -528,11 +547,12 @@ kernel::LiteKernel *OpenCLConv2DCreator(const std::vector<lite::Tensor *> &input
} }
} }
if (!infer_shape_done) { if (!infer_shape_done) {
StoreTmpWeight(inputs.at(kWeightIndex));
if (inputs.size() > kBiasIndex) {
StoreTmpWeight(inputs.at(kBiasIndex));
auto ret = reinterpret_cast<Conv2DOpenCLKernel *>(kernel)->StoreConstData();
if (ret != mindspore::lite::RET_OK) {
MS_LOG(ERROR) << "Store " << opParameter->name_ << " const data failed!";
delete kernel;
return nullptr;
} }
MS_LOG(WARNING) << "kernel don't infer shape yet!";
return kernel; return kernel;
} }
if (kernel->CheckSpecs() != RET_OK || kernel->OpenCLKernel::CheckSpecs() != RET_OK) { if (kernel->CheckSpecs() != RET_OK || kernel->OpenCLKernel::CheckSpecs() != RET_OK) {


+ 4
- 0
mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.h View File

@@ -58,6 +58,8 @@ class Conv2DOpenCLKernel : public OpenCLKernel {
void SetGlobalLocal() override; void SetGlobalLocal() override;
int Run() override; int Run() override;


int StoreConstData() override;

std::string Key() override { std::string Key() override {
auto key = OpenCLKernel::Key(); auto key = OpenCLKernel::Key();
key += "_" + std::to_string(KH_) + "_" + std::to_string(KW_) + "_" + std::to_string(param_->stride_h_) + "_" + key += "_" + std::to_string(KH_) + "_" + std::to_string(KW_) + "_" + std::to_string(param_->stride_h_) + "_" +
@@ -94,7 +96,9 @@ class Conv2DOpenCLKernel : public OpenCLKernel {
int KH_{}; int KH_{};
int KW_{}; int KW_{};
void *packed_filter_{nullptr}; void *packed_filter_{nullptr};
void *stored_filter_{nullptr};
void *packed_bias_{nullptr}; void *packed_bias_{nullptr};
void *stored_bias_{nullptr};
MemType filter_type_{MemType::BUF}; MemType filter_type_{MemType::BUF};
bool has_bias_{false}; bool has_bias_{false};
int TILE_HW_{}; int TILE_HW_{};


+ 25
- 4
mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc View File

@@ -148,7 +148,7 @@ int Conv2dTransposeOpenCLKernel::InitFilter() {
padWeight_ = allocator->Malloc(div_ci * div_co * C4NUM * C4NUM * kh * kw * data_size, lite::opencl::MemType::BUF); padWeight_ = allocator->Malloc(div_ci * div_co * C4NUM * C4NUM * kh * kw * data_size, lite::opencl::MemType::BUF);
padWeight_ = allocator->MapBuffer(padWeight_, CL_MAP_WRITE, nullptr, true); padWeight_ = allocator->MapBuffer(padWeight_, CL_MAP_WRITE, nullptr, true);
memset(padWeight_, 0x00, div_ci * div_co * C4NUM * C4NUM * kh * kw * data_size); memset(padWeight_, 0x00, div_ci * div_co * C4NUM * C4NUM * kh * kw * data_size);
auto origin_weight = in_tensors_.at(kWeightIndex)->data_c();
auto origin_weight = stored_weight_ == nullptr ? in_tensors_.at(kWeightIndex)->data_c() : stored_weight_;
auto weight_dtype = in_tensors_.at(kWeightIndex)->data_type(); auto weight_dtype = in_tensors_.at(kWeightIndex)->data_type();
int index = 0; int index = 0;
for (int co_i = 0; co_i < div_co; co_i++) { for (int co_i = 0; co_i < div_co; co_i++) {
@@ -188,6 +188,7 @@ int Conv2dTransposeOpenCLKernel::InitFilter() {
} }
} }
allocator->UnmapBuffer(padWeight_); allocator->UnmapBuffer(padWeight_);
FreeStoredData(stored_weight_);
return RET_OK; return RET_OK;
} }


@@ -209,20 +210,22 @@ int Conv2dTransposeOpenCLKernel::InitBias() {
bias_ = allocator->MapBuffer(bias_, CL_MAP_WRITE, nullptr, true); bias_ = allocator->MapBuffer(bias_, CL_MAP_WRITE, nullptr, true);
memset(bias_, 0x00, div_co * C4NUM * data_size); memset(bias_, 0x00, div_co * C4NUM * data_size);
if (in_tensors_.size() == 3) { if (in_tensors_.size() == 3) {
void *src_data = stored_bias_ == nullptr ? in_tensors_.at(kBiasIndex)->data_c() : stored_bias_;
auto bias_dtype = in_tensors_[2]->data_type(); auto bias_dtype = in_tensors_[2]->data_type();
if (bias_dtype == kNumberTypeFloat32 && enable_fp16_) { if (bias_dtype == kNumberTypeFloat32 && enable_fp16_) {
for (int i = 0; i < co; i++) { for (int i = 0; i < co; i++) {
reinterpret_cast<float16_t *>(bias_)[i] = reinterpret_cast<float *>(in_tensors_[2]->data_c())[i];
reinterpret_cast<float16_t *>(bias_)[i] = reinterpret_cast<float *>(src_data)[i];
} }
} else if (bias_dtype == kNumberTypeFloat16 && !enable_fp16_) { } else if (bias_dtype == kNumberTypeFloat16 && !enable_fp16_) {
for (int i = 0; i < co; i++) { for (int i = 0; i < co; i++) {
reinterpret_cast<float *>(bias_)[i] = reinterpret_cast<float16_t *>(in_tensors_[2]->data_c())[i];
reinterpret_cast<float *>(bias_)[i] = reinterpret_cast<float16_t *>(src_data)[i];
} }
} else { } else {
memcpy(bias_, in_tensors_[2]->data_c(), co * data_size);
memcpy(bias_, src_data, co * data_size);
} }
} }
allocator->UnmapBuffer(bias_); allocator->UnmapBuffer(bias_);
FreeStoredData(stored_bias_);
return RET_OK; return RET_OK;
} }


@@ -243,6 +246,24 @@ int Conv2dTransposeOpenCLKernel::InferShape() {
return RET_OK; return RET_OK;
} }


int Conv2dTransposeOpenCLKernel::StoreConstData() {
if (!op_parameter_->infer_flag_) {
stored_weight_ = StoreTensorData(in_tensors_.at(kWeightIndex));
if (stored_weight_ == nullptr) {
MS_LOG(ERROR) << "Store weight failed.";
return RET_ERROR;
}
if (in_tensors_.size() > kBiasIndex) {
stored_bias_ = StoreTensorData(in_tensors_.at(kBiasIndex));
if (stored_bias_ == nullptr) {
MS_LOG(ERROR) << "Store bias failed.";
return RET_ERROR;
}
}
}
return RET_OK;
}

kernel::LiteKernel *OpenCLConv2dTransposeCreator(const std::vector<lite::Tensor *> &inputs, kernel::LiteKernel *OpenCLConv2dTransposeCreator(const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter, const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
const lite::Context *ctx, const kernel::KernelKey &desc) { const lite::Context *ctx, const kernel::KernelKey &desc) {


+ 3
- 0
mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.h View File

@@ -39,10 +39,13 @@ class Conv2dTransposeOpenCLKernel : public OpenCLKernel {
void SetConstArgs() override; void SetConstArgs() override;
void SetGlobalLocal() override; void SetGlobalLocal() override;
int InferShape() override; int InferShape() override;
int StoreConstData() override;


private: private:
void *padWeight_{nullptr}; void *padWeight_{nullptr};
void *bias_{nullptr}; void *bias_{nullptr};
void *stored_weight_{nullptr};
void *stored_bias_{nullptr};
bool enable_fp16_{false}; bool enable_fp16_{false};
}; };
} // namespace mindspore::kernel } // namespace mindspore::kernel


+ 23
- 2
mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc View File

@@ -110,7 +110,7 @@ int DepthwiseConv2dOpenCLKernel::InitWeights() {
size_t dtype_size = is_fp16 ? sizeof(int16_t) : sizeof(float); size_t dtype_size = is_fp16 ? sizeof(int16_t) : sizeof(float);
auto out_info = GpuTensorInfo(out_tensors_[0]); auto out_info = GpuTensorInfo(out_tensors_[0]);
// weight: o, h, w, i; o == group, i == 1 // weight: o, h, w, i; o == group, i == 1
void *origin_weight = in_tensors_.at(kWeightIndex)->data_c();
void *origin_weight = stored_weight_ == nullptr ? in_tensors_.at(kWeightIndex)->data_c() : stored_weight_;
int CO4 = UP_DIV(out_info.C, C4NUM * block_size_.C); int CO4 = UP_DIV(out_info.C, C4NUM * block_size_.C);
int pack_weight_size = C4NUM * CO4 * parameter->kernel_h_ * parameter->kernel_w_; int pack_weight_size = C4NUM * CO4 * parameter->kernel_h_ * parameter->kernel_w_;


@@ -162,6 +162,7 @@ int DepthwiseConv2dOpenCLKernel::InitWeights() {
if (packed_weight_ == nullptr) { if (packed_weight_ == nullptr) {
return RET_ERROR; return RET_ERROR;
} }
FreeStoredData(stored_weight_);
return mindspore::lite::RET_OK; return mindspore::lite::RET_OK;
} }


@@ -196,12 +197,14 @@ int DepthwiseConv2dOpenCLKernel::InitBias() {
src_type = in_tensors_.at(kBiasIndex)->data_type(); src_type = in_tensors_.at(kBiasIndex)->data_type();
dst_type = is_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32; dst_type = is_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32;
auto element_size = in_tensors_.at(kBiasIndex)->ElementsNum(); auto element_size = in_tensors_.at(kBiasIndex)->ElementsNum();
ConvertBias(in_tensors_.at(kBiasIndex)->data_c(), temp_bias.data(), element_size, dtype_size, src_type, dst_type);
void *src_data = stored_bias_ == nullptr ? in_tensors_.at(kBiasIndex)->data_c() : stored_bias_;
ConvertBias(src_data, temp_bias.data(), element_size, dtype_size, src_type, dst_type);
} }
bias_data_ = allocator->Malloc(bias_size, temp_bias.data()); bias_data_ = allocator->Malloc(bias_size, temp_bias.data());
if (bias_data_ == nullptr) { if (bias_data_ == nullptr) {
return RET_ERROR; return RET_ERROR;
} }
FreeStoredData(stored_bias_);
return mindspore::lite::RET_OK; return mindspore::lite::RET_OK;
} }


@@ -250,6 +253,24 @@ void DepthwiseConv2dOpenCLKernel::SetGlobalLocal() {
OpenCLKernel::AlignGlobalLocal(global_size_, local_size_); OpenCLKernel::AlignGlobalLocal(global_size_, local_size_);
} }


int DepthwiseConv2dOpenCLKernel::StoreConstData() {
if (!op_parameter_->infer_flag_) {
stored_weight_ = StoreTensorData(in_tensors_.at(kWeightIndex));
if (stored_weight_ == nullptr) {
MS_LOG(ERROR) << "Store weight failed.";
return RET_ERROR;
}
if (in_tensors_.size() > kBiasIndex) {
stored_bias_ = StoreTensorData(in_tensors_.at(kBiasIndex));
if (stored_bias_ == nullptr) {
MS_LOG(ERROR) << "Store bias failed.";
return RET_ERROR;
}
}
}
return RET_OK;
}

int DepthwiseConv2dOpenCLKernel::Run() { int DepthwiseConv2dOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running!"; MS_LOG(DEBUG) << this->name() << " Running!";
ocl_runtime_->SetKernelArg(kernel_, 0, out_tensors_[0]->data_c()); ocl_runtime_->SetKernelArg(kernel_, 0, out_tensors_[0]->data_c());


+ 3
- 0
mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.h View File

@@ -44,10 +44,13 @@ class DepthwiseConv2dOpenCLKernel : public OpenCLKernel {
int InitBias(); int InitBias();
void SetConstArgs() override; void SetConstArgs() override;
void SetGlobalLocal() override; void SetGlobalLocal() override;
int StoreConstData() override;


private: private:
void *packed_weight_{nullptr}; void *packed_weight_{nullptr};
void *stored_weight_{nullptr};
void *bias_data_{nullptr}; void *bias_data_{nullptr};
void *stored_bias_{nullptr};
struct { struct {
int H{2}; int H{2};
int W{2}; int W{2};


+ 29
- 7
mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.cc View File

@@ -140,8 +140,9 @@ int FullConnectionOpenCLKernel::InitFilter() {
auto padWeightFp32 = reinterpret_cast<float *>(padWeight_); auto padWeightFp32 = reinterpret_cast<float *>(padWeight_);
auto padWeightFp16 = reinterpret_cast<float16_t *>(padWeight_); auto padWeightFp16 = reinterpret_cast<float16_t *>(padWeight_);
memset(padWeight_, 0x00, nhw_remainder * intensor_shape.Slice * co4 * C4NUM * C4NUM * dtype_size); memset(padWeight_, 0x00, nhw_remainder * intensor_shape.Slice * co4 * C4NUM * C4NUM * dtype_size);
auto originWeightFp32 = reinterpret_cast<float *>(in_tensors_.at(kWeightIndex)->data_c());
auto originWeightFp16 = reinterpret_cast<float16_t *>(in_tensors_.at(kWeightIndex)->data_c());
void *src_data = stored_weight_ == nullptr ? in_tensors_.at(kWeightIndex)->data_c() : stored_weight_;
auto originWeightFp32 = reinterpret_cast<float *>(src_data);
auto originWeightFp16 = reinterpret_cast<float16_t *>(src_data);
bool isModelFp16 = in_tensors_.at(kWeightIndex)->data_type() == kNumberTypeFloat16; bool isModelFp16 = in_tensors_.at(kWeightIndex)->data_type() == kNumberTypeFloat16;


// pad weight // pad weight
@@ -182,6 +183,7 @@ int FullConnectionOpenCLKernel::InitFilter() {
} }
} }
allocator->UnmapBuffer(padWeight_); allocator->UnmapBuffer(padWeight_);
FreeStoredData(stored_weight_);
return RET_OK; return RET_OK;
} }


@@ -202,19 +204,21 @@ int FullConnectionOpenCLKernel::InitBias() {
bias_ = allocator->MapBuffer(bias_, CL_MAP_WRITE, nullptr, true); bias_ = allocator->MapBuffer(bias_, CL_MAP_WRITE, nullptr, true);
memset(bias_, 0x00, co4 * C4NUM * dtype_size); memset(bias_, 0x00, co4 * C4NUM * dtype_size);
if (in_tensors_.size() == 3) { if (in_tensors_.size() == 3) {
if (in_tensors_[2]->data_type() == kNumberTypeFloat32 && enable_fp16_) {
void *src_data = stored_bias_ == nullptr ? in_tensors_.at(kBiasIndex)->data_c() : stored_bias_;
if (in_tensors_[kBiasIndex]->data_type() == kNumberTypeFloat32 && enable_fp16_) {
for (int i = 0; i < CO_; i++) { for (int i = 0; i < CO_; i++) {
reinterpret_cast<float16_t *>(bias_)[i] = reinterpret_cast<float *>(in_tensors_[2]->data_c())[i];
reinterpret_cast<float16_t *>(bias_)[i] = reinterpret_cast<float *>(src_data)[i];
} }
} else if (in_tensors_[2]->data_type() == kNumberTypeFloat16 && !enable_fp16_) {
} else if (in_tensors_[kBiasIndex]->data_type() == kNumberTypeFloat16 && !enable_fp16_) {
for (int i = 0; i < CO_; i++) { for (int i = 0; i < CO_; i++) {
reinterpret_cast<float *>(bias_)[i] = reinterpret_cast<float16_t *>(in_tensors_[2]->data_c())[i];
reinterpret_cast<float *>(bias_)[i] = reinterpret_cast<float16_t *>(src_data)[i];
} }
} else { } else {
memcpy(bias_, in_tensors_[2]->data_c(), CO_ * dtype_size);
memcpy(bias_, src_data, CO_ * dtype_size);
} }
} }
allocator->UnmapBuffer(bias_); allocator->UnmapBuffer(bias_);
FreeStoredData(stored_bias_);
return RET_OK; return RET_OK;
} }


@@ -244,6 +248,24 @@ void FullConnectionOpenCLKernel::SetConstArgs() {
ocl_runtime_->SetKernelArg(kernel_, arg_count, static_cast<cl_int>(param->act_type_)); ocl_runtime_->SetKernelArg(kernel_, arg_count, static_cast<cl_int>(param->act_type_));
} }


int FullConnectionOpenCLKernel::StoreConstData() {
if (!op_parameter_->infer_flag_) {
stored_weight_ = StoreTensorData(in_tensors_.at(kWeightIndex));
if (stored_weight_ == nullptr) {
MS_LOG(ERROR) << "Store weight failed.";
return RET_ERROR;
}
if (in_tensors_.size() > kBiasIndex) {
stored_bias_ = StoreTensorData(in_tensors_.at(kBiasIndex));
if (stored_bias_ == nullptr) {
MS_LOG(ERROR) << "Store bias failed.";
return RET_ERROR;
}
}
}
return RET_OK;
}

int FullConnectionOpenCLKernel::Run() { int FullConnectionOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running!"; MS_LOG(DEBUG) << this->name() << " Running!";
int arg_count = 0; int arg_count = 0;


+ 3
- 0
mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.h View File

@@ -36,12 +36,15 @@ class FullConnectionOpenCLKernel : public OpenCLKernel {
void SetConstArgs() override; void SetConstArgs() override;
void SetGlobalLocal() override; void SetGlobalLocal() override;
int Tune() override { return lite::RET_OK; } int Tune() override { return lite::RET_OK; }
int StoreConstData() override;


private: private:
int InitFilter(); int InitFilter();
int InitBias(); int InitBias();
void *padWeight_{nullptr}; void *padWeight_{nullptr};
void *bias_{nullptr}; void *bias_{nullptr};
void *stored_weight_{nullptr};
void *stored_bias_{nullptr};
bool enable_fp16_{false}; bool enable_fp16_{false};
bool transposeA{false}; bool transposeA{false};
bool transposeB{true}; bool transposeB{true};


+ 35
- 7
mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc View File

@@ -136,8 +136,9 @@ int MatMulOpenCLKernel::InitWeights() {
auto padWeightFp32 = reinterpret_cast<float *>(padWeight_); auto padWeightFp32 = reinterpret_cast<float *>(padWeight_);
auto padWeightFp16 = reinterpret_cast<float16_t *>(padWeight_); auto padWeightFp16 = reinterpret_cast<float16_t *>(padWeight_);
memset(padWeight_, 0x00, a * b * ci4 * co4 * C4NUM * C4NUM * dtype_size); memset(padWeight_, 0x00, a * b * ci4 * co4 * C4NUM * C4NUM * dtype_size);
auto originWeightFp32 = reinterpret_cast<float *>(in_tensors_.at(kWeightIndex)->data_c());
auto originWeightFp16 = reinterpret_cast<float16_t *>(in_tensors_.at(kWeightIndex)->data_c());
void *src_data = stored_weight_ == nullptr ? in_tensors_.at(kWeightIndex)->data_c() : stored_weight_;
auto originWeightFp32 = reinterpret_cast<float *>(src_data);
auto originWeightFp16 = reinterpret_cast<float16_t *>(src_data);
bool isModelFp16 = in_tensors_.at(kWeightIndex)->data_type() == kNumberTypeFloat16; bool isModelFp16 = in_tensors_.at(kWeightIndex)->data_type() == kNumberTypeFloat16;
// pad weight // pad weight
// ABCICO -> AB(CI4)(CO4)(4 from CO)(4 from CI) // ABCICO -> AB(CI4)(CO4)(4 from CO)(4 from CI)
@@ -181,6 +182,7 @@ int MatMulOpenCLKernel::InitWeights() {
} }


allocator->UnmapBuffer(padWeight_); allocator->UnmapBuffer(padWeight_);
FreeStoredData(stored_weight_);
return InitBias(); return InitBias();
} }


@@ -202,19 +204,21 @@ int MatMulOpenCLKernel::InitBias() {
bias_ = allocator->MapBuffer(bias_, CL_MAP_WRITE, nullptr, true); bias_ = allocator->MapBuffer(bias_, CL_MAP_WRITE, nullptr, true);
memset(bias_, 0x00, co4 * C4NUM * dtype_size); memset(bias_, 0x00, co4 * C4NUM * dtype_size);
if (in_tensors_.size() == 3) { if (in_tensors_.size() == 3) {
if (in_tensors_[2]->data_type() == kNumberTypeFloat32 && enable_fp16_) {
void *src_data = stored_bias_ == nullptr ? in_tensors_.at(kBiasIndex)->data_c() : stored_bias_;
if (in_tensors_[kBiasIndex]->data_type() == kNumberTypeFloat32 && enable_fp16_) {
for (int i = 0; i < CO_; i++) { for (int i = 0; i < CO_; i++) {
reinterpret_cast<float16_t *>(bias_)[i] = reinterpret_cast<float *>(in_tensors_[2]->data_c())[i];
reinterpret_cast<float16_t *>(bias_)[i] = reinterpret_cast<float *>(src_data)[i];
} }
} else if (in_tensors_[2]->data_type() == kNumberTypeFloat16 && !enable_fp16_) {
} else if (in_tensors_[kBiasIndex]->data_type() == kNumberTypeFloat16 && !enable_fp16_) {
for (int i = 0; i < CO_; i++) { for (int i = 0; i < CO_; i++) {
reinterpret_cast<float *>(bias_)[i] = reinterpret_cast<float16_t *>(in_tensors_[2]->data_c())[i];
reinterpret_cast<float *>(bias_)[i] = reinterpret_cast<float16_t *>(src_data)[i];
} }
} else { } else {
memcpy(bias_, in_tensors_[2]->data_c(), CO_ * dtype_size);
memcpy(bias_, src_data, CO_ * dtype_size);
} }
} }
allocator->UnmapBuffer(bias_); allocator->UnmapBuffer(bias_);
FreeStoredData(stored_bias_);
return RET_OK; return RET_OK;
} }


@@ -254,6 +258,24 @@ int MatMulOpenCLKernel::Run() {
return RET_OK; return RET_OK;
} }


int MatMulOpenCLKernel::StoreConstData() {
if (!op_parameter_->infer_flag_) {
stored_weight_ = StoreTensorData(in_tensors_.at(kWeightIndex));
if (stored_weight_ == nullptr) {
MS_LOG(ERROR) << "Store weight failed.";
return RET_ERROR;
}
if (in_tensors_.size() > kBiasIndex) {
stored_bias_ = StoreTensorData(in_tensors_.at(kBiasIndex));
if (stored_bias_ == nullptr) {
MS_LOG(ERROR) << "Store bias failed.";
return RET_ERROR;
}
}
}
return RET_OK;
}

kernel::LiteKernel *OpenCLMatMulKernelCreator(const std::vector<lite::Tensor *> &inputs, kernel::LiteKernel *OpenCLMatMulKernelCreator(const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter, const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
const lite::Context *ctx, const kernel::KernelKey &desc) { const lite::Context *ctx, const kernel::KernelKey &desc) {
@@ -274,6 +296,12 @@ kernel::LiteKernel *OpenCLMatMulKernelCreator(const std::vector<lite::Tensor *>
} }
if (!infer_shape_done) { if (!infer_shape_done) {
MS_LOG(WARNING) << "kernel don't infer shape yet!"; MS_LOG(WARNING) << "kernel don't infer shape yet!";
auto ret = reinterpret_cast<MatMulOpenCLKernel *>(kernel)->StoreConstData();
if (ret != mindspore::lite::RET_OK) {
MS_LOG(ERROR) << "Store " << opParameter->name_ << " const data failed!";
delete kernel;
return nullptr;
}
return kernel; return kernel;
} }
if (kernel->CheckSpecs() != RET_OK || kernel->OpenCLKernel::CheckSpecs() != RET_OK) { if (kernel->CheckSpecs() != RET_OK || kernel->OpenCLKernel::CheckSpecs() != RET_OK) {


+ 3
- 0
mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.h View File

@@ -38,6 +38,7 @@ class MatMulOpenCLKernel : public OpenCLKernel {
void SetGlobalLocal() override; void SetGlobalLocal() override;
int Tune() override { return lite::RET_OK; } int Tune() override { return lite::RET_OK; }
int InitBias(); int InitBias();
int StoreConstData() override;


protected: protected:
void *padWeight_{nullptr}; void *padWeight_{nullptr};
@@ -47,6 +48,8 @@ class MatMulOpenCLKernel : public OpenCLKernel {
int dims{}; int dims{};
void *bias_{nullptr}; void *bias_{nullptr};
int CO_{1}; int CO_{1};
void *stored_weight_{nullptr};
void *stored_bias_{nullptr};
static constexpr int MAX_DIMS{4}; // max supported matmul dims static constexpr int MAX_DIMS{4}; // max supported matmul dims
bool act_weight_{false}; bool act_weight_{false};
std::vector<int> inShape{std::vector<int>(MAX_DIMS, 1)}; std::vector<int> inShape{std::vector<int>(MAX_DIMS, 1)};


+ 4
- 2
mindspore/lite/src/runtime/kernel/opencl/kernel/winograd.cc View File

@@ -108,12 +108,13 @@ void WinogradOpenCLKernel::InitFilter() {


// rearrange filter // rearrange filter
auto filter_tensor = in_tensors_.at(1); auto filter_tensor = in_tensors_.at(1);
void *src_filter_data = stored_filter_ == nullptr ? filter_tensor->data_c() : stored_filter_;
#ifndef ENABLE_ARM64 #ifndef ENABLE_ARM64
auto winograd_filter = GenerateWinogradFilter(filter_tensor->data_c(), filter_tensor->data_type(), CO_, CI_);
auto winograd_filter = GenerateWinogradFilter(src_filter_data, filter_tensor->data_type(), CO_, CI_);
void *src_data = winograd_filter.data(); void *src_data = winograd_filter.data();
#else #else
std::unique_ptr<float[]> winograd_filter(new float[CO_ * 6 * 6 * CI_]); std::unique_ptr<float[]> winograd_filter(new float[CO_ * 6 * 6 * CI_]);
WinogradWeightTransform(reinterpret_cast<const float *>(filter_tensor->data_c()),
WinogradWeightTransform(reinterpret_cast<const float *>(src_filter_data),
reinterpret_cast<float *>(winograd_filter.get()), nullptr, Gt, 1, 6, 3, CI_, CO_, false); reinterpret_cast<float *>(winograd_filter.get()), nullptr, Gt, 1, 6, 3, CI_, CO_, false);


void *src_data = winograd_filter.get(); void *src_data = winograd_filter.get();
@@ -136,6 +137,7 @@ void WinogradOpenCLKernel::InitFilter() {
memcpy(packed_filter_, tmp.data(), size); memcpy(packed_filter_, tmp.data(), size);
allocator->UnmapBuffer(packed_filter_); allocator->UnmapBuffer(packed_filter_);
} }
FreeStoredData(stored_filter_);
} }


void WinogradOpenCLKernel::AllocateMemory() { void WinogradOpenCLKernel::AllocateMemory() {


+ 7
- 0
mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h View File

@@ -195,6 +195,7 @@ class OpenCLKernel : public LiteKernel {
virtual std::vector<BaseTuningParameter> GenerateTuningParam(); virtual std::vector<BaseTuningParameter> GenerateTuningParam();
virtual int AssignTuningParam(const BaseTuningParameter &param); virtual int AssignTuningParam(const BaseTuningParameter &param);
virtual int Tune(); virtual int Tune();
virtual int StoreConstData() { return RET_OK; }


int GetImageSize(size_t idx, lite::opencl::ImageSize *img_size); int GetImageSize(size_t idx, lite::opencl::ImageSize *img_size);
void PrintOutput(int print_num = 10, const std::string &out_file = ""); void PrintOutput(int print_num = 10, const std::string &out_file = "");
@@ -259,6 +260,12 @@ kernel::LiteKernel *OpenCLKernelCreator(const std::vector<lite::Tensor *> &input
delete kernel; delete kernel;
return nullptr; return nullptr;
} }
ret = reinterpret_cast<OpenCLKernel *>(kernel)->StoreConstData();
if (ret != mindspore::lite::RET_OK) {
MS_LOG(ERROR) << "Store " << opParameter->name_ << " const data failed!";
delete kernel;
return nullptr;
}
return kernel; return kernel;
} }
} // namespace mindspore::kernel } // namespace mindspore::kernel


+ 10
- 17
mindspore/lite/src/runtime/kernel/opencl/utils.cc View File

@@ -301,29 +301,22 @@ int CheckParamLikeTensor(const std::string &kernel_name, const std::string &tens
return RET_OK; return RET_OK;
} }


// Diff hunk: old and new lines are interleaved; unchanged lines appear twice per row.
// Old side (removed): StoreTmpWeight copied the tensor's data into a malloc'ed buffer,
//   re-pointed the tensor at that copy via set_data(), and recorded the pointer in the
//   file-static set tmp_weights. On malloc failure it returned silently.
// New side (added): StoreTensorData makes the same malloc + memcpy copy of Size() bytes
//   but leaves the tensor untouched and returns the new buffer. It returns nullptr when
//   the tensor is null, has no data, has Size() == 0, or when malloc fails (the malloc
//   failure is logged at ERROR level).
// NOTE(review): the returned buffer appears to be owned by the caller and released with
//   FreeStoredData — confirm at the call sites.
static std::set<void *> tmp_weights;

void StoreTmpWeight(lite::Tensor *tensor) {
MS_LOG(WARNING) << "store weight when kernel don't infer shape!";
void *StoreTensorData(lite::Tensor *tensor) {
if ((tensor != nullptr) && (tensor->data_c() != nullptr) && (tensor->Size() > 0)) { if ((tensor != nullptr) && (tensor->data_c() != nullptr) && (tensor->Size() > 0)) {
void *new_data = malloc(tensor->Size());
MS_ASSERT(new_data);
if (new_data == nullptr) {
return;
void *stored_data = malloc(tensor->Size());
if (stored_data == nullptr) {
MS_LOG(ERROR) << "StoreTensorData Malloc Failed.";
return nullptr;
} }
memcpy(new_data, tensor->data_c(), tensor->Size());
tensor->set_data(new_data);
tmp_weights.insert(new_data);
memcpy(stored_data, tensor->data_c(), tensor->Size());
return stored_data;
} }
return nullptr;
} }


// Diff hunk: old and new lines are interleaved; unchanged lines appear twice per row.
// Old side (removed): FreeTmpWeight looked up the tensor's data pointer in tmp_weights
//   and, only if it was recorded there, erased it, freed it, and cleared the tensor's
//   data pointer with set_data(nullptr).
// New side (added): FreeStoredData takes the raw buffer itself and calls free() on it
//   when it is non-null. The pointer is passed by value, so the caller's own copy of
//   the pointer is not reset by this function.
void FreeTmpWeight(lite::Tensor *tensor) {
MS_ASSERT(tensor != nullptr);
auto data = tensor->data_c();
if (tmp_weights.count(data)) {
tmp_weights.erase(data);
void FreeStoredData(void *data) {
if (data != nullptr) {
free(data); free(data);
tensor->set_data(nullptr);
} }
} }




+ 3
- 2
mindspore/lite/src/runtime/kernel/opencl/utils.h View File

@@ -64,8 +64,9 @@ void PackNHWCToNHWC4(void *src, void *dst, bool src_is_fp16, bool dst_is_fp16, c
int CheckParamLikeTensor(const std::string &kernel_name, const std::string &tensor_name, lite::Tensor *tensor, int CheckParamLikeTensor(const std::string &kernel_name, const std::string &tensor_name, lite::Tensor *tensor,
TypeId expect_data_type, const std::vector<int> &expect_shape); TypeId expect_data_type, const std::vector<int> &expect_shape);


// Removed by this change: the StoreTmpWeight / FreeTmpWeight pair, which operated on
// the tensor itself.
void StoreTmpWeight(lite::Tensor *tensor);
void FreeTmpWeight(lite::Tensor *tensor);
// Added by this change: returns a malloc'ed copy of the tensor's data, or nullptr when
// the tensor is null, holds no data, has zero size, or the allocation fails.
void *StoreTensorData(lite::Tensor *tensor);

// Added by this change: frees a buffer with free(); a null pointer is ignored.
void FreeStoredData(void *data);


std::vector<std::string> CreateBuildOptionsExtByDType(TypeId type_id); std::vector<std::string> CreateBuildOptionsExtByDType(TypeId type_id);




+ 2
- 2
mindspore/lite/src/scheduler.cc View File

@@ -271,13 +271,13 @@ int CastConstTensorsData(const std::vector<Tensor *> &tensors, std::map<Tensor *
if (tensor->data_type() == kNumberTypeFloat32 && dst_data_type == kNumberTypeFloat16) { if (tensor->data_type() == kNumberTypeFloat32 && dst_data_type == kNumberTypeFloat16) {
auto ret = CastConstTensorData(tensor, restored_origin_tensors, kNumberTypeFloat16); auto ret = CastConstTensorData(tensor, restored_origin_tensors, kNumberTypeFloat16);
if (ret != RET_OK) { if (ret != RET_OK) {
MS_LOG(ERROR) << "Cast const tensor from fp32 to fp16 failed, tensor name : " << tensor->tensor_name();
MS_LOG(DEBUG) << "Cast const tensor from fp32 to fp16 failed, tensor name : " << tensor->tensor_name();
return ret; return ret;
} }
} else if (tensor->data_type() == kNumberTypeFloat16 && dst_data_type == kNumberTypeFloat32) { } else if (tensor->data_type() == kNumberTypeFloat16 && dst_data_type == kNumberTypeFloat32) {
auto ret = CastConstTensorData(tensor, restored_origin_tensors, kNumberTypeFloat32); auto ret = CastConstTensorData(tensor, restored_origin_tensors, kNumberTypeFloat32);
if (ret != RET_OK) { if (ret != RET_OK) {
MS_LOG(ERROR) << "Cast const tensor from fp16 to fp32 failed, tensor name : " << tensor->tensor_name();
MS_LOG(DEBUG) << "Cast const tensor from fp16 to fp32 failed, tensor name : " << tensor->tensor_name();
return ret; return ret;
} }
} else { } else {


Loading…
Cancel
Save