
Store weight and bias for conv and similar OpenCL kernels
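When shape inference cannot run at kernel-creation time, each OpenCL kernel with constant inputs (conv2d, conv2d_transpose, depthwise_conv2d, fullconnection, matmul, and winograd) now copies its weight and bias into kernel-owned buffers through a new StoreConstData()/StoreTensorData() path, reads the stored copy in its InitFilter()/InitWeights()/InitBias() routines, and releases it with FreeStoredData(). This replaces the old StoreTmpWeight()/FreeTmpWeight() helpers, which re-pointed the tensors at the copies and tracked them in a global set. FreePackOpWeight() also becomes static, takes the kernel list as a parameter, and recurses into subgraph kernels.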

Branch: pull/15867/head
yeyunpeng2020 committed 4 years ago · commit 6b05157ed6
19 changed files with 198 additions and 63 deletions
  1. +12 -5   mindspore/lite/src/lite_session.cc
  2. +1 -1    mindspore/lite/src/lite_session.h
  3. +0 -2    mindspore/lite/src/runtime/gpu/opencl/opencl_runtime.cc
  4. +0 -1    mindspore/lite/src/runtime/gpu/opencl/opencl_runtime.h
  5. +31 -11  mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.cc
  6. +4 -0    mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.h
  7. +25 -4   mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc
  8. +3 -0    mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.h
  9. +23 -2   mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc
  10. +3 -0   mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.h
  11. +29 -7  mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.cc
  12. +3 -0   mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.h
  13. +35 -7  mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc
  14. +3 -0   mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.h
  15. +4 -2   mindspore/lite/src/runtime/kernel/opencl/kernel/winograd.cc
  16. +7 -0   mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h
  17. +10 -17 mindspore/lite/src/runtime/kernel/opencl/utils.cc
  18. +3 -2   mindspore/lite/src/runtime/kernel/opencl/utils.h
  19. +2 -2   mindspore/lite/src/scheduler.cc

+12 -5  mindspore/lite/src/lite_session.cc

@@ -347,11 +347,16 @@ void LiteSession::InitGraphInOutTensors(const lite::Model *model) {
   }
 }
 
-void LiteSession::FreePackOpWeight() {
-  for (auto *kernel : kernels_) {
+void LiteSession::FreePackOpWeight(const std::vector<kernel::LiteKernel *> &kernels) {
+  for (auto *kernel : kernels) {
     MS_ASSERT(kernel != nullptr);
-    if (!IsPackedOp(kernel->Type())) {
-      continue;
+    if (kernel->subgraph_type() == kernel::kNotSubGraph) {
+      if (!IsPackedOp(kernel->Type())) {
+        continue;
+      }
+    } else {
+      auto subgraph = reinterpret_cast<kernel::SubGraphKernel *>(kernel);
+      FreePackOpWeight(subgraph->nodes());
     }
     auto inputs = kernel->in_tensors();
     for (auto *tensor : inputs) {
@@ -444,8 +449,10 @@ int LiteSession::CompileGraph(Model *model) {
     is_running_.store(false);
     return ret;
   }
+#ifndef SUPPORT_TRAIN
   // For reducing runtime RAM, free packop weight because packop will pack weight and will not access to origin weight
-  FreePackOpWeight();
+  FreePackOpWeight(kernels_);
+#endif
   is_running_.store(false);
   return RET_OK;
 }  // namespace lite
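Making FreePackOpWeight() recursive matters because kernels such as the OpenCL ones are wrapped in a subgraph kernel: the previous implementation only walked the session's top-level kernels_, so packed-op weights owned by nodes inside a subgraph were never released.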


+1 -1  mindspore/lite/src/lite_session.h

@@ -106,7 +106,7 @@ class LiteSession : public session::LiteSession {
 
   static int ReSizeKernels(const std::vector<kernel::LiteKernel *> &kernels);
 
-  void FreePackOpWeight();
+  static void FreePackOpWeight(const std::vector<kernel::LiteKernel *> &kernels);
 
  private:
   void ResetInputsShape(const std::vector<std::vector<int>> &dims);


+0 -2  mindspore/lite/src/runtime/gpu/opencl/opencl_runtime.cc

@@ -368,8 +368,6 @@ bool OpenCLRuntime::SetFp16Enable(bool enable) {
   return fp16_enable_ == enable;
 }
 
-bool OpenCLRuntime::IsSupportFloat16() { return support_fp16_; }
-
 int OpenCLRuntime::BuildKernel(const cl::Kernel &kernel, const std::string &program_name,
                                const std::string &kernel_name, const std::vector<std::string> &build_options_ext) {
   std::string build_option = default_build_option_;


+0 -1  mindspore/lite/src/runtime/gpu/opencl/opencl_runtime.h

@@ -70,7 +70,6 @@ class OpenCLRuntime {
   GpuInfo GetGpuInfo();
   bool GetFp16Enable() const;
   bool SetFp16Enable(bool enable);
-  bool IsSupportFloat16();
   bool GetSVMEnable() const { return svm_enable_; }
   void SetSVMEnable(bool enable) { svm_enable_ = enable; }
   const std::vector<size_t> &GetWorkItemSize() const { return max_work_item_sizes_; }


+31 -11  mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.cc

@@ -260,7 +260,7 @@ void Conv2DOpenCLKernel::InitFilter() {
 
   // rearrange filter
   auto filter_tensor = in_tensors_.at(1);
-  void *src_data = filter_tensor->data_c();
+  void *src_data = stored_filter_ == nullptr ? filter_tensor->data_c() : stored_filter_;
   auto src_dtype = filter_tensor->data_type();
   auto dst_dtype = use_fp16_ ? kNumberTypeFloat16 : kNumberTypeFloat32;
   std::vector<char> tmp(size, 0);
@@ -279,7 +279,7 @@ void Conv2DOpenCLKernel::InitFilter() {
     allocator->UnmapBuffer(packed_filter_);
   }
 
-  FreeTmpWeight(in_tensors_.at(kWeightIndex));
+  FreeStoredData(stored_filter_);
 }
 
 void Conv2DOpenCLKernel::InitBias() {
@@ -287,6 +287,7 @@ void Conv2DOpenCLKernel::InitBias() {
 
   // align bias from C to C4
   auto bias_tensor = in_tensors_.at(2);
+  void *src_data = stored_bias_ == nullptr ? bias_tensor->data_c() : stored_bias_;
   size_t packed_bias_size = UP_ROUND(CO_SLICES_, block_size_.C) * CO_TILE * sizeof_FLT_;
   packed_bias_ = allocator->Malloc(packed_bias_size, lite::opencl::MemType::BUF);
 
@@ -294,10 +295,10 @@ void Conv2DOpenCLKernel::InitBias() {
   memset(packed_bias_, 0x00, packed_bias_size);
   if (bias_tensor->data_type() == kNumberTypeFloat16) {
     if (use_fp16_) {
-      memcpy(packed_bias_, bias_tensor->data_c(), CO_ * sizeof_FLT_);
+      memcpy(packed_bias_, src_data, CO_ * sizeof_FLT_);
     } else {
       auto packed_bias_fp32 = reinterpret_cast<float *>(packed_bias_);
-      auto origin_bias_fp16 = reinterpret_cast<float16_t *>(bias_tensor->data_c());
+      auto origin_bias_fp16 = reinterpret_cast<float16_t *>(src_data);
       MS_ASSERT(origin_bias_fp16);
       for (int i = 0; i < CO_; ++i) {
         packed_bias_fp32[i] = static_cast<float>(origin_bias_fp16[i]);
@@ -306,17 +307,17 @@ void Conv2DOpenCLKernel::InitBias() {
   } else {
     if (use_fp16_) {
       auto packed_bias_fp16 = reinterpret_cast<float16_t *>(packed_bias_);
-      auto origin_bias_fp32 = reinterpret_cast<float *>(bias_tensor->data_c());
+      auto origin_bias_fp32 = reinterpret_cast<float *>(src_data);
       MS_ASSERT(origin_bias_fp32);
       for (int i = 0; i < CO_; ++i) {
         packed_bias_fp16[i] = static_cast<float16_t>(origin_bias_fp32[i]);
       }
     } else {
-      memcpy(packed_bias_, bias_tensor->data_c(), CO_ * sizeof_FLT_);
+      memcpy(packed_bias_, src_data, CO_ * sizeof_FLT_);
     }
   }
   allocator->UnmapBuffer(packed_bias_);
-  FreeTmpWeight(in_tensors_.at(kBiasIndex));
+  FreeStoredData(stored_bias_);
 }
 
 void Conv2DOpenCLKernel::SetConstArgs() {
@@ -403,6 +404,24 @@ std::vector<BaseTuningParameter> Conv2DOpenCLKernel::GenerateTuningParam() {
   return tuning_params;
 }
 
+int Conv2DOpenCLKernel::StoreConstData() {
+  if (!op_parameter_->infer_flag_) {
+    stored_filter_ = StoreTensorData(in_tensors_.at(kWeightIndex));
+    if (stored_filter_ == nullptr) {
+      MS_LOG(ERROR) << "Store weight failed.";
+      return RET_ERROR;
+    }
+    if (in_tensors_.size() > kBiasIndex) {
+      stored_bias_ = StoreTensorData(in_tensors_.at(kBiasIndex));
+      if (stored_bias_ == nullptr) {
+        MS_LOG(ERROR) << "Store bias failed.";
+        return RET_ERROR;
+      }
+    }
+  }
+  return RET_OK;
+}
+
 bool UseFcReplaceConv(const std::vector<lite::Tensor *> &inputs, const std::vector<lite::Tensor *> &outputs,
                       ConvParameter *param) {
   MS_ASSERT(param);
@@ -528,11 +547,12 @@ kernel::LiteKernel *OpenCLConv2DCreator(const std::vector<lite::Tensor *> &input
     }
   }
   if (!infer_shape_done) {
-    StoreTmpWeight(inputs.at(kWeightIndex));
-    if (inputs.size() > kBiasIndex) {
-      StoreTmpWeight(inputs.at(kBiasIndex));
+    auto ret = reinterpret_cast<Conv2DOpenCLKernel *>(kernel)->StoreConstData();
+    if (ret != mindspore::lite::RET_OK) {
+      MS_LOG(ERROR) << "Store " << opParameter->name_ << " const data failed!";
+      delete kernel;
+      return nullptr;
     }
-    MS_LOG(WARNING) << "kernel don't infer shape yet!";
     return kernel;
   }
   if (kernel->CheckSpecs() != RET_OK || kernel->OpenCLKernel::CheckSpecs() != RET_OK) {
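The conv2d changes above establish the pattern that every const-weight kernel in this commit repeats: StoreConstData() stashes the raw weight/bias when infer_flag_ is false, the Init routines prefer the stashed copy over tensor->data_c(), and the copy is freed once the data has been packed for the GPU. Below is a minimal self-contained sketch of that lifecycle; the toy Tensor struct and the main() flow are illustrative stand-ins for lite::Tensor and the session, not code from this commit.

// Standalone sketch of the store/restore pattern (illustrative names).
#include <cstdio>
#include <cstdlib>
#include <cstring>

struct Tensor {  // stand-in for lite::Tensor
  void *data = nullptr;
  size_t size = 0;
  void *data_c() const { return data; }
  size_t Size() const { return size; }
};

// Mirrors the committed StoreTensorData(): heap-copy a tensor's payload.
void *StoreTensorData(const Tensor *tensor) {
  if (tensor != nullptr && tensor->data_c() != nullptr && tensor->Size() > 0) {
    void *stored_data = malloc(tensor->Size());
    if (stored_data == nullptr) {
      return nullptr;
    }
    memcpy(stored_data, tensor->data_c(), tensor->Size());
    return stored_data;
  }
  return nullptr;
}

// Mirrors the committed FreeStoredData().
void FreeStoredData(void *data) {
  if (data != nullptr) {
    free(data);
  }
}

int main() {
  float w[4] = {1.0f, 2.0f, 3.0f, 4.0f};
  Tensor weight{w, sizeof(w)};

  // 1) Creator, shapes unknown: stash the weight (cf. StoreConstData()).
  void *stored = StoreTensorData(&weight);

  // 2) The session later frees packed-op weights to save RAM (cf. FreePackOpWeight()).
  weight.data = nullptr;

  // 3) InitFilter()/InitBias(): prefer the stored copy when it exists.
  const float *src =
      static_cast<const float *>(stored != nullptr ? stored : weight.data_c());
  printf("weight[2] = %f\n", src[2]);  // prints 3.000000

  // 4) After packing, release the stored copy.
  FreeStoredData(stored);
  return 0;
}

The same StoreConstData() body appears nearly verbatim in the conv2d_transpose, depthwise_conv2d, fullconnection, and matmul kernels below.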


+4 -0  mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.h

@@ -58,6 +58,8 @@ class Conv2DOpenCLKernel : public OpenCLKernel {
   void SetGlobalLocal() override;
   int Run() override;
 
+  int StoreConstData() override;
+
   std::string Key() override {
     auto key = OpenCLKernel::Key();
     key += "_" + std::to_string(KH_) + "_" + std::to_string(KW_) + "_" + std::to_string(param_->stride_h_) + "_" +
@@ -94,7 +96,9 @@ class Conv2DOpenCLKernel : public OpenCLKernel {
   int KH_{};
   int KW_{};
   void *packed_filter_{nullptr};
+  void *stored_filter_{nullptr};
   void *packed_bias_{nullptr};
+  void *stored_bias_{nullptr};
   MemType filter_type_{MemType::BUF};
   bool has_bias_{false};
   int TILE_HW_{};


+25 -4  mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc

@@ -148,7 +148,7 @@ int Conv2dTransposeOpenCLKernel::InitFilter() {
   padWeight_ = allocator->Malloc(div_ci * div_co * C4NUM * C4NUM * kh * kw * data_size, lite::opencl::MemType::BUF);
   padWeight_ = allocator->MapBuffer(padWeight_, CL_MAP_WRITE, nullptr, true);
   memset(padWeight_, 0x00, div_ci * div_co * C4NUM * C4NUM * kh * kw * data_size);
-  auto origin_weight = in_tensors_.at(kWeightIndex)->data_c();
+  auto origin_weight = stored_weight_ == nullptr ? in_tensors_.at(kWeightIndex)->data_c() : stored_weight_;
   auto weight_dtype = in_tensors_.at(kWeightIndex)->data_type();
   int index = 0;
   for (int co_i = 0; co_i < div_co; co_i++) {
@@ -188,6 +188,7 @@ int Conv2dTransposeOpenCLKernel::InitFilter() {
     }
   }
   allocator->UnmapBuffer(padWeight_);
+  FreeStoredData(stored_weight_);
   return RET_OK;
 }
 
@@ -209,20 +210,22 @@ int Conv2dTransposeOpenCLKernel::InitBias() {
   bias_ = allocator->MapBuffer(bias_, CL_MAP_WRITE, nullptr, true);
   memset(bias_, 0x00, div_co * C4NUM * data_size);
   if (in_tensors_.size() == 3) {
+    void *src_data = stored_bias_ == nullptr ? in_tensors_.at(kBiasIndex)->data_c() : stored_bias_;
     auto bias_dtype = in_tensors_[2]->data_type();
     if (bias_dtype == kNumberTypeFloat32 && enable_fp16_) {
       for (int i = 0; i < co; i++) {
-        reinterpret_cast<float16_t *>(bias_)[i] = reinterpret_cast<float *>(in_tensors_[2]->data_c())[i];
+        reinterpret_cast<float16_t *>(bias_)[i] = reinterpret_cast<float *>(src_data)[i];
       }
     } else if (bias_dtype == kNumberTypeFloat16 && !enable_fp16_) {
       for (int i = 0; i < co; i++) {
-        reinterpret_cast<float *>(bias_)[i] = reinterpret_cast<float16_t *>(in_tensors_[2]->data_c())[i];
+        reinterpret_cast<float *>(bias_)[i] = reinterpret_cast<float16_t *>(src_data)[i];
       }
     } else {
-      memcpy(bias_, in_tensors_[2]->data_c(), co * data_size);
+      memcpy(bias_, src_data, co * data_size);
     }
   }
   allocator->UnmapBuffer(bias_);
+  FreeStoredData(stored_bias_);
   return RET_OK;
 }
 
@@ -243,6 +246,24 @@ int Conv2dTransposeOpenCLKernel::InferShape() {
   return RET_OK;
 }
 
+int Conv2dTransposeOpenCLKernel::StoreConstData() {
+  if (!op_parameter_->infer_flag_) {
+    stored_weight_ = StoreTensorData(in_tensors_.at(kWeightIndex));
+    if (stored_weight_ == nullptr) {
+      MS_LOG(ERROR) << "Store weight failed.";
+      return RET_ERROR;
+    }
+    if (in_tensors_.size() > kBiasIndex) {
+      stored_bias_ = StoreTensorData(in_tensors_.at(kBiasIndex));
+      if (stored_bias_ == nullptr) {
+        MS_LOG(ERROR) << "Store bias failed.";
+        return RET_ERROR;
+      }
+    }
+  }
+  return RET_OK;
+}
+
 kernel::LiteKernel *OpenCLConv2dTransposeCreator(const std::vector<lite::Tensor *> &inputs,
                                                  const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
                                                  const lite::Context *ctx, const kernel::KernelKey &desc) {


+3 -0  mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.h

@@ -39,10 +39,13 @@ class Conv2dTransposeOpenCLKernel : public OpenCLKernel {
   void SetConstArgs() override;
   void SetGlobalLocal() override;
   int InferShape() override;
+  int StoreConstData() override;
 
  private:
   void *padWeight_{nullptr};
   void *bias_{nullptr};
+  void *stored_weight_{nullptr};
+  void *stored_bias_{nullptr};
   bool enable_fp16_{false};
 };
 }  // namespace mindspore::kernel


+23 -2  mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc

@@ -110,7 +110,7 @@ int DepthwiseConv2dOpenCLKernel::InitWeights() {
   size_t dtype_size = is_fp16 ? sizeof(int16_t) : sizeof(float);
   auto out_info = GpuTensorInfo(out_tensors_[0]);
   // weight: o, h, w, i; o == group, i == 1
-  void *origin_weight = in_tensors_.at(kWeightIndex)->data_c();
+  void *origin_weight = stored_weight_ == nullptr ? in_tensors_.at(kWeightIndex)->data_c() : stored_weight_;
   int CO4 = UP_DIV(out_info.C, C4NUM * block_size_.C);
   int pack_weight_size = C4NUM * CO4 * parameter->kernel_h_ * parameter->kernel_w_;
 
@@ -162,6 +162,7 @@ int DepthwiseConv2dOpenCLKernel::InitWeights() {
   if (packed_weight_ == nullptr) {
     return RET_ERROR;
   }
+  FreeStoredData(stored_weight_);
   return mindspore::lite::RET_OK;
 }
 
@@ -196,12 +197,14 @@ int DepthwiseConv2dOpenCLKernel::InitBias() {
     src_type = in_tensors_.at(kBiasIndex)->data_type();
     dst_type = is_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32;
     auto element_size = in_tensors_.at(kBiasIndex)->ElementsNum();
-    ConvertBias(in_tensors_.at(kBiasIndex)->data_c(), temp_bias.data(), element_size, dtype_size, src_type, dst_type);
+    void *src_data = stored_bias_ == nullptr ? in_tensors_.at(kBiasIndex)->data_c() : stored_bias_;
+    ConvertBias(src_data, temp_bias.data(), element_size, dtype_size, src_type, dst_type);
   }
   bias_data_ = allocator->Malloc(bias_size, temp_bias.data());
   if (bias_data_ == nullptr) {
     return RET_ERROR;
   }
+  FreeStoredData(stored_bias_);
   return mindspore::lite::RET_OK;
 }
 
@@ -250,6 +253,24 @@ void DepthwiseConv2dOpenCLKernel::SetGlobalLocal() {
   OpenCLKernel::AlignGlobalLocal(global_size_, local_size_);
 }
 
+int DepthwiseConv2dOpenCLKernel::StoreConstData() {
+  if (!op_parameter_->infer_flag_) {
+    stored_weight_ = StoreTensorData(in_tensors_.at(kWeightIndex));
+    if (stored_weight_ == nullptr) {
+      MS_LOG(ERROR) << "Store weight failed.";
+      return RET_ERROR;
+    }
+    if (in_tensors_.size() > kBiasIndex) {
+      stored_bias_ = StoreTensorData(in_tensors_.at(kBiasIndex));
+      if (stored_bias_ == nullptr) {
+        MS_LOG(ERROR) << "Store bias failed.";
+        return RET_ERROR;
+      }
+    }
+  }
+  return RET_OK;
+}
+
 int DepthwiseConv2dOpenCLKernel::Run() {
   MS_LOG(DEBUG) << this->name() << " Running!";
   ocl_runtime_->SetKernelArg(kernel_, 0, out_tensors_[0]->data_c());


+3 -0  mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.h

@@ -44,10 +44,13 @@ class DepthwiseConv2dOpenCLKernel : public OpenCLKernel {
   int InitBias();
   void SetConstArgs() override;
   void SetGlobalLocal() override;
+  int StoreConstData() override;
 
  private:
   void *packed_weight_{nullptr};
+  void *stored_weight_{nullptr};
   void *bias_data_{nullptr};
+  void *stored_bias_{nullptr};
   struct {
     int H{2};
     int W{2};


+29 -7  mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.cc

@@ -140,8 +140,9 @@ int FullConnectionOpenCLKernel::InitFilter() {
   auto padWeightFp32 = reinterpret_cast<float *>(padWeight_);
   auto padWeightFp16 = reinterpret_cast<float16_t *>(padWeight_);
   memset(padWeight_, 0x00, nhw_remainder * intensor_shape.Slice * co4 * C4NUM * C4NUM * dtype_size);
-  auto originWeightFp32 = reinterpret_cast<float *>(in_tensors_.at(kWeightIndex)->data_c());
-  auto originWeightFp16 = reinterpret_cast<float16_t *>(in_tensors_.at(kWeightIndex)->data_c());
+  void *src_data = stored_weight_ == nullptr ? in_tensors_.at(kWeightIndex)->data_c() : stored_weight_;
+  auto originWeightFp32 = reinterpret_cast<float *>(src_data);
+  auto originWeightFp16 = reinterpret_cast<float16_t *>(src_data);
   bool isModelFp16 = in_tensors_.at(kWeightIndex)->data_type() == kNumberTypeFloat16;
 
   // pad weight
@@ -182,6 +183,7 @@ int FullConnectionOpenCLKernel::InitFilter() {
     }
   }
   allocator->UnmapBuffer(padWeight_);
+  FreeStoredData(stored_weight_);
   return RET_OK;
 }
 
@@ -202,19 +204,21 @@ int FullConnectionOpenCLKernel::InitBias() {
   bias_ = allocator->MapBuffer(bias_, CL_MAP_WRITE, nullptr, true);
   memset(bias_, 0x00, co4 * C4NUM * dtype_size);
   if (in_tensors_.size() == 3) {
-    if (in_tensors_[2]->data_type() == kNumberTypeFloat32 && enable_fp16_) {
+    void *src_data = stored_bias_ == nullptr ? in_tensors_.at(kBiasIndex)->data_c() : stored_bias_;
+    if (in_tensors_[kBiasIndex]->data_type() == kNumberTypeFloat32 && enable_fp16_) {
       for (int i = 0; i < CO_; i++) {
-        reinterpret_cast<float16_t *>(bias_)[i] = reinterpret_cast<float *>(in_tensors_[2]->data_c())[i];
+        reinterpret_cast<float16_t *>(bias_)[i] = reinterpret_cast<float *>(src_data)[i];
       }
-    } else if (in_tensors_[2]->data_type() == kNumberTypeFloat16 && !enable_fp16_) {
+    } else if (in_tensors_[kBiasIndex]->data_type() == kNumberTypeFloat16 && !enable_fp16_) {
       for (int i = 0; i < CO_; i++) {
-        reinterpret_cast<float *>(bias_)[i] = reinterpret_cast<float16_t *>(in_tensors_[2]->data_c())[i];
+        reinterpret_cast<float *>(bias_)[i] = reinterpret_cast<float16_t *>(src_data)[i];
       }
     } else {
-      memcpy(bias_, in_tensors_[2]->data_c(), CO_ * dtype_size);
+      memcpy(bias_, src_data, CO_ * dtype_size);
     }
   }
   allocator->UnmapBuffer(bias_);
+  FreeStoredData(stored_bias_);
   return RET_OK;
 }
 
@@ -244,6 +248,24 @@ void FullConnectionOpenCLKernel::SetConstArgs() {
   ocl_runtime_->SetKernelArg(kernel_, arg_count, static_cast<cl_int>(param->act_type_));
 }
 
+int FullConnectionOpenCLKernel::StoreConstData() {
+  if (!op_parameter_->infer_flag_) {
+    stored_weight_ = StoreTensorData(in_tensors_.at(kWeightIndex));
+    if (stored_weight_ == nullptr) {
+      MS_LOG(ERROR) << "Store weight failed.";
+      return RET_ERROR;
+    }
+    if (in_tensors_.size() > kBiasIndex) {
+      stored_bias_ = StoreTensorData(in_tensors_.at(kBiasIndex));
+      if (stored_bias_ == nullptr) {
+        MS_LOG(ERROR) << "Store bias failed.";
+        return RET_ERROR;
+      }
+    }
+  }
+  return RET_OK;
+}
+
 int FullConnectionOpenCLKernel::Run() {
   MS_LOG(DEBUG) << this->name() << " Running!";
   int arg_count = 0;


+3 -0  mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.h

@@ -36,12 +36,15 @@ class FullConnectionOpenCLKernel : public OpenCLKernel {
   void SetConstArgs() override;
   void SetGlobalLocal() override;
   int Tune() override { return lite::RET_OK; }
+  int StoreConstData() override;
 
  private:
   int InitFilter();
   int InitBias();
   void *padWeight_{nullptr};
   void *bias_{nullptr};
+  void *stored_weight_{nullptr};
+  void *stored_bias_{nullptr};
   bool enable_fp16_{false};
   bool transposeA{false};
   bool transposeB{true};


+35 -7  mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc

@@ -136,8 +136,9 @@ int MatMulOpenCLKernel::InitWeights() {
   auto padWeightFp32 = reinterpret_cast<float *>(padWeight_);
   auto padWeightFp16 = reinterpret_cast<float16_t *>(padWeight_);
   memset(padWeight_, 0x00, a * b * ci4 * co4 * C4NUM * C4NUM * dtype_size);
-  auto originWeightFp32 = reinterpret_cast<float *>(in_tensors_.at(kWeightIndex)->data_c());
-  auto originWeightFp16 = reinterpret_cast<float16_t *>(in_tensors_.at(kWeightIndex)->data_c());
+  void *src_data = stored_weight_ == nullptr ? in_tensors_.at(kWeightIndex)->data_c() : stored_weight_;
+  auto originWeightFp32 = reinterpret_cast<float *>(src_data);
+  auto originWeightFp16 = reinterpret_cast<float16_t *>(src_data);
   bool isModelFp16 = in_tensors_.at(kWeightIndex)->data_type() == kNumberTypeFloat16;
   // pad weight
   // ABCICO -> AB(CI4)(CO4)(4 from CO)(4 from CI)
@@ -181,6 +182,7 @@ int MatMulOpenCLKernel::InitWeights() {
   }
 
   allocator->UnmapBuffer(padWeight_);
+  FreeStoredData(stored_weight_);
   return InitBias();
 }
 
@@ -202,19 +204,21 @@ int MatMulOpenCLKernel::InitBias() {
   bias_ = allocator->MapBuffer(bias_, CL_MAP_WRITE, nullptr, true);
   memset(bias_, 0x00, co4 * C4NUM * dtype_size);
   if (in_tensors_.size() == 3) {
-    if (in_tensors_[2]->data_type() == kNumberTypeFloat32 && enable_fp16_) {
+    void *src_data = stored_bias_ == nullptr ? in_tensors_.at(kBiasIndex)->data_c() : stored_bias_;
+    if (in_tensors_[kBiasIndex]->data_type() == kNumberTypeFloat32 && enable_fp16_) {
       for (int i = 0; i < CO_; i++) {
-        reinterpret_cast<float16_t *>(bias_)[i] = reinterpret_cast<float *>(in_tensors_[2]->data_c())[i];
+        reinterpret_cast<float16_t *>(bias_)[i] = reinterpret_cast<float *>(src_data)[i];
       }
-    } else if (in_tensors_[2]->data_type() == kNumberTypeFloat16 && !enable_fp16_) {
+    } else if (in_tensors_[kBiasIndex]->data_type() == kNumberTypeFloat16 && !enable_fp16_) {
       for (int i = 0; i < CO_; i++) {
-        reinterpret_cast<float *>(bias_)[i] = reinterpret_cast<float16_t *>(in_tensors_[2]->data_c())[i];
+        reinterpret_cast<float *>(bias_)[i] = reinterpret_cast<float16_t *>(src_data)[i];
       }
     } else {
-      memcpy(bias_, in_tensors_[2]->data_c(), CO_ * dtype_size);
+      memcpy(bias_, src_data, CO_ * dtype_size);
     }
   }
   allocator->UnmapBuffer(bias_);
+  FreeStoredData(stored_bias_);
   return RET_OK;
 }
 
@@ -254,6 +258,24 @@ int MatMulOpenCLKernel::Run() {
   return RET_OK;
 }
 
+int MatMulOpenCLKernel::StoreConstData() {
+  if (!op_parameter_->infer_flag_) {
+    stored_weight_ = StoreTensorData(in_tensors_.at(kWeightIndex));
+    if (stored_weight_ == nullptr) {
+      MS_LOG(ERROR) << "Store weight failed.";
+      return RET_ERROR;
+    }
+    if (in_tensors_.size() > kBiasIndex) {
+      stored_bias_ = StoreTensorData(in_tensors_.at(kBiasIndex));
+      if (stored_bias_ == nullptr) {
+        MS_LOG(ERROR) << "Store bias failed.";
+        return RET_ERROR;
+      }
+    }
+  }
+  return RET_OK;
+}
+
 kernel::LiteKernel *OpenCLMatMulKernelCreator(const std::vector<lite::Tensor *> &inputs,
                                               const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
                                               const lite::Context *ctx, const kernel::KernelKey &desc) {
@@ -274,6 +296,12 @@ kernel::LiteKernel *OpenCLMatMulKernelCreator(const std::vector<lite::Tensor *>
   }
   if (!infer_shape_done) {
     MS_LOG(WARNING) << "kernel don't infer shape yet!";
+    auto ret = reinterpret_cast<MatMulOpenCLKernel *>(kernel)->StoreConstData();
+    if (ret != mindspore::lite::RET_OK) {
+      MS_LOG(ERROR) << "Store " << opParameter->name_ << " const data failed!";
+      delete kernel;
+      return nullptr;
+    }
     return kernel;
   }
   if (kernel->CheckSpecs() != RET_OK || kernel->OpenCLKernel::CheckSpecs() != RET_OK) {


+3 -0  mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.h

@@ -38,6 +38,7 @@ class MatMulOpenCLKernel : public OpenCLKernel {
   void SetGlobalLocal() override;
   int Tune() override { return lite::RET_OK; }
   int InitBias();
+  int StoreConstData() override;
 
  protected:
   void *padWeight_{nullptr};
@@ -47,6 +48,8 @@ class MatMulOpenCLKernel : public OpenCLKernel {
   int dims{};
   void *bias_{nullptr};
   int CO_{1};
+  void *stored_weight_{nullptr};
+  void *stored_bias_{nullptr};
   static constexpr int MAX_DIMS{4};  // max supported matmul dims
   bool act_weight_{false};
   std::vector<int> inShape{std::vector<int>(MAX_DIMS, 1)};


+4 -2  mindspore/lite/src/runtime/kernel/opencl/kernel/winograd.cc

@@ -108,12 +108,13 @@ void WinogradOpenCLKernel::InitFilter() {
 
   // rearrange filter
   auto filter_tensor = in_tensors_.at(1);
+  void *src_filter_data = stored_filter_ == nullptr ? filter_tensor->data_c() : stored_filter_;
 #ifndef ENABLE_ARM64
-  auto winograd_filter = GenerateWinogradFilter(filter_tensor->data_c(), filter_tensor->data_type(), CO_, CI_);
+  auto winograd_filter = GenerateWinogradFilter(src_filter_data, filter_tensor->data_type(), CO_, CI_);
   void *src_data = winograd_filter.data();
 #else
   std::unique_ptr<float[]> winograd_filter(new float[CO_ * 6 * 6 * CI_]);
-  WinogradWeightTransform(reinterpret_cast<const float *>(filter_tensor->data_c()),
+  WinogradWeightTransform(reinterpret_cast<const float *>(src_filter_data),
                           reinterpret_cast<float *>(winograd_filter.get()), nullptr, Gt, 1, 6, 3, CI_, CO_, false);
 
   void *src_data = winograd_filter.get();
@@ -136,6 +137,7 @@ void WinogradOpenCLKernel::InitFilter() {
     memcpy(packed_filter_, tmp.data(), size);
     allocator->UnmapBuffer(packed_filter_);
   }
+  FreeStoredData(stored_filter_);
 }
 
 void WinogradOpenCLKernel::AllocateMemory() {

void WinogradOpenCLKernel::AllocateMemory() {


+7 -0  mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h

@@ -195,6 +195,7 @@ class OpenCLKernel : public LiteKernel {
   virtual std::vector<BaseTuningParameter> GenerateTuningParam();
   virtual int AssignTuningParam(const BaseTuningParameter &param);
   virtual int Tune();
+  virtual int StoreConstData() { return RET_OK; }
 
   int GetImageSize(size_t idx, lite::opencl::ImageSize *img_size);
   void PrintOutput(int print_num = 10, const std::string &out_file = "");
@@ -259,6 +260,12 @@ kernel::LiteKernel *OpenCLKernelCreator(const std::vector<lite::Tensor *> &input
     delete kernel;
     return nullptr;
   }
+  ret = reinterpret_cast<OpenCLKernel *>(kernel)->StoreConstData();
+  if (ret != mindspore::lite::RET_OK) {
+    MS_LOG(ERROR) << "Store " << opParameter->name_ << " const data failed!";
+    delete kernel;
+    return nullptr;
+  }
   return kernel;
 }
 }  // namespace mindspore::kernel
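The base-class default is what keeps the change non-invasive: StoreConstData() returns RET_OK unless overridden, so the shared OpenCLKernelCreator can call it unconditionally and only kernels that actually hold constant weights override it (WinogradOpenCLKernel picks up Conv2DOpenCLKernel's stored_filter_ handling through inheritance).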


+10 -17  mindspore/lite/src/runtime/kernel/opencl/utils.cc

@@ -301,29 +301,22 @@ int CheckParamLikeTensor(const std::string &kernel_name, const std::string &tens
   return RET_OK;
 }
 
-static std::set<void *> tmp_weights;
-
-void StoreTmpWeight(lite::Tensor *tensor) {
-  MS_LOG(WARNING) << "store weight when kernel don't infer shape!";
+void *StoreTensorData(lite::Tensor *tensor) {
   if ((tensor != nullptr) && (tensor->data_c() != nullptr) && (tensor->Size() > 0)) {
-    void *new_data = malloc(tensor->Size());
-    MS_ASSERT(new_data);
-    if (new_data == nullptr) {
-      return;
+    void *stored_data = malloc(tensor->Size());
+    if (stored_data == nullptr) {
+      MS_LOG(ERROR) << "StoreTensorData Malloc Failed.";
+      return nullptr;
     }
-    memcpy(new_data, tensor->data_c(), tensor->Size());
-    tensor->set_data(new_data);
-    tmp_weights.insert(new_data);
+    memcpy(stored_data, tensor->data_c(), tensor->Size());
+    return stored_data;
   }
+  return nullptr;
 }
 
-void FreeTmpWeight(lite::Tensor *tensor) {
-  MS_ASSERT(tensor != nullptr);
-  auto data = tensor->data_c();
-  if (tmp_weights.count(data)) {
-    tmp_weights.erase(data);
+void FreeStoredData(void *data) {
+  if (data != nullptr) {
    free(data);
-    tensor->set_data(nullptr);
  }
 }
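The utils rewrite is where the ownership model changes: StoreTmpWeight() copied the data, re-pointed the tensor at the copy via set_data(), and tracked the pointer in a file-local std::set, while StoreTensorData() simply returns a heap copy that the calling kernel keeps in its own stored_* member and later hands to FreeStoredData(). The tensor itself is left untouched, and the global registry (along with its unconditional WARNING log) disappears.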



+3 -2  mindspore/lite/src/runtime/kernel/opencl/utils.h

@@ -64,8 +64,9 @@ void PackNHWCToNHWC4(void *src, void *dst, bool src_is_fp16, bool dst_is_fp16, c
 int CheckParamLikeTensor(const std::string &kernel_name, const std::string &tensor_name, lite::Tensor *tensor,
                          TypeId expect_data_type, const std::vector<int> &expect_shape);
 
-void StoreTmpWeight(lite::Tensor *tensor);
-void FreeTmpWeight(lite::Tensor *tensor);
+void *StoreTensorData(lite::Tensor *tensor);
+
+void FreeStoredData(void *data);
 
 std::vector<std::string> CreateBuildOptionsExtByDType(TypeId type_id);



+2 -2  mindspore/lite/src/scheduler.cc

@@ -271,13 +271,13 @@ int CastConstTensorsData(const std::vector<Tensor *> &tensors, std::map<Tensor *
     if (tensor->data_type() == kNumberTypeFloat32 && dst_data_type == kNumberTypeFloat16) {
       auto ret = CastConstTensorData(tensor, restored_origin_tensors, kNumberTypeFloat16);
       if (ret != RET_OK) {
-        MS_LOG(ERROR) << "Cast const tensor from fp32 to fp16 failed, tensor name : " << tensor->tensor_name();
+        MS_LOG(DEBUG) << "Cast const tensor from fp32 to fp16 failed, tensor name : " << tensor->tensor_name();
         return ret;
       }
     } else if (tensor->data_type() == kNumberTypeFloat16 && dst_data_type == kNumberTypeFloat32) {
       auto ret = CastConstTensorData(tensor, restored_origin_tensors, kNumberTypeFloat32);
       if (ret != RET_OK) {
-        MS_LOG(ERROR) << "Cast const tensor from fp16 to fp32 failed, tensor name : " << tensor->tensor_name();
+        MS_LOG(DEBUG) << "Cast const tensor from fp16 to fp32 failed, tensor name : " << tensor->tensor_name();
         return ret;
       }
     } else {

