Browse Source

add tuning

tags/v1.1.0
chenzupeng 5 years ago
parent
commit
1d0ee5ca04
66 changed files with 378 additions and 248 deletions
  1. +4
    -3
      mindspore/lite/src/runtime/kernel/opencl/kernel/activation.cc
  2. +0
    -1
      mindspore/lite/src/runtime/kernel/opencl/kernel/activation.h
  3. +4
    -4
      mindspore/lite/src/runtime/kernel/opencl/kernel/argminmax.cc
  4. +1
    -1
      mindspore/lite/src/runtime/kernel/opencl/kernel/argminmax.h
  5. +8
    -7
      mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc
  6. +0
    -1
      mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.h
  7. +5
    -5
      mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic_self.cc
  8. +0
    -1
      mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic_self.h
  9. +4
    -4
      mindspore/lite/src/runtime/kernel/opencl/kernel/batch_to_space_nd.cc
  10. +1
    -1
      mindspore/lite/src/runtime/kernel/opencl/kernel/batch_to_space_nd.h
  11. +5
    -5
      mindspore/lite/src/runtime/kernel/opencl/kernel/batchnorm.cc
  12. +0
    -1
      mindspore/lite/src/runtime/kernel/opencl/kernel/biasadd.h
  13. +5
    -5
      mindspore/lite/src/runtime/kernel/opencl/kernel/cast.cc
  14. +0
    -2
      mindspore/lite/src/runtime/kernel/opencl/kernel/cast.h
  15. +7
    -7
      mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc
  16. +0
    -1
      mindspore/lite/src/runtime/kernel/opencl/kernel/concat.h
  17. +29
    -20
      mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.cc
  18. +1
    -1
      mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.h
  19. +1
    -1
      mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc
  20. +0
    -1
      mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.h
  21. +8
    -9
      mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc
  22. +0
    -1
      mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.h
  23. +0
    -3
      mindspore/lite/src/runtime/kernel/opencl/kernel/fill.h
  24. +4
    -4
      mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.cc
  25. +1
    -1
      mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.h
  26. +4
    -4
      mindspore/lite/src/runtime/kernel/opencl/kernel/gather.cc
  27. +1
    -1
      mindspore/lite/src/runtime/kernel/opencl/kernel/gather.h
  28. +0
    -50
      mindspore/lite/src/runtime/kernel/opencl/kernel/hswish.h
  29. +6
    -6
      mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc
  30. +1
    -1
      mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.h
  31. +4
    -2
      mindspore/lite/src/runtime/kernel/opencl/kernel/one_hot.cc
  32. +0
    -1
      mindspore/lite/src/runtime/kernel/opencl/kernel/one_hot.h
  33. +4
    -3
      mindspore/lite/src/runtime/kernel/opencl/kernel/pad.cc
  34. +0
    -1
      mindspore/lite/src/runtime/kernel/opencl/kernel/pad.h
  35. +4
    -3
      mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc
  36. +0
    -1
      mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.h
  37. +0
    -1
      mindspore/lite/src/runtime/kernel/opencl/kernel/power.h
  38. +0
    -1
      mindspore/lite/src/runtime/kernel/opencl/kernel/prelu.h
  39. +13
    -6
      mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.cc
  40. +1
    -1
      mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.h
  41. +4
    -4
      mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc
  42. +0
    -1
      mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.h
  43. +4
    -3
      mindspore/lite/src/runtime/kernel/opencl/kernel/resize.cc
  44. +0
    -1
      mindspore/lite/src/runtime/kernel/opencl/kernel/resize.h
  45. +0
    -1
      mindspore/lite/src/runtime/kernel/opencl/kernel/scale.h
  46. +8
    -2
      mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.cc
  47. +1
    -1
      mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.h
  48. +4
    -4
      mindspore/lite/src/runtime/kernel/opencl/kernel/space_to_batch_nd.cc
  49. +0
    -1
      mindspore/lite/src/runtime/kernel/opencl/kernel/space_to_batch_nd.h
  50. +4
    -2
      mindspore/lite/src/runtime/kernel/opencl/kernel/space_to_depth.cc
  51. +0
    -1
      mindspore/lite/src/runtime/kernel/opencl/kernel/space_to_depth.h
  52. +4
    -4
      mindspore/lite/src/runtime/kernel/opencl/kernel/sparse_to_dense.cc
  53. +0
    -1
      mindspore/lite/src/runtime/kernel/opencl/kernel/sparse_to_dense.h
  54. +0
    -1
      mindspore/lite/src/runtime/kernel/opencl/kernel/stack.h
  55. +7
    -7
      mindspore/lite/src/runtime/kernel/opencl/kernel/strided_slice.cc
  56. +0
    -1
      mindspore/lite/src/runtime/kernel/opencl/kernel/strided_slice.h
  57. +5
    -5
      mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc
  58. +0
    -1
      mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.h
  59. +5
    -3
      mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.cc
  60. +0
    -1
      mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.h
  61. +140
    -4
      mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h
  62. +11
    -2
      mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.cc
  63. +23
    -5
      mindspore/lite/src/runtime/opencl/opencl_executor.cc
  64. +3
    -0
      mindspore/lite/src/runtime/opencl/opencl_executor.h
  65. +20
    -19
      mindspore/lite/src/runtime/opencl/opencl_runtime.cc
  66. +9
    -1
      mindspore/lite/src/runtime/opencl/opencl_runtime.h

+ 4
- 3
mindspore/lite/src/runtime/kernel/opencl/kernel/activation.cc View File

@@ -92,8 +92,9 @@ void ActivationOpenCLKernel::SetConstArgs() {
} }


void ActivationOpenCLKernel::SetGlobalLocal() { void ActivationOpenCLKernel::SetGlobalLocal() {
local_range_ = cl::NullRange;
global_range_ = {outShape.width, outShape.height};
local_size_ = {};
global_size_ = {outShape.width, outShape.height};
AlignGlobalLocal(global_size_, local_size_);
} }


int ActivationOpenCLKernel::Run() { int ActivationOpenCLKernel::Run() {
@@ -101,7 +102,7 @@ int ActivationOpenCLKernel::Run() {
int arg_idx = 0; int arg_idx = 0;
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c()); ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()); ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
auto ret = ocl_runtime_->RunKernel(kernel_, global_range_, local_range_);
auto ret = ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
if (ret != RET_OK) { if (ret != RET_OK) {
MS_LOG(ERROR) << "Run kernel:" << this->name() << " fail."; MS_LOG(ERROR) << "Run kernel:" << this->name() << " fail.";
return RET_ERROR; return RET_ERROR;


+ 0
- 1
mindspore/lite/src/runtime/kernel/opencl/kernel/activation.h View File

@@ -42,7 +42,6 @@ class ActivationOpenCLKernel : public OpenCLKernel {


private: private:
static std::string GetActTypeString(int act_type); static std::string GetActTypeString(int act_type);
cl::Kernel kernel_;
int type_; int type_;
float alpha_; float alpha_;
GpuTensorInfo outShape = GpuTensorInfo(nullptr); GpuTensorInfo outShape = GpuTensorInfo(nullptr);


+ 4
- 4
mindspore/lite/src/runtime/kernel/opencl/kernel/argminmax.cc View File

@@ -115,9 +115,9 @@ void ArgMinMaxOpenCLKernel::SetGlobalLocal() {
default: // 3 default: // 3
break; break;
} }
std::vector<size_t> local = {1, 1, 1};
std::vector<size_t> global = {static_cast<size_t>(strides_.s[0]), static_cast<size_t>(src_size_.s[1]), 1};
OpenCLKernel::AlignGlobalLocal(global, local);
local_size_ = {1, 1, 1};
global_size_ = {static_cast<size_t>(strides_.s[0]), static_cast<size_t>(src_size_.s[1]), 1};
OpenCLKernel::AlignGlobalLocal(global_size_, local_size_);
} }


int ArgMinMaxOpenCLKernel::InitWeights() { int ArgMinMaxOpenCLKernel::InitWeights() {
@@ -153,7 +153,7 @@ int ArgMinMaxOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running! "; MS_LOG(DEBUG) << this->name() << " Running! ";
ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c(), lite::opencl::MemType::BUF); ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c(), lite::opencl::MemType::BUF);
ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_[0]->data_c(), lite::opencl::MemType::BUF); ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_[0]->data_c(), lite::opencl::MemType::BUF);
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_);
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
return RET_OK; return RET_OK;
} }




+ 1
- 1
mindspore/lite/src/runtime/kernel/opencl/kernel/argminmax.h View File

@@ -38,9 +38,9 @@ class ArgMinMaxOpenCLKernel : public OpenCLKernel {
void SetConstArgs() override; void SetConstArgs() override;
void SetGlobalLocal() override; void SetGlobalLocal() override;
int InitWeights() override; int InitWeights() override;
int Tune() override { return lite::RET_OK; }


private: private:
cl::Kernel kernel_;
void *buff_{nullptr}; void *buff_{nullptr};
void *ids_{nullptr}; void *ids_{nullptr};
GpuTensorInfo im_in_{GpuTensorInfo(nullptr)}; GpuTensorInfo im_in_{GpuTensorInfo(nullptr)};


+ 8
- 7
mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc View File

@@ -124,23 +124,24 @@ int ArithmeticOpenCLKernel::CheckSpecs() {


void ArithmeticOpenCLKernel::SetGlobalLocal() { void ArithmeticOpenCLKernel::SetGlobalLocal() {
if (element_flag_) { if (element_flag_) {
local_range_ = {};
local_size_ = {};
auto out_shape = out_tensors_[0]->shape(); auto out_shape = out_tensors_[0]->shape();
if (out_shape.size() == 2) { if (out_shape.size() == 2) {
size_t H = out_shape[0]; size_t H = out_shape[0];
size_t W = UP_DIV(out_shape[1], C4NUM); size_t W = UP_DIV(out_shape[1], C4NUM);
global_range_ = {W, H};
global_size_ = {W, H};
} else { } else {
size_t H = out_shape[0] * out_shape[1]; size_t H = out_shape[0] * out_shape[1];
size_t W = out_shape[2] * UP_DIV(out_shape[3], C4NUM); size_t W = out_shape[2] * UP_DIV(out_shape[3], C4NUM);
global_range_ = {W, H};
global_size_ = {W, H};
} }
} else { } else {
local_range_ = {};
local_size_ = {};
auto out_shape = GetNHWCShape(out_tensors_[0]->shape()); auto out_shape = GetNHWCShape(out_tensors_[0]->shape());
global_range_ = {static_cast<size_t>(UP_DIV(out_shape[3], C4NUM)), static_cast<size_t>(out_shape[2]),
static_cast<size_t>(out_shape[1] * out_shape[0])};
global_size_ = {static_cast<size_t>(UP_DIV(out_shape[3], C4NUM)), static_cast<size_t>(out_shape[2]),
static_cast<size_t>(out_shape[1] * out_shape[0])};
} }
AlignGlobalLocal(global_size_, local_size_);
} }


int ArithmeticOpenCLKernel::InitWeights() { int ArithmeticOpenCLKernel::InitWeights() {
@@ -269,7 +270,7 @@ int ArithmeticOpenCLKernel::Run() {
auto input_1_ptr = inputs_weight_ptrs_[1] == nullptr ? in_tensors_[1]->data_c() : inputs_weight_ptrs_[1]; auto input_1_ptr = inputs_weight_ptrs_[1] == nullptr ? in_tensors_[1]->data_c() : inputs_weight_ptrs_[1];
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, input_1_ptr); ocl_runtime_->SetKernelArg(kernel_, arg_idx++, input_1_ptr);
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()); ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_);
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
return RET_OK; return RET_OK;
} }




+ 0
- 1
mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.h View File

@@ -39,7 +39,6 @@ class ArithmeticOpenCLKernel : public OpenCLKernel {
void SetGlobalLocal() override; void SetGlobalLocal() override;


private: private:
cl::Kernel kernel_;
bool element_flag_{true}; bool element_flag_{true};
float activation_min_{-FLT_MAX}; float activation_min_{-FLT_MAX};
float activation_max_{FLT_MAX}; float activation_max_{FLT_MAX};


+ 5
- 5
mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic_self.cc View File

@@ -131,10 +131,10 @@ void ArithmeticSelfOpenCLKernel::SetGlobalLocal() {
OC = UP_DIV(output_shape[1], C4NUM); OC = UP_DIV(output_shape[1], C4NUM);
} }
const std::vector<size_t> &max_global = ocl_runtime_->GetWorkItemSize(); const std::vector<size_t> &max_global = ocl_runtime_->GetWorkItemSize();
std::vector<size_t> local = {1, 1, 1}; // init local
std::vector<size_t> global = {OH, OW, OC};
ArithmeticSelfGetWorkGroup(global, &local, max_global[0]);
OpenCLKernel::AlignGlobalLocal(global, local);
local_size_ = {1, 1, 1}; // init local
global_size_ = {OH, OW, OC};
ArithmeticSelfGetWorkGroup(global_size_, &local_size_, max_global[0]);
OpenCLKernel::AlignGlobalLocal(global_size_, local_size_);
} }


int ArithmeticSelfOpenCLKernel::Prepare() { int ArithmeticSelfOpenCLKernel::Prepare() {
@@ -159,7 +159,7 @@ int ArithmeticSelfOpenCLKernel::Run() {
int arg_cn = 0; int arg_cn = 0;
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c()); ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c());
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c()); ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c());
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_);
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
return RET_OK; return RET_OK;
} }




+ 0
- 1
mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic_self.h View File

@@ -43,7 +43,6 @@ class ArithmeticSelfOpenCLKernel : public OpenCLKernel {
private: private:
void GetKernelName(std::string *kernel_name, ArithmeticSelfParameter *param); void GetKernelName(std::string *kernel_name, ArithmeticSelfParameter *param);
cl_int4 output_shape_ = {}; cl_int4 output_shape_ = {};
cl::Kernel kernel_;
}; };


} // namespace mindspore::kernel } // namespace mindspore::kernel


+ 4
- 4
mindspore/lite/src/runtime/kernel/opencl/kernel/batch_to_space_nd.cc View File

@@ -75,9 +75,9 @@ void BatchToSpaceNDOpenCLKernel::SetGlobalLocal() {
size_t CO4 = UP_DIV(out_tensors_[0]->Channel(), C4NUM); size_t CO4 = UP_DIV(out_tensors_[0]->Channel(), C4NUM);
std::vector<int> out_shape = out_tensors_[0]->shape(); std::vector<int> out_shape = out_tensors_[0]->shape();
cl_int4 dst_size = {(cl_int)CO4, out_shape[2], out_shape[1], out_shape[0]}; cl_int4 dst_size = {(cl_int)CO4, out_shape[2], out_shape[1], out_shape[0]};
std::vector<size_t> local = {1, 1, 1};
std::vector<size_t> global = {(size_t)dst_size.s[0], (size_t)dst_size.s[1], (size_t)dst_size.s[2]};
OpenCLKernel::AlignGlobalLocal(global, local);
local_size_ = {1, 1, 1};
global_size_ = {(size_t)dst_size.s[0], (size_t)dst_size.s[1], (size_t)dst_size.s[2]};
OpenCLKernel::AlignGlobalLocal(global_size_, local_size_);
} }


int BatchToSpaceNDOpenCLKernel::Prepare() { int BatchToSpaceNDOpenCLKernel::Prepare() {
@@ -103,7 +103,7 @@ int BatchToSpaceNDOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running! "; MS_LOG(DEBUG) << this->name() << " Running! ";
ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c()); ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c());
ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_[0]->data_c()); ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_[0]->data_c());
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_);
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);


return RET_OK; return RET_OK;
} }


+ 1
- 1
mindspore/lite/src/runtime/kernel/opencl/kernel/batch_to_space_nd.h View File

@@ -37,9 +37,9 @@ class BatchToSpaceNDOpenCLKernel : public OpenCLKernel {
int CheckSpecs() override; int CheckSpecs() override;
void SetConstArgs() override; void SetConstArgs() override;
void SetGlobalLocal() override; void SetGlobalLocal() override;
int Tune() override { return lite::RET_OK; }


private: private:
cl::Kernel kernel_;
}; };
} // namespace mindspore::kernel } // namespace mindspore::kernel
#endif #endif

+ 5
- 5
mindspore/lite/src/runtime/kernel/opencl/kernel/batchnorm.cc View File

@@ -62,10 +62,10 @@ void BatchNormOpenCLKernel::SetGlobalLocal() {
uint32_t OC = UP_DIV(output_shape[3], C4NUM); uint32_t OC = UP_DIV(output_shape[3], C4NUM);


const std::vector<size_t> &max_global = ocl_runtime_->GetWorkItemSize(); const std::vector<size_t> &max_global = ocl_runtime_->GetWorkItemSize();
std::vector<size_t> local = {1, 1, 1}; // init local
std::vector<size_t> global = {OH, OW, OC};
BatchNormGetWorkGroup(global, &local, max_global[0]);
OpenCLKernel::AlignGlobalLocal(global, local);
local_size_ = {1, 1, 1}; // init local
global_size_ = {OH, OW, OC};
BatchNormGetWorkGroup(global_size_, &local_size_, max_global[0]);
OpenCLKernel::AlignGlobalLocal(global_size_, local_size_);
} }


int BatchNormOpenCLKernel::Prepare() { int BatchNormOpenCLKernel::Prepare() {
@@ -91,7 +91,7 @@ int BatchNormOpenCLKernel::Run() {
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[3]->data_c()); // mean ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[3]->data_c()); // mean
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[4]->data_c()); // variance ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[4]->data_c()); // variance
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c()); // out tensor ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c()); // out tensor
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_);
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
return RET_OK; return RET_OK;
} }




+ 0
- 1
mindspore/lite/src/runtime/kernel/opencl/kernel/biasadd.h View File

@@ -40,7 +40,6 @@ class BiasAddOpenCLKernel : public OpenCLKernel {
private: private:
cl_int4 GetGlobalshape(); cl_int4 GetGlobalshape();


cl::Kernel kernel_;
void *BiasAdd_{nullptr}; void *BiasAdd_{nullptr};
int in_size_{}; int in_size_{};
int out_size_{}; int out_size_{};


+ 5
- 5
mindspore/lite/src/runtime/kernel/opencl/kernel/cast.cc View File

@@ -73,10 +73,10 @@ void CastOpenCLKernel::SetGlobalLocal() {
uint32_t OC = UP_DIV(input_shape[3], C4NUM); uint32_t OC = UP_DIV(input_shape[3], C4NUM);


const std::vector<size_t> &max_global = ocl_runtime_->GetWorkItemSize(); const std::vector<size_t> &max_global = ocl_runtime_->GetWorkItemSize();
std::vector<size_t> local = {1, 1, 1}; // init local
std::vector<size_t> global = {OH, OW, OC};
CastGetWorkGroup(global, &local, max_global[0]);
OpenCLKernel::AlignGlobalLocal(global, local);
local_size_ = {1, 1, 1}; // init local
global_size_ = {OH, OW, OC};
CastGetWorkGroup(global_size_, &local_size_, max_global[0]);
OpenCLKernel::AlignGlobalLocal(global_size_, local_size_);
} }


int CastOpenCLKernel::Prepare() { int CastOpenCLKernel::Prepare() {
@@ -100,7 +100,7 @@ int CastOpenCLKernel::Run() {
int arg_cn = 0; int arg_cn = 0;
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c()); // input tensor ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c()); // input tensor
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c()); // out tensor ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c()); // out tensor
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_);
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
return RET_OK; return RET_OK;
} }




+ 0
- 2
mindspore/lite/src/runtime/kernel/opencl/kernel/cast.h View File

@@ -41,8 +41,6 @@ class CastOpenCLKernel : public OpenCLKernel {


private: private:
int GetKernelName(std::string *kernel_name, CastParameter *param); int GetKernelName(std::string *kernel_name, CastParameter *param);

cl::Kernel kernel_;
}; };


} // namespace mindspore::kernel } // namespace mindspore::kernel


+ 7
- 7
mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc View File

@@ -132,17 +132,17 @@ void ConcatOpenCLKernel::SetGlobalLocal() {
if (axis_ == 3 && !Align_) { if (axis_ == 3 && !Align_) {
OH = out_shape_.s[0] * out_shape_.s[1]; OH = out_shape_.s[0] * out_shape_.s[1];
OW = out_shape_.s[2]; OW = out_shape_.s[2];
global = {OH, OW, 1};
local = {1, 1, 1};
global_size_ = {OH, OW, 1};
local_size_ = {1, 1, 1};
} else { } else {
OH = out_shape_.s[0] * out_shape_.s[1]; OH = out_shape_.s[0] * out_shape_.s[1];
OW = out_shape_.s[2]; OW = out_shape_.s[2];
OC = out_shape_.s[3]; OC = out_shape_.s[3];
global = {OH, OW, OC};
local = {1, 1, 1};
global_size_ = {OH, OW, OC};
local_size_ = {1, 1, 1};
} }
ConcatGetWorkGroup(global, &local, max_global[0]);
OpenCLKernel::AlignGlobalLocal(global, local);
ConcatGetWorkGroup(global_size_, &local_size_, max_global[0]);
OpenCLKernel::AlignGlobalLocal(global_size_, local_size_);
} }


int ConcatOpenCLKernel::Prepare() { int ConcatOpenCLKernel::Prepare() {
@@ -196,7 +196,7 @@ int ConcatOpenCLKernel::Run() {
MS_LOG(ERROR) << "unsupported input size :" << in_tensors_.size(); MS_LOG(ERROR) << "unsupported input size :" << in_tensors_.size();
return RET_ERROR; return RET_ERROR;
} }
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_);
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
return RET_OK; return RET_OK;
} }




+ 0
- 1
mindspore/lite/src/runtime/kernel/opencl/kernel/concat.h View File

@@ -53,7 +53,6 @@ class ConcatOpenCLKernel : public OpenCLKernel {


private: private:
int RunAxis0(); int RunAxis0();
cl::Kernel kernel_;
}; };


} // namespace mindspore::kernel } // namespace mindspore::kernel


+ 29
- 20
mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.cc View File

@@ -93,7 +93,7 @@ int Conv2DOpenCLKernel::Prepare() {
std::string program_name = "winograd"; std::string program_name = "winograd";
ocl_runtime_->LoadSource(program_name, winograd_source); ocl_runtime_->LoadSource(program_name, winograd_source);
ocl_runtime_->BuildKernel(kernel_4x4to36_, program_name, "Winograd4x4To36"); ocl_runtime_->BuildKernel(kernel_4x4to36_, program_name, "Winograd4x4To36");
ocl_runtime_->BuildKernel(kernel_conv_, program_name, "WinogradConvolution");
ocl_runtime_->BuildKernel(kernel_, program_name, "WinogradConvolution");
ocl_runtime_->BuildKernel(kernel_36to4x4_, program_name, "Winograd36To4x4"); ocl_runtime_->BuildKernel(kernel_36to4x4_, program_name, "Winograd36To4x4");
} else { } else {
SetBlockSize(); SetBlockSize();
@@ -101,7 +101,7 @@ int Conv2DOpenCLKernel::Prepare() {
std::string kernel_name = "Conv2D_H" + std::to_string(block_size_.H) + "W" + std::to_string(block_size_.W) + "C" + std::string kernel_name = "Conv2D_H" + std::to_string(block_size_.H) + "W" + std::to_string(block_size_.W) + "C" +
std::to_string(block_size_.C); std::to_string(block_size_.C);
ocl_runtime_->LoadSource(program_name, conv2d_source); ocl_runtime_->LoadSource(program_name, conv2d_source);
ocl_runtime_->BuildKernel(kernel_conv_, program_name, kernel_name);
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
} }


// allocate winograd memory // allocate winograd memory
@@ -329,7 +329,9 @@ void Conv2DOpenCLKernel::SetGlobalLocal() {
local_h = std::min(global_h, local_hw); local_h = std::min(global_h, local_hw);
local_w = std::min(local_hw / local_h, global_w); local_w = std::min(local_hw / local_h, global_w);
} }
AlignGlobalLocal({global_h, global_w, global_c}, {local_h, local_w, local_c});
global_size_ = {global_h, global_w, global_c};
local_size_ = {local_h, local_w, local_c};
AlignGlobalLocal(global_size_, local_size_);
} }
} }


@@ -355,11 +357,11 @@ void Conv2DOpenCLKernel::SetConstArgs() {
arg_cn = 0; arg_cn = 0;
cl_int4 conv_in_shape = {1, 36, TILES_XY_, CI_SLICES_}; cl_int4 conv_in_shape = {1, 36, TILES_XY_, CI_SLICES_};
cl_int4 conv_out_shape = {1, 36, TILES_XY_, CO_SLICES_}; cl_int4 conv_out_shape = {1, 36, TILES_XY_, CO_SLICES_};
ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, winograd_mem0_);
ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, winograd_mem1_);
ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, packed_weight_, lite::opencl::MemType::BUF);
ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, conv_in_shape);
ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn, conv_out_shape);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, winograd_mem0_);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, winograd_mem1_);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, packed_weight_, lite::opencl::MemType::BUF);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, conv_in_shape);
ocl_runtime_->SetKernelArg(kernel_, arg_cn, conv_out_shape);


arg_cn = 2; arg_cn = 2;
cl_int4 _36to4x4_in_shape = {1, 16, TILES_XY_, CO_SLICES_}; cl_int4 _36to4x4_in_shape = {1, 16, TILES_XY_, CO_SLICES_};
@@ -373,30 +375,37 @@ void Conv2DOpenCLKernel::SetConstArgs() {
cl_int4 kernel_stride = {KH_, KW_, param->stride_h_, param->stride_w_}; cl_int4 kernel_stride = {KH_, KW_, param->stride_h_, param->stride_w_};
cl_int4 pad = {param->pad_u_, param->pad_d_, param->pad_l_, param->pad_r_}; cl_int4 pad = {param->pad_u_, param->pad_d_, param->pad_l_, param->pad_r_};
cl_int2 dilation = {param->dilation_h_, param->dilation_w_}; cl_int2 dilation = {param->dilation_h_, param->dilation_w_};
ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, packed_weight_, lite::opencl::MemType::BUF);
ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, packed_bias_, lite::opencl::MemType::BUF);
ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, input_shape);
ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, output_shape);
ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, kernel_stride);
ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, pad);
ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, dilation);
ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn, act_type);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, packed_weight_, lite::opencl::MemType::BUF);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, packed_bias_, lite::opencl::MemType::BUF);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, output_shape);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, kernel_stride);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, pad);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, dilation);
ocl_runtime_->SetKernelArg(kernel_, arg_cn, act_type);
} }
} }


int Conv2DOpenCLKernel::Tune() {
if (use_winograd_) {
return RET_OK;
}
return OpenCLKernel::Tune();
}

int Conv2DOpenCLKernel::Run() { int Conv2DOpenCLKernel::Run() {
if (use_winograd_) { if (use_winograd_) {
ocl_runtime_->SetKernelArg(kernel_4x4to36_, 0, in_tensors_.front()->data_c()); ocl_runtime_->SetKernelArg(kernel_4x4to36_, 0, in_tensors_.front()->data_c());
ocl_runtime_->RunKernel(kernel_4x4to36_, global_4x4to36_, local_4x4to36_); ocl_runtime_->RunKernel(kernel_4x4to36_, global_4x4to36_, local_4x4to36_);


ocl_runtime_->RunKernel(kernel_conv_, global_conv_, local_conv_);
ocl_runtime_->RunKernel(kernel_, global_conv_, local_conv_);


ocl_runtime_->SetKernelArg(kernel_36to4x4_, 1, out_tensors_.front()->data_c()); ocl_runtime_->SetKernelArg(kernel_36to4x4_, 1, out_tensors_.front()->data_c());
ocl_runtime_->RunKernel(kernel_36to4x4_, global_36to4x4_, local_36to4x4_); ocl_runtime_->RunKernel(kernel_36to4x4_, global_36to4x4_, local_36to4x4_);
} else { } else {
ocl_runtime_->SetKernelArg(kernel_conv_, 0, in_tensors_.front()->data_c());
ocl_runtime_->SetKernelArg(kernel_conv_, 1, out_tensors_.front()->data_c());
ocl_runtime_->RunKernel(kernel_conv_, global_range_, local_range_);
ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_.front()->data_c());
ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_.front()->data_c());
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
} }
return RET_OK; return RET_OK;
} }


+ 1
- 1
mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.h View File

@@ -42,6 +42,7 @@ class Conv2DOpenCLKernel : public OpenCLKernel {
void SetConstArgs() override; void SetConstArgs() override;


int Run() override; int Run() override;
int Tune() override;


private: private:
void SetBlockSize(); void SetBlockSize();
@@ -60,7 +61,6 @@ class Conv2DOpenCLKernel : public OpenCLKernel {
} }


cl::Kernel kernel_4x4to36_; cl::Kernel kernel_4x4to36_;
cl::Kernel kernel_conv_;
cl::Kernel kernel_36to4x4_; cl::Kernel kernel_36to4x4_;
cl::NDRange global_4x4to36_, local_4x4to36_; cl::NDRange global_4x4to36_, local_4x4to36_;
cl::NDRange global_conv_, local_conv_; cl::NDRange global_conv_, local_conv_;


+ 1
- 1
mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc View File

@@ -193,7 +193,7 @@ int Conv2dTransposeOpenCLKernel::Run() {
int arg_cnt = 0; int arg_cnt = 0;
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, in_tensors_[0]->data_c()); ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, in_tensors_[0]->data_c());
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, out_tensors_[0]->data_c()); ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, out_tensors_[0]->data_c());
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_);
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
return mindspore::lite::RET_OK; return mindspore::lite::RET_OK;
} }




+ 0
- 1
mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.h View File

@@ -40,7 +40,6 @@ class Conv2dTransposeOpenCLKernel : public OpenCLKernel {
void SetGlobalLocal() override; void SetGlobalLocal() override;


private: private:
cl::Kernel kernel_;
void *padWeight_{nullptr}; void *padWeight_{nullptr};
void *bias_{nullptr}; void *bias_{nullptr};
bool enable_fp16_{false}; bool enable_fp16_{false};


+ 8
- 9
mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc View File

@@ -175,24 +175,23 @@ void DepthwiseConv2dOpenCLKernel::SetConstArgs() {
void DepthwiseConv2dOpenCLKernel::SetGlobalLocal() { void DepthwiseConv2dOpenCLKernel::SetGlobalLocal() {
// set global // set global
size_t CO4 = UP_DIV(out_tensors_[0]->Channel(), C4NUM * block_size_[2]); size_t CO4 = UP_DIV(out_tensors_[0]->Channel(), C4NUM * block_size_[2]);
std::vector<size_t> global_size = {CO4, (size_t)UP_DIV(out_tensors_[0]->Width(), block_size_[1]),
(size_t)UP_DIV(out_tensors_[0]->Height(), block_size_[0])};
global_size_ = {CO4, (size_t)UP_DIV(out_tensors_[0]->Width(), block_size_[1]),
(size_t)UP_DIV(out_tensors_[0]->Height(), block_size_[0])};
// set local // set local
const int max_group_size = ocl_runtime_->DeviceMaxWorkGroupSize(); const int max_group_size = ocl_runtime_->DeviceMaxWorkGroupSize();
int z = global_size[0];
int y = std::min(max_group_size / z, GetMaxDivisorStrategy0(global_size[2], 8));
int x = std::max(1, std::min(static_cast<int>(global_size[1]), max_group_size / (y * z)));
std::vector<size_t> local_size =
std::vector<size_t>({static_cast<size_t>(z), static_cast<size_t>(x), static_cast<size_t>(y)});
int z = global_size_[0];
int y = std::min(max_group_size / z, GetMaxDivisorStrategy0(global_size_[2], 8));
int x = std::max(1, std::min(static_cast<int>(global_size_[1]), max_group_size / (y * z)));
local_size_ = std::vector<size_t>({static_cast<size_t>(z), static_cast<size_t>(x), static_cast<size_t>(y)});


OpenCLKernel::AlignGlobalLocal(global_size, local_size);
OpenCLKernel::AlignGlobalLocal(global_size_, local_size_);
} }


int DepthwiseConv2dOpenCLKernel::Run() { int DepthwiseConv2dOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running!"; MS_LOG(DEBUG) << this->name() << " Running!";
ocl_runtime_->SetKernelArg(kernel_, 0, out_tensors_[0]->data_c()); ocl_runtime_->SetKernelArg(kernel_, 0, out_tensors_[0]->data_c());
ocl_runtime_->SetKernelArg(kernel_, 1, in_tensors_[0]->data_c()); ocl_runtime_->SetKernelArg(kernel_, 1, in_tensors_[0]->data_c());
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_);
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
return mindspore::lite::RET_OK; return mindspore::lite::RET_OK;
} }




+ 0
- 1
mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.h View File

@@ -42,7 +42,6 @@ class DepthwiseConv2dOpenCLKernel : public OpenCLKernel {
private: private:
void *packed_weight_{nullptr}; void *packed_weight_{nullptr};
void *bias_data_{nullptr}; void *bias_data_{nullptr};
cl::Kernel kernel_;
std::vector<int> block_size_{2, 2, 1}; std::vector<int> block_size_{2, 2, 1};
}; };
} // namespace mindspore::kernel } // namespace mindspore::kernel


+ 0
- 3
mindspore/lite/src/runtime/kernel/opencl/kernel/fill.h View File

@@ -43,9 +43,6 @@ class FillOpenCLKernel : public OpenCLKernel {
private: private:
int RunFill(); int RunFill();
int RunShape(); int RunShape();
cl::Kernel kernel_;

private:
float default_{0.0f}; float default_{0.0f};
}; };




+ 4
- 4
mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.cc View File

@@ -179,9 +179,9 @@ int FullConnectionOpenCLKernel::InitWeights() {
} }


void FullConnectionOpenCLKernel::SetGlobalLocal() { void FullConnectionOpenCLKernel::SetGlobalLocal() {
std::vector<size_t> local = {32, 4, 1};
std::vector<size_t> global = {UP_DIV(outShape.C, C4NUM), 4, outShape.N};
AlignGlobalLocal(global, local);
local_size_ = {32, 4, 1};
global_size_ = {UP_DIV(outShape.C, C4NUM), 4, outShape.N};
AlignGlobalLocal(global_size_, local_size_);
} }


void FullConnectionOpenCLKernel::SetConstArgs() { void FullConnectionOpenCLKernel::SetConstArgs() {
@@ -202,7 +202,7 @@ int FullConnectionOpenCLKernel::Run() {
int arg_count = 0; int arg_count = 0;
ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_tensors_[0]->data_c()); ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_tensors_[0]->data_c());
ocl_runtime_->SetKernelArg(kernel_, arg_count++, out_tensors_[0]->data_c()); ocl_runtime_->SetKernelArg(kernel_, arg_count++, out_tensors_[0]->data_c());
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_);
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
return RET_OK; return RET_OK;
} }




+ 1
- 1
mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.h View File

@@ -38,9 +38,9 @@ class FullConnectionOpenCLKernel : public OpenCLKernel {
void SetConstArgs() override; void SetConstArgs() override;
void SetGlobalLocal() override; void SetGlobalLocal() override;
int Init() override; int Init() override;
int Tune() override { return lite::RET_OK; }


private: private:
cl::Kernel kernel_;
void *padWeight_{nullptr}; void *padWeight_{nullptr};
void *bias_{nullptr}; void *bias_{nullptr};
bool enable_fp16_{false}; bool enable_fp16_{false};


+ 4
- 4
mindspore/lite/src/runtime/kernel/opencl/kernel/gather.cc View File

@@ -93,9 +93,9 @@ void GatherOpenCLKernel::SetConstArgs() {


void GatherOpenCLKernel::SetGlobalLocal() { void GatherOpenCLKernel::SetGlobalLocal() {
auto output = GpuTensorInfo(out_tensors_.front()); auto output = GpuTensorInfo(out_tensors_.front());
std::vector<size_t> local = {1, 1, 1};
std::vector<size_t> global = {output.W, output.N * output.H, output.Slice};
OpenCLKernel::AlignGlobalLocal(global, local);
local_size_ = {1, 1, 1};
global_size_ = {output.W, output.N * output.H, output.Slice};
OpenCLKernel::AlignGlobalLocal(global_size_, local_size_);
} }


int GatherOpenCLKernel::Prepare() { int GatherOpenCLKernel::Prepare() {
@@ -155,7 +155,7 @@ int GatherOpenCLKernel::Run() {
ocl_runtime_->SetKernelArg(kernel_, 0, out_tensors_.front()->data_c()); ocl_runtime_->SetKernelArg(kernel_, 0, out_tensors_.front()->data_c());
ocl_runtime_->SetKernelArg(kernel_, 1, in_tensors_.front()->data_c()); ocl_runtime_->SetKernelArg(kernel_, 1, in_tensors_.front()->data_c());
ocl_runtime_->SetKernelArg(kernel_, 2, indices_data_, lite::opencl::MemType::BUF); ocl_runtime_->SetKernelArg(kernel_, 2, indices_data_, lite::opencl::MemType::BUF);
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_);
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
return RET_OK; return RET_OK;
} }




+ 1
- 1
mindspore/lite/src/runtime/kernel/opencl/kernel/gather.h View File

@@ -38,12 +38,12 @@ class GatherOpenCLKernel : public OpenCLKernel {
int CheckSpecs() override; int CheckSpecs() override;
void SetConstArgs() override; void SetConstArgs() override;
void SetGlobalLocal() override; void SetGlobalLocal() override;
int Tune() override { return lite::RET_OK; }


protected: protected:
int UpdateWeights(); int UpdateWeights();


private: private:
cl::Kernel kernel_;
int32_t *indices_data_{nullptr}; int32_t *indices_data_{nullptr};
int axis_ = {0}; int axis_ = {0};
}; };


+ 0
- 50
mindspore/lite/src/runtime/kernel/opencl/kernel/hswish.h View File

@@ -1,50 +0,0 @@
/**
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_HSWISH_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_HSWISH_H_

#include <vector>
#include "mindspore/lite/nnacl/fp32/activation_fp32.h"
#include "src/runtime/kernel/opencl/opencl_kernel.h"

namespace mindspore::kernel {

class HswishOpenCLKernel : public OpenCLKernel {
public:
HswishOpenCLKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs)
: OpenCLKernel(parameter, inputs, outputs) {}

~HswishOpenCLKernel() override = default;

int Init() override;

int Run() override;

private:
int InferShapeTo4D();
cl::Kernel kernel_;

private:
size_t N_{1};
size_t H_{1};
size_t W_{1};
size_t C_{1};
};

} // namespace mindspore::kernel
#endif

+ 6
- 6
mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc View File

@@ -137,11 +137,11 @@ int MatMulOpenCLKernel::InitWeights() {


void MatMulOpenCLKernel::SetGlobalLocal() { void MatMulOpenCLKernel::SetGlobalLocal() {
// local size should less than MAX_GROUP_SIZE // local size should less than MAX_GROUP_SIZE
std::vector<size_t> local = {32, 4, 1};
std::vector<size_t> global = {UP_DIV(static_cast<size_t>(outShape[3]), C4NUM),
4 * static_cast<size_t>(outShape[0]) * static_cast<size_t>(outShape[1]),
static_cast<size_t>(outShape[2])};
AlignGlobalLocal(global, local);
local_size_ = {32, 4, 1};
global_size_ = {UP_DIV(static_cast<size_t>(outShape[3]), C4NUM),
4 * static_cast<size_t>(outShape[0]) * static_cast<size_t>(outShape[1]),
static_cast<size_t>(outShape[2])};
AlignGlobalLocal(global_size_, local_size_);
} }


void MatMulOpenCLKernel::SetConstArgs() { void MatMulOpenCLKernel::SetConstArgs() {
@@ -158,7 +158,7 @@ int MatMulOpenCLKernel::Run() {
int arg_count = 0; int arg_count = 0;
ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_tensors_[0]->data_c()); ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_tensors_[0]->data_c());
ocl_runtime_->SetKernelArg(kernel_, arg_count++, out_tensors_[0]->data_c()); ocl_runtime_->SetKernelArg(kernel_, arg_count++, out_tensors_[0]->data_c());
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_);
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
return mindspore::lite::RET_OK; return mindspore::lite::RET_OK;
} }




+ 1
- 1
mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.h View File

@@ -37,9 +37,9 @@ class MatMulOpenCLKernel : public OpenCLKernel {
int InitWeights() override; int InitWeights() override;
void SetConstArgs() override; void SetConstArgs() override;
void SetGlobalLocal() override; void SetGlobalLocal() override;
int Tune() override { return lite::RET_OK; }


private: private:
cl::Kernel kernel_;
void *padWeight_{nullptr}; void *padWeight_{nullptr};
bool enable_fp16_{false}; bool enable_fp16_{false};
bool transposeA{false}; bool transposeA{false};


+ 4
- 2
mindspore/lite/src/runtime/kernel/opencl/kernel/one_hot.cc View File

@@ -85,14 +85,16 @@ void OneHotOpenCLKernel::SetConstArgs() {
ocl_runtime_->SetKernelArg(kernel_, arg_idx, static_cast<int>(out_shape_.C)); ocl_runtime_->SetKernelArg(kernel_, arg_idx, static_cast<int>(out_shape_.C));
} }
void OneHotOpenCLKernel::SetGlobalLocal() { void OneHotOpenCLKernel::SetGlobalLocal() {
global_range_ = {out_shape_.Slice, out_shape_.W, out_shape_.H * out_shape_.N};
local_size_ = {};
global_size_ = {out_shape_.Slice, out_shape_.W, out_shape_.H * out_shape_.N};
AlignGlobalLocal(global_size_, local_size_);
} }


int OneHotOpenCLKernel::Run() { int OneHotOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running!"; MS_LOG(DEBUG) << this->name() << " Running!";
ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c()); ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c());
ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_[0]->data_c()); ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_[0]->data_c());
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_);
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
return mindspore::lite::RET_OK; return mindspore::lite::RET_OK;
} }




+ 0
- 1
mindspore/lite/src/runtime/kernel/opencl/kernel/one_hot.h View File

@@ -39,7 +39,6 @@ class OneHotOpenCLKernel : public OpenCLKernel {
void SetGlobalLocal() override; void SetGlobalLocal() override;


private: private:
cl::Kernel kernel_;
int depth_{0}; int depth_{0};
float on_value_{1.0f}; float on_value_{1.0f};
float off_value_{0.0f}; float off_value_{0.0f};


+ 4
- 3
mindspore/lite/src/runtime/kernel/opencl/kernel/pad.cc View File

@@ -97,14 +97,15 @@ void PadOpenCLKernel::SetConstArgs() {
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, io_slices); ocl_runtime_->SetKernelArg(kernel_, arg_cn++, io_slices);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, pad_before); ocl_runtime_->SetKernelArg(kernel_, arg_cn++, pad_before);
ocl_runtime_->SetKernelArg(kernel_, arg_cn, static_cast<cl_float>(param_->constant_value_)); ocl_runtime_->SetKernelArg(kernel_, arg_cn, static_cast<cl_float>(param_->constant_value_));

AlignGlobalLocal({output.N * output.H, output.W, output.Slice}, {8, 4, 1});
local_size_ = {8, 4, 1};
global_size_ = {output.N * output.H, output.W, output.Slice};
AlignGlobalLocal(global_size_, local_size_);
} }


int PadOpenCLKernel::Run() { int PadOpenCLKernel::Run() {
ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_.front()->data_c()); ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_.front()->data_c());
ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_.front()->data_c()); ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_.front()->data_c());
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_);
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
return RET_OK; return RET_OK;
} }




+ 0
- 1
mindspore/lite/src/runtime/kernel/opencl/kernel/pad.h View File

@@ -41,7 +41,6 @@ class PadOpenCLKernel : public OpenCLKernel {
int Run() override; int Run() override;


private: private:
cl::Kernel kernel_;
PadParameter *param_; PadParameter *param_;
}; };
} // namespace mindspore::kernel } // namespace mindspore::kernel


+ 4
- 3
mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc View File

@@ -86,8 +86,9 @@ void PoolingOpenCLKernel::SetGlobalLocal() {
const size_t global_x = out_tensors_[0]->shape()[1]; const size_t global_x = out_tensors_[0]->shape()[1];
const size_t global_y = out_tensors_[0]->shape()[2]; const size_t global_y = out_tensors_[0]->shape()[2];
const size_t global_z = UP_DIV(out_tensors_[0]->shape()[3], C4NUM); const size_t global_z = UP_DIV(out_tensors_[0]->shape()[3], C4NUM);
global_range_ = {global_z, global_y, global_x};
local_range_ = {};
global_size_ = {global_z, global_y, global_x};
local_size_ = {};
AlignGlobalLocal(global_size_, local_size_);
} }


void PoolingOpenCLKernel::SetConstArgs() { void PoolingOpenCLKernel::SetConstArgs() {
@@ -111,7 +112,7 @@ int PoolingOpenCLKernel::Run() {
int arg_idx = 0; int arg_idx = 0;
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c()); ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()); ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_);
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
return mindspore::lite::RET_OK; return mindspore::lite::RET_OK;
} }




+ 0
- 1
mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.h View File

@@ -39,7 +39,6 @@ class PoolingOpenCLKernel : public OpenCLKernel {


private: private:
PoolingParameter *parameter_; PoolingParameter *parameter_;
cl::Kernel kernel_;
std::vector<size_t> local_size_; std::vector<size_t> local_size_;
std::vector<size_t> global_size_; std::vector<size_t> global_size_;
}; };


+ 0
- 1
mindspore/lite/src/runtime/kernel/opencl/kernel/power.h View File

@@ -37,7 +37,6 @@ class PowerOpenCLKernel : public OpenCLKernel {


private: private:
int InferShapeTo4D(); int InferShapeTo4D();
cl::Kernel kernel_;


private: private:
size_t N_{1}; size_t N_{1};


+ 0
- 1
mindspore/lite/src/runtime/kernel/opencl/kernel/prelu.h View File

@@ -37,7 +37,6 @@ class PReluOpenCLKernel : public OpenCLKernel {
int InitWeights() override; int InitWeights() override;


private: private:
cl::Kernel kernel_;
bool enable_fp16_{false}; bool enable_fp16_{false};
int batch_size_{}; int batch_size_{};
int C_{}; int C_{};


+ 13
- 6
mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.cc View File

@@ -148,15 +148,22 @@ void ReduceOpenCLKernel::SetGlobalLocal() {
int h = shapex[1]; int h = shapex[1];
int c = shapex[3]; int c = shapex[3];
int c4 = UP_DIV(c, C4NUM); int c4 = UP_DIV(c, C4NUM);
std::vector<size_t> local = {};
local_size_ = {};
if (use_local_) { if (use_local_) {
local = {1, LOCAL_CACHE_THREAD, LOCAL_CACHE_THREAD};
local_size_ = {1, LOCAL_CACHE_THREAD, LOCAL_CACHE_THREAD};
} }
std::vector<size_t> global = {static_cast<size_t>(c4), 1, 1};
global_size_ = {static_cast<size_t>(c4), 1, 1};
if (wc_reduce_) { if (wc_reduce_) {
global = {static_cast<size_t>(h), 1, 1};
global_size_ = {static_cast<size_t>(h), 1, 1};
} }
AlignGlobalLocal(global, local);
AlignGlobalLocal(global_size_, local_size_);
}

int ReduceOpenCLKernel::Tune() {
if (use_local_) {
return RET_OK;
}
return OpenCLKernel::Tune();
} }


int ReduceOpenCLKernel::Run() { int ReduceOpenCLKernel::Run() {
@@ -164,7 +171,7 @@ int ReduceOpenCLKernel::Run() {
int arg_idx = 0; int arg_idx = 0;
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c()); ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()); ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_);
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
return mindspore::lite::RET_OK; return mindspore::lite::RET_OK;
} }




+ 1
- 1
mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.h View File

@@ -36,11 +36,11 @@ class ReduceOpenCLKernel : public OpenCLKernel {
int CheckSpecs() override; int CheckSpecs() override;
void SetConstArgs() override; void SetConstArgs() override;
void SetGlobalLocal() override; void SetGlobalLocal() override;
int Tune() override;


private: private:
cl_float4 GenC4Mask(); cl_float4 GenC4Mask();
static std::string GetReduceTypeStr(int type); static std::string GetReduceTypeStr(int type);
cl::Kernel kernel_;
GpuTensorInfo outShape = GpuTensorInfo(nullptr); GpuTensorInfo outShape = GpuTensorInfo(nullptr);
bool use_local_{false}; bool use_local_{false};
bool wc_reduce_{false}; bool wc_reduce_{false};


+ 4
- 4
mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc View File

@@ -55,9 +55,9 @@ void ReshapeOpenCLKernel::SetConstArgs() {


void ReshapeOpenCLKernel::SetGlobalLocal() { void ReshapeOpenCLKernel::SetGlobalLocal() {
auto out = GpuTensorInfo(out_tensors_.front()); auto out = GpuTensorInfo(out_tensors_.front());
std::vector<size_t> local = {};
std::vector<size_t> global{out.width, out.height};
OpenCLKernel::AlignGlobalLocal(global, local);
local_size_ = {};
global_size_ = {out.width, out.height};
OpenCLKernel::AlignGlobalLocal(global_size_, local_size_);
} }


int ReshapeOpenCLKernel::Prepare() { int ReshapeOpenCLKernel::Prepare() {
@@ -81,7 +81,7 @@ int ReshapeOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running!"; MS_LOG(DEBUG) << this->name() << " Running!";
ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c()); ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c());
ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_[0]->data_c()); ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_[0]->data_c());
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_);
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
return RET_OK; return RET_OK;
} }




+ 0
- 1
mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.h View File

@@ -37,7 +37,6 @@ class ReshapeOpenCLKernel : public OpenCLKernel {
void SetGlobalLocal() override; void SetGlobalLocal() override;


private: private:
cl::Kernel kernel_;
}; };
} // namespace mindspore::kernel } // namespace mindspore::kernel




+ 4
- 3
mindspore/lite/src/runtime/kernel/opencl/kernel/resize.cc View File

@@ -100,9 +100,10 @@ void ResizeOpenCLKernel::SetConstArgs() {
} }


void ResizeOpenCLKernel::SetGlobalLocal() { void ResizeOpenCLKernel::SetGlobalLocal() {
local_range_ = {};
local_size_ = {};
auto out_shape = GpuTensorInfo(out_tensors_[0]); auto out_shape = GpuTensorInfo(out_tensors_[0]);
global_range_ = {out_shape.Slice, out_shape.W, out_shape.H};
global_size_ = {out_shape.Slice, out_shape.W, out_shape.H};
AlignGlobalLocal(global_size_, local_size_);
} }


int ResizeOpenCLKernel::Run() { int ResizeOpenCLKernel::Run() {
@@ -110,7 +111,7 @@ int ResizeOpenCLKernel::Run() {
int arg_idx = 0; int arg_idx = 0;
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c()); ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()); ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_);
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
return RET_OK; return RET_OK;
} }




+ 0
- 1
mindspore/lite/src/runtime/kernel/opencl/kernel/resize.h View File

@@ -40,7 +40,6 @@ class ResizeOpenCLKernel : public OpenCLKernel {
private: private:
float getResizeScaleFactor(int input_size, int output_size); float getResizeScaleFactor(int input_size, int output_size);


cl::Kernel kernel_;
bool alignCorner{false}; bool alignCorner{false};
bool preserveAspectRatio{false}; bool preserveAspectRatio{false};
}; };


+ 0
- 1
mindspore/lite/src/runtime/kernel/opencl/kernel/scale.h View File

@@ -37,7 +37,6 @@ class ScaleOpenCLKernel : public OpenCLKernel {
private: private:
void Image2dGetWorkGroupSize(); void Image2dGetWorkGroupSize();


cl::Kernel kernel_;
bool weight_vector_flag_{true}; bool weight_vector_flag_{true};
bool broadcast_flag_{false}; bool broadcast_flag_{false};
bool broadcast_H_flag_{false}; bool broadcast_H_flag_{false};


+ 8
- 2
mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.cc View File

@@ -116,6 +116,13 @@ void SoftmaxOpenCLKernel::SetGlobalLocal() {
AlignGlobalLocal(global_size_, local_size_); AlignGlobalLocal(global_size_, local_size_);
} }


int SoftmaxOpenCLKernel::Tune() {
if (onexone_flag_) {
return RET_OK;
}
return OpenCLKernel::Tune();
}

void SoftmaxOpenCLKernel::SetConstArgs() { void SoftmaxOpenCLKernel::SetConstArgs() {
int arg_idx = 2; int arg_idx = 2;
int channel = out_shape.C; int channel = out_shape.C;
@@ -133,8 +140,7 @@ int SoftmaxOpenCLKernel::Run() {
int arg_idx = 0; int arg_idx = 0;
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c()); ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()); ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
// run opengl kernel
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_);
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
return lite::RET_OK; return lite::RET_OK;
} }




+ 1
- 1
mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.h View File

@@ -38,6 +38,7 @@ class SoftmaxOpenCLKernel : public OpenCLKernel {
int CheckSpecs() override; int CheckSpecs() override;
void SetConstArgs() override; void SetConstArgs() override;
void SetGlobalLocal() override; void SetGlobalLocal() override;
int Tune() override;


private: private:
int InitGlobalSize(); int InitGlobalSize();
@@ -45,7 +46,6 @@ class SoftmaxOpenCLKernel : public OpenCLKernel {
int SetWorkGroupSize(); int SetWorkGroupSize();
std::vector<float> GetMaskForLastChannel(int channels); std::vector<float> GetMaskForLastChannel(int channels);


cl::Kernel kernel_;
SoftmaxParameter *parameter_; SoftmaxParameter *parameter_;
bool onexone_flag_{false}; bool onexone_flag_{false};
std::vector<size_t> local_size_; std::vector<size_t> local_size_;


+ 4
- 4
mindspore/lite/src/runtime/kernel/opencl/kernel/space_to_batch_nd.cc View File

@@ -80,9 +80,9 @@ void SpaceToBatchNDOpenCLKernel::SetConstArgs() {
void SpaceToBatchNDOpenCLKernel::SetGlobalLocal() { void SpaceToBatchNDOpenCLKernel::SetGlobalLocal() {
size_t CO4 = UP_DIV(out_tensors_[0]->Channel(), C4NUM); size_t CO4 = UP_DIV(out_tensors_[0]->Channel(), C4NUM);
cl_int4 dst_size = {(cl_int)CO4, out_tensors_[0]->Width(), out_tensors_[0]->Height(), out_tensors_[0]->Batch()}; cl_int4 dst_size = {(cl_int)CO4, out_tensors_[0]->Width(), out_tensors_[0]->Height(), out_tensors_[0]->Batch()};
std::vector<size_t> local = {1, 1, 1};
std::vector<size_t> global = {(size_t)dst_size.s[0], (size_t)dst_size.s[1], (size_t)dst_size.s[2]};
OpenCLKernel::AlignGlobalLocal(global, local);
local_size_ = {1, 1, 1};
global_size_ = {(size_t)dst_size.s[0], (size_t)dst_size.s[1], (size_t)dst_size.s[2]};
OpenCLKernel::AlignGlobalLocal(global_size_, local_size_);
} }


int SpaceToBatchNDOpenCLKernel::Prepare() { int SpaceToBatchNDOpenCLKernel::Prepare() {
@@ -109,7 +109,7 @@ int SpaceToBatchNDOpenCLKernel::Run() {


ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c()); ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c());
ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_[0]->data_c()); ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_[0]->data_c());
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_);
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);


return RET_OK; return RET_OK;
} }


+ 0
- 1
mindspore/lite/src/runtime/kernel/opencl/kernel/space_to_batch_nd.h View File

@@ -39,7 +39,6 @@ class SpaceToBatchNDOpenCLKernel : public OpenCLKernel {
void SetGlobalLocal() override; void SetGlobalLocal() override;


private: private:
cl::Kernel kernel_;
}; };
} // namespace mindspore::kernel } // namespace mindspore::kernel
#endif #endif

+ 4
- 2
mindspore/lite/src/runtime/kernel/opencl/kernel/space_to_depth.cc View File

@@ -69,7 +69,9 @@ void SpaceToDepthOpenCLKernel::SetConstArgs() {
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, ci_size); ocl_runtime_->SetKernelArg(kernel_, arg_idx++, ci_size);
} }
void SpaceToDepthOpenCLKernel::SetGlobalLocal() { void SpaceToDepthOpenCLKernel::SetGlobalLocal() {
global_range_ = {out_shape_.Slice, out_shape_.W, out_shape_.H * out_shape_.N};
local_size_ = {};
global_size_ = {out_shape_.Slice, out_shape_.W, out_shape_.H * out_shape_.N};
AlignGlobalLocal(global_size_, local_size_);
} }


int SpaceToDepthOpenCLKernel::Run() { int SpaceToDepthOpenCLKernel::Run() {
@@ -77,7 +79,7 @@ int SpaceToDepthOpenCLKernel::Run() {
int arg_idx = 0; int arg_idx = 0;
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c()); ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()); ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_);
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
return mindspore::lite::RET_OK; return mindspore::lite::RET_OK;
} }




+ 0
- 1
mindspore/lite/src/runtime/kernel/opencl/kernel/space_to_depth.h View File

@@ -38,7 +38,6 @@ class SpaceToDepthOpenCLKernel : public OpenCLKernel {
void SetGlobalLocal() override; void SetGlobalLocal() override;


private: private:
cl::Kernel kernel_;
GpuTensorInfo in_shape_ = GpuTensorInfo(nullptr); GpuTensorInfo in_shape_ = GpuTensorInfo(nullptr);
GpuTensorInfo out_shape_ = GpuTensorInfo(nullptr); GpuTensorInfo out_shape_ = GpuTensorInfo(nullptr);
}; };


+ 4
- 4
mindspore/lite/src/runtime/kernel/opencl/kernel/sparse_to_dense.cc View File

@@ -136,11 +136,11 @@ void SparseToDenseOpenCLKernel::SetConstArgs() {
} }


void SparseToDenseOpenCLKernel::SetGlobalLocal() { void SparseToDenseOpenCLKernel::SetGlobalLocal() {
std::vector<size_t> local = {1, 1};
local_size_ = {1, 1};
size_t OH = n_ * h_; size_t OH = n_ * h_;
size_t OW = w_ * UP_DIV(c_, C4NUM); size_t OW = w_ * UP_DIV(c_, C4NUM);
std::vector<size_t> global = {OH, OW};
OpenCLKernel::AlignGlobalLocal(global, local);
global_size_ = {OH, OW};
OpenCLKernel::AlignGlobalLocal(global_size_, local_size_);
} }


int SparseToDenseOpenCLKernel::Prepare() { int SparseToDenseOpenCLKernel::Prepare() {
@@ -209,7 +209,7 @@ int SparseToDenseOpenCLKernel::Run() {
} else { } else {
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, weight_scalar_); ocl_runtime_->SetKernelArg(kernel_, arg_cn++, weight_scalar_);
} }
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_);
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
return RET_OK; return RET_OK;
} }




+ 0
- 1
mindspore/lite/src/runtime/kernel/opencl/kernel/sparse_to_dense.h View File

@@ -43,7 +43,6 @@ class SparseToDenseOpenCLKernel : public OpenCLKernel {
int InitOutputToDefault(); int InitOutputToDefault();


private: private:
cl::Kernel kernel_;
// bool IndicesIsScalar{false}; // bool IndicesIsScalar{false};
bool enable_fp16_{false}; bool enable_fp16_{false};
float default_{0.0f}; float default_{0.0f};


+ 0
- 1
mindspore/lite/src/runtime/kernel/opencl/kernel/stack.h View File

@@ -44,7 +44,6 @@ class StackOpenCLKernel : public OpenCLKernel {


int InferOutTensorShapeTo4D(cl_int4 *output_shape); int InferOutTensorShapeTo4D(cl_int4 *output_shape);


cl::Kernel kernel_;
int axis_{0}; int axis_{0};
size_t N_{1}; size_t N_{1};
size_t H_{1}; size_t H_{1};


+ 7
- 7
mindspore/lite/src/runtime/kernel/opencl/kernel/strided_slice.cc View File

@@ -164,24 +164,24 @@ void StridedSliceOpenCLKernel::SetConstArgs() {


void StridedSliceOpenCLKernel::SetGlobalLocal() { void StridedSliceOpenCLKernel::SetGlobalLocal() {
auto output_info = GpuTensorInfo(out_tensors_.front()); auto output_info = GpuTensorInfo(out_tensors_.front());
std::vector<size_t> global = {output_info.N * output_info.H, output_info.W, output_info.Slice};
global_size_ = {output_info.N * output_info.H, output_info.W, output_info.Slice};


const int max_divider = 8; const int max_divider = 8;
auto max_work_group_size = ocl_runtime_->DeviceMaxWorkGroupSize(); auto max_work_group_size = ocl_runtime_->DeviceMaxWorkGroupSize();
size_t local_c = GetMaxDivisorStrategy0(global[2], max_divider);
size_t local_c = GetMaxDivisorStrategy0(global_size_[2], max_divider);
local_c = std::max<size_t>(local_c, 1); local_c = std::max<size_t>(local_c, 1);
size_t local_hw = max_work_group_size / local_c; size_t local_hw = max_work_group_size / local_c;
size_t local_h = std::min(UP_DIV(global[0], 2), local_hw);
size_t local_w = std::min(local_hw / local_h, global[1]);
std::vector<size_t> local = {local_h, local_w, local_c};
AlignGlobalLocal(global, local);
size_t local_h = std::min(UP_DIV(global_size_[0], 2), local_hw);
size_t local_w = std::min(local_hw / local_h, global_size_[1]);
local_size_ = {local_h, local_w, local_c};
AlignGlobalLocal(global_size_, local_size_);
} }


int StridedSliceOpenCLKernel::Run() { int StridedSliceOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running! "; MS_LOG(DEBUG) << this->name() << " Running! ";
ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c()); ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c());
ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_[0]->data_c()); ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_[0]->data_c());
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_);
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
return RET_OK; return RET_OK;
} }




+ 0
- 1
mindspore/lite/src/runtime/kernel/opencl/kernel/strided_slice.h View File

@@ -42,7 +42,6 @@ class StridedSliceOpenCLKernel : public OpenCLKernel {
private: private:
int InitConstArgs(); int InitConstArgs();


cl::Kernel kernel_;
cl_int4 input_shape_{}; cl_int4 input_shape_{};
cl_int4 output_shape_{}; cl_int4 output_shape_{};
cl_int2 io_slices_{}; cl_int2 io_slices_{};


+ 5
- 5
mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc View File

@@ -51,13 +51,13 @@ void ToFormatOpenCLKernel::SetConstArgs() {
} }


void ToFormatOpenCLKernel::SetGlobalLocal() { void ToFormatOpenCLKernel::SetGlobalLocal() {
std::vector<size_t> global = {N_ * H_, W_, UP_DIV(C_, C4NUM)};
std::vector<size_t> local = {8, 16, 3};
global_size_ = {N_ * H_, W_, UP_DIV(C_, C4NUM)};
local_size_ = {8, 16, 3};
size_t max_work_group_size = ocl_runtime_->DeviceMaxWorkGroupSize(); size_t max_work_group_size = ocl_runtime_->DeviceMaxWorkGroupSize();
if (max_work_group_size < 384) { if (max_work_group_size < 384) {
local[2] = 1;
local_size_[2] = 1;
} }
OpenCLKernel::AlignGlobalLocal(global, local);
OpenCLKernel::AlignGlobalLocal(global_size_, local_size_);
} }


int ToFormatOpenCLKernel::Prepare() { int ToFormatOpenCLKernel::Prepare() {
@@ -97,7 +97,7 @@ int ToFormatOpenCLKernel::Run() {
auto dst_mem_type = out_mem_type_; auto dst_mem_type = out_mem_type_;
ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_.front()->data_c(), src_mem_type); ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_.front()->data_c(), src_mem_type);
ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_.front()->data_c(), dst_mem_type); ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_.front()->data_c(), dst_mem_type);
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_);
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
return RET_OK; return RET_OK;
} }




+ 0
- 1
mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.h View File

@@ -38,7 +38,6 @@ class ToFormatOpenCLKernel : public OpenCLKernel {
void SetGlobalLocal() override; void SetGlobalLocal() override;


private: private:
cl::Kernel kernel_;
size_t N_{1}; size_t N_{1};
size_t H_{1}; size_t H_{1};
size_t W_{1}; size_t W_{1};


+ 5
- 3
mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.cc View File

@@ -96,11 +96,13 @@ void TransposeOpenCLKernel::SetGlobalLocal() {
size_t w = shapex[2]; size_t w = shapex[2];
size_t c = shapex[3]; size_t c = shapex[3];
size_t c4 = UP_DIV(c, 4); size_t c4 = UP_DIV(c, 4);
local_size_ = {};
if (type == TransposeType::AXIS0312) { // NHWC -> NCHW if (type == TransposeType::AXIS0312) { // NHWC -> NCHW
global_range_ = {UP_DIV(h, C4NUM), w, c4};
global_size_ = {UP_DIV(h, C4NUM), w, c4};
} else if (type == TransposeType::AXIS0231) { // NCHW -> NHWC } else if (type == TransposeType::AXIS0231) { // NCHW -> NHWC
global_range_ = {h, UP_DIV(w, C4NUM), c4};
global_size_ = {h, UP_DIV(w, C4NUM), c4};
} }
AlignGlobalLocal(global_size_, local_size_);
} }


int TransposeOpenCLKernel::Run() { int TransposeOpenCLKernel::Run() {
@@ -108,7 +110,7 @@ int TransposeOpenCLKernel::Run() {
int arg_idx = 0; int arg_idx = 0;
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c()); ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()); ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_);
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
return mindspore::lite::RET_OK; return mindspore::lite::RET_OK;
} }




+ 0
- 1
mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.h View File

@@ -41,7 +41,6 @@ class TransposeOpenCLKernel : public OpenCLKernel {
void SetGlobalLocal() override; void SetGlobalLocal() override;


private: private:
cl::Kernel kernel_;
TransposeType type{TransposeType::AXIS0312}; TransposeType type{TransposeType::AXIS0312};
}; };
} // namespace mindspore::kernel } // namespace mindspore::kernel


+ 140
- 4
mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h View File

@@ -16,8 +16,10 @@


#ifndef MINDSPORE_LITE_SRC_OPENCL_KERNEL_H_ #ifndef MINDSPORE_LITE_SRC_OPENCL_KERNEL_H_
#define MINDSPORE_LITE_SRC_OPENCL_KERNEL_H_ #define MINDSPORE_LITE_SRC_OPENCL_KERNEL_H_
#define MAX_PROFILING_TIME_MILLI_SECOND 10 * 1000 // 10 seconds


#include <vector> #include <vector>
#include <set>
#include "src/lite_kernel.h" #include "src/lite_kernel.h"
#include "include/errorcode.h" #include "include/errorcode.h"
#include "src/runtime/opencl/opencl_runtime.h" #include "src/runtime/opencl/opencl_runtime.h"
@@ -137,6 +139,16 @@ struct GpuTensorInfo {
size_t NDim{}; size_t NDim{};
}; };


struct BaseTuningParameter {
std::vector<size_t> local_size;
friend std::ostream &operator<<(std::ostream &ostrm, const BaseTuningParameter &a) {
ostrm << "LocalSize:";
for (auto i : a.local_size) {
ostrm << i << ",";
}
return ostrm;
}
};
class OpenCLKernel : public LiteKernel { class OpenCLKernel : public LiteKernel {
public: public:
OpenCLKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs, OpenCLKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
@@ -158,7 +170,9 @@ class OpenCLKernel : public LiteKernel {
for (size_t i = 0; i < local.size(); i++) { for (size_t i = 0; i < local.size(); i++) {
MS_LOG(DEBUG) << "local[" << i << "] = " << local.at(i); MS_LOG(DEBUG) << "local[" << i << "] = " << local.at(i);
} }

if (local.empty()) {
local_range_ = cl::NullRange;
}
if (global.size() == 1) { if (global.size() == 1) {
global_range_ = cl::NDRange(internal_global_ws.at(0)); global_range_ = cl::NDRange(internal_global_ws.at(0));
if (!local.empty()) { if (!local.empty()) {
@@ -209,13 +223,135 @@ class OpenCLKernel : public LiteKernel {
lite::opencl::MemType GetMemType() { return out_mem_type_; } lite::opencl::MemType GetMemType() { return out_mem_type_; }
void SetMemType(lite::opencl::MemType mem_type) { out_mem_type_ = mem_type; } void SetMemType(lite::opencl::MemType mem_type) { out_mem_type_ = mem_type; }


virtual std::vector<BaseTuningParameter> GenerateTuningParam() {
size_t ndim = global_size_.size();
std::vector<BaseTuningParameter> tuning_params = {};
if (ndim == 0) {
MS_LOG(ERROR) << "Generate tuning param failed, global_size_ is null.";
return tuning_params;
}
BaseTuningParameter default_tuning_param = BaseTuningParameter();
tuning_params.push_back(default_tuning_param);
std::vector<size_t> max_work_items = ocl_runtime_->GetWorkItemSize();
size_t max_workgroup_size = ocl_runtime_->GetMaxWorkGroupSize(kernel_);
size_t MIN_WORKGROUP_SIZE = 8;
std::set<size_t> candidate_x = GenerateLocalByGlobal(global_size_[0]);
std::set<size_t> candidate_y = {1};
std::set<size_t> candidate_z = {1};
if (ndim > 1) {
candidate_y = GenerateLocalByGlobal(global_size_[1]);
}
if (ndim > 2) {
candidate_z = GenerateLocalByGlobal(global_size_[2]);
}
for (auto x : candidate_x) {
if (x < max_work_items[0]) {
for (auto y : candidate_y) {
if (y < max_work_items[1]) {
for (auto z : candidate_z) {
auto group_size = x * y * z;
if (z < max_work_items[2] && group_size < max_workgroup_size && group_size > MIN_WORKGROUP_SIZE) {
BaseTuningParameter tuning_param = BaseTuningParameter();
tuning_param.local_size = {x, y, z};
tuning_params.push_back(tuning_param);
}
}
}
}
}
}
return tuning_params;
}

virtual int AssignTuningParam(const BaseTuningParameter param) {
std::vector<size_t> local_size_tmp = param.local_size;
if (local_size_tmp.size() > global_size_.size()) {
local_size_tmp = std::vector<size_t>(local_size_tmp.begin(), local_size_tmp.begin() + global_size_.size());
}
AlignGlobalLocal(global_size_, local_size_tmp);
return RET_OK;
}

virtual int Tune() {
if (!ocl_runtime_->isProfiling()) {
MS_LOG(WARNING) << "Tuning mode require opencl runtime profiling.";
return RET_OK;
}
lite::opencl::TuningMode mode = ocl_runtime_->GetTuningMode();
if (mode == lite::opencl::TuningMode::DEFAULT) {
return RET_OK;
}
static const std::set<int> FAST_MODE_OPS = {schema::PrimitiveType_Conv2D, schema::PrimitiveType_DepthwiseConv2D,
schema::PrimitiveType_DeConv2D};
if (mode == lite::opencl::TuningMode::FAST && FAST_MODE_OPS.find(op_parameter_->type_) == FAST_MODE_OPS.end()) {
return RET_OK;
}
auto tuning_params = GenerateTuningParam();
if (tuning_params.empty()) {
MS_LOG(WARNING) << "Tuning param size is 0.";
return RET_OK;
}
int index = -1;
double min_time = MAX_PROFILING_TIME_MILLI_SECOND;
for (int i = 0; i < tuning_params.size(); i++) {
AssignTuningParam(tuning_params[i]);
auto ret = Run();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Tuning " << name() << " failed for tuning param " << tuning_params[i];
return ret;
}
double current_time = GetProfilingTimeMs();
MS_LOG(DEBUG) << "Tuning " << name() << " param (" << tuning_params[i] << ") exectime " << current_time << "ms";
if (current_time < min_time) {
min_time = current_time;
index = i;
}
}
if (index != -1) {
MS_LOG(INFO) << "Tuning " << name() << " result: param (" << tuning_params[index] << ") exectime " << min_time
<< "ms";
AssignTuningParam(tuning_params[index]);
} else {
MS_LOG(WARNING) << "Cannot find suitable param.";
}
return RET_OK;
}

double GetProfilingTimeMs() {
if (!ocl_runtime_->isProfiling()) {
return MAX_PROFILING_TIME_MILLI_SECOND;
}
cl_ulong time_start;
cl_ulong time_end;
event_.getProfilingInfo(CL_PROFILING_COMMAND_START, &time_start);
event_.getProfilingInfo(CL_PROFILING_COMMAND_END, &time_end);
cl_ulong time_ns = time_end - time_start;
return static_cast<double>(time_ns) * 1e-6;
}

protected: protected:
lite::opencl::OpenCLRuntime *ocl_runtime_; lite::opencl::OpenCLRuntime *ocl_runtime_;
lite::opencl::MemType out_mem_type_{lite::opencl::MemType::IMG}; lite::opencl::MemType out_mem_type_{lite::opencl::MemType::IMG};
cl::NDRange global_range_{cl::NullRange}; cl::NDRange global_range_{cl::NullRange};
cl::NDRange local_range_{cl::NullRange}; cl::NDRange local_range_{cl::NullRange};
std::vector<size_t> global_size_; // !!!To be deleted
std::vector<size_t> local_size_; // !!!To be deleted
std::vector<size_t> global_size_;
std::vector<size_t> local_size_;
cl::Kernel kernel_;
cl::Event event_;
static std::set<size_t> GenerateLocalByGlobal(size_t global_i) {
std::set<size_t> local_ = {};
int index = 1;
while (index < global_i) {
local_.insert(index);
index *= 2;
}
for (size_t i = 1; i < 16; i++) {
if (global_i % i == 0) {
local_.insert(i);
}
}
return local_;
}


private: private:
lite::opencl::OpenCLRuntimeWrapper ocl_runtime_wrap_; lite::opencl::OpenCLRuntimeWrapper ocl_runtime_wrap_;
@@ -233,8 +369,8 @@ kernel::LiteKernel *OpenCLKernelCreator(const std::vector<lite::Tensor *> &input
} }
auto ret = kernel->CheckSpecs(); auto ret = kernel->CheckSpecs();
if (ret != mindspore::lite::RET_OK) { if (ret != mindspore::lite::RET_OK) {
delete kernel;
MS_LOG(ERROR) << "Check " << opParameter->name_ << " specification failed!"; MS_LOG(ERROR) << "Check " << opParameter->name_ << " specification failed!";
delete kernel;
return nullptr; return nullptr;
} }
return kernel; return kernel;


+ 11
- 2
mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.cc View File

@@ -228,8 +228,17 @@ int OpenCLSubGraph::Init() {
MS_LOG(ERROR) << "OpenCL prepare fail"; MS_LOG(ERROR) << "OpenCL prepare fail";
return ret; return ret;
} }

MallocTensorWithReuse();
auto opencl_exec = reinterpret_cast<lite::opencl::OpenCLExecutor *>(executor_);
ocl_runtime_->SetProfiling(true);
ret = opencl_exec->RunOrTune(in_tensors_, out_tensors_, nodes_, allocator_, nullptr, nullptr, true);
if (ret != RET_OK) {
MS_LOG(ERROR) << "Run opencl executor failed: " << ret;
return ret;
}
ocl_runtime_->SetProfiling(false);
#ifdef Debug
ocl_runtime_->SetProfiling(true);
#endif
return RET_OK; return RET_OK;
} }




+ 23
- 5
mindspore/lite/src/runtime/opencl/opencl_executor.cc View File

@@ -24,6 +24,12 @@ namespace mindspore::lite::opencl {
int OpenCLExecutor::Run(std::vector<Tensor *> &inputs, std::vector<Tensor *> &outputs, int OpenCLExecutor::Run(std::vector<Tensor *> &inputs, std::vector<Tensor *> &outputs,
std::vector<kernel::LiteKernel *> &kernels, Allocator *allocator, const KernelCallBack &before, std::vector<kernel::LiteKernel *> &kernels, Allocator *allocator, const KernelCallBack &before,
const KernelCallBack &after) { const KernelCallBack &after) {
return RunOrTune(inputs, outputs, kernels, allocator, before, after, false);
}

int OpenCLExecutor::RunOrTune(std::vector<Tensor *> &inputs, std::vector<Tensor *> &outputs,
std::vector<kernel::LiteKernel *> &kernels, Allocator *allocator,
const KernelCallBack &before, const KernelCallBack &after, bool is_tune) {
int ret; int ret;
kernel::LiteKernelUtil::InitTensorRefCount(kernels); kernel::LiteKernelUtil::InitTensorRefCount(kernels);
for (auto *kernel : kernels) { for (auto *kernel : kernels) {
@@ -57,14 +63,26 @@ int OpenCLExecutor::Run(std::vector<Tensor *> &inputs, std::vector<Tensor *> &ou
return ret; return ret;
} }
} }
output->set_allocator(allocator_);
} }
if (is_tune) {
ret = op_kernel->Tune();
if (ret != RET_OK) {
MS_LOG(ERROR) << "tuning kernel failed, name: " << kernel->name();
return ret;
}
} else {
ret = kernel->Run();
if (ret != RET_OK) {
MS_LOG(ERROR) << "run kernel failed, name: " << kernel->name();
return ret;
}
#ifdef Debug
MS_LOG(INFO) << "OpenCl kernel " << kernel->name() << "(" << kernel->type_str()
<< ") execute time is: " << op_kernel->GetProfilingTimeMs() << "ms";


ret = kernel->Run();
if (ret != RET_OK) {
MS_LOG(ERROR) << "run kernel failed, name: " << kernel->name();
return ret;
#endif
} }

if (after != nullptr) { if (after != nullptr) {
if (!after(TensorVectorCast(kernel->in_tensors()), TensorVectorCast(kernel->out_tensors()), callbackParam)) { if (!after(TensorVectorCast(kernel->in_tensors()), TensorVectorCast(kernel->out_tensors()), callbackParam)) {
MS_LOG(ERROR) << "run kernel after_callback failed, name: " << kernel->name(); MS_LOG(ERROR) << "run kernel after_callback failed, name: " << kernel->name();


+ 3
- 0
mindspore/lite/src/runtime/opencl/opencl_executor.h View File

@@ -34,6 +34,9 @@ class OpenCLExecutor : public Executor {
int Run(std::vector<Tensor *> &inputs, std::vector<Tensor *> &outputs, std::vector<kernel::LiteKernel *> &kernels, int Run(std::vector<Tensor *> &inputs, std::vector<Tensor *> &outputs, std::vector<kernel::LiteKernel *> &kernels,
Allocator *allocator = nullptr, const KernelCallBack &before = nullptr, Allocator *allocator = nullptr, const KernelCallBack &before = nullptr,
const KernelCallBack &after = nullptr) override; const KernelCallBack &after = nullptr) override;
int RunOrTune(std::vector<Tensor *> &inputs, std::vector<Tensor *> &outputs,
std::vector<kernel::LiteKernel *> &kernels, Allocator *allocator = nullptr,
const KernelCallBack &before = nullptr, const KernelCallBack &after = nullptr, bool is_tune = false);


protected: protected:
InnerContext *context = nullptr; InnerContext *context = nullptr;


+ 20
- 19
mindspore/lite/src/runtime/opencl/opencl_runtime.cc View File

@@ -230,12 +230,7 @@ int OpenCLRuntime::Init() {
MS_LOG(INFO) << "Compute Unit: " << compute_units_; MS_LOG(INFO) << "Compute Unit: " << compute_units_;
MS_LOG(INFO) << "Clock Frequency: " << max_freq_ << " MHz"; MS_LOG(INFO) << "Clock Frequency: " << max_freq_ << " MHz";


#ifdef Debug
const cl_command_queue_properties properties = CL_QUEUE_PROFILING_ENABLE;
#else
const cl_command_queue_properties properties = 0; const cl_command_queue_properties properties = 0;
#endif

default_command_queue_ = new (std::nothrow) cl::CommandQueue(*context_, *device_, properties, &ret); default_command_queue_ = new (std::nothrow) cl::CommandQueue(*context_, *device_, properties, &ret);
if (ret != CL_SUCCESS) { if (ret != CL_SUCCESS) {
delete device_; delete device_;
@@ -244,6 +239,16 @@ int OpenCLRuntime::Init() {
return RET_ERROR; return RET_ERROR;
} }


const cl_command_queue_properties profiling_properties = CL_QUEUE_PROFILING_ENABLE;
profiling_command_queue_ = new (std::nothrow) cl::CommandQueue(*context_, *device_, profiling_properties, &ret);
if (ret != CL_SUCCESS) {
delete device_;
delete context_;
delete default_command_queue_;
MS_LOG(ERROR) << "Profiling command Queue create failed: " << CLErrorCode(ret);
return RET_ERROR;
}

allocator_ = new (std::nothrow) OpenCLAllocator(this); allocator_ = new (std::nothrow) OpenCLAllocator(this);
if (allocator_ == nullptr) { if (allocator_ == nullptr) {
delete device_; delete device_;
@@ -473,15 +478,17 @@ int OpenCLRuntime::RunKernel(const cl::Kernel &kernel, const std::vector<size_t>
} }
// Run Kernel with 1D, 2D, 3D group size, and local size can be empty. // Run Kernel with 1D, 2D, 3D group size, and local size can be empty.
int OpenCLRuntime::RunKernel(const cl::Kernel &kernel, const cl::NDRange &global, const cl::NDRange &local, int OpenCLRuntime::RunKernel(const cl::Kernel &kernel, const cl::NDRange &global, const cl::NDRange &local,
cl::CommandQueue *command_queue) {
cl::CommandQueue *command_queue, cl::Event *event) {
if (command_queue == nullptr) { if (command_queue == nullptr) {
command_queue = default_command_queue_;
if (profiling_) {
command_queue = profiling_command_queue_;
} else {
command_queue = default_command_queue_;
}
} }
MS_ASSERT(local.size() == 0 || local.size() == global.size()); MS_ASSERT(local.size() == 0 || local.size() == global.size());

cl::Event event;
cl_int ret = CL_SUCCESS; cl_int ret = CL_SUCCESS;
ret = command_queue->enqueueNDRangeKernel(kernel, cl::NullRange, global, local, nullptr, &event);
ret = command_queue->enqueueNDRangeKernel(kernel, cl::NullRange, global, local, nullptr, event);
if (ret != CL_SUCCESS) { if (ret != CL_SUCCESS) {
MS_LOG(ERROR) << "Kernel execute failed:" << CLErrorCode(ret); MS_LOG(ERROR) << "Kernel execute failed:" << CLErrorCode(ret);
return RET_ERROR; return RET_ERROR;
@@ -496,15 +503,9 @@ int OpenCLRuntime::RunKernel(const cl::Kernel &kernel, const cl::NDRange &global
} }
cnt++; cnt++;
MS_LOG(DEBUG) << "RunKernel success!"; MS_LOG(DEBUG) << "RunKernel success!";
#ifdef Debug
event.wait();
cl_ulong time_start;
cl_ulong time_end;
event.getProfilingInfo(CL_PROFILING_COMMAND_START, &time_start);
event.getProfilingInfo(CL_PROFILING_COMMAND_END, &time_end);
double nanoSeconds = time_end - time_start;
MS_LOG(INFO) << "OpenCl Execution time is: " << nanoSeconds / 1000000.0 << "ms";
#endif
if (profiling_) {
event->wait();
}
return RET_OK; return RET_OK;
} }
// get gpu divce type // get gpu divce type


+ 9
- 1
mindspore/lite/src/runtime/opencl/opencl_runtime.h View File

@@ -32,6 +32,7 @@ j* you may not use this file except in compliance with the License.
namespace mindspore::lite::opencl { namespace mindspore::lite::opencl {


enum GpuType { OTHER = 0, ADRENO = 1, MALI = 2, MALI_T = 3, MALI_G = 4 }; enum GpuType { OTHER = 0, ADRENO = 1, MALI = 2, MALI_T = 3, MALI_G = 4 };
enum TuningMode { DEFAULT = 0, FAST = 1, EXTREME = 2 };


struct GpuInfo { struct GpuInfo {
GpuType type = OTHER; GpuType type = OTHER;
@@ -117,7 +118,7 @@ class OpenCLRuntime {
int RunKernel(const cl::Kernel &kernel, const std::vector<size_t> &global, const std::vector<size_t> &local, int RunKernel(const cl::Kernel &kernel, const std::vector<size_t> &global, const std::vector<size_t> &local,
cl::CommandQueue *command_queue = nullptr); // !!!To be deleted cl::CommandQueue *command_queue = nullptr); // !!!To be deleted
int RunKernel(const cl::Kernel &kernel, const cl::NDRange &global, const cl::NDRange &local, int RunKernel(const cl::Kernel &kernel, const cl::NDRange &global, const cl::NDRange &local,
cl::CommandQueue *command_queue = nullptr);
cl::CommandQueue *command_queue = nullptr, cl::Event *event = nullptr);
bool CopyDeviceMemToHost(void *dst, const void *src, size_t size, cl::CommandQueue *command_queue = nullptr, bool CopyDeviceMemToHost(void *dst, const void *src, size_t size, cl::CommandQueue *command_queue = nullptr,
bool sync = false) const; bool sync = false) const;
bool CopyHostMemToDevice(const void *dst, const void *src, size_t size, cl::CommandQueue *command_queue = nullptr, bool CopyHostMemToDevice(const void *dst, const void *src, size_t size, cl::CommandQueue *command_queue = nullptr,
@@ -139,10 +140,14 @@ class OpenCLRuntime {
* @return max_work_group_size * @return max_work_group_size
*/ */
int GetKernelMaxWorkGroupSize(cl_kernel kernel, cl_device_id device_id); int GetKernelMaxWorkGroupSize(cl_kernel kernel, cl_device_id device_id);
void SetTuningMode(TuningMode mode) { tuning_mode_ = mode; }
TuningMode GetTuningMode() const { return tuning_mode_; }


void InitGpuCache(); void InitGpuCache();
int LoadCache(const void *buf); int LoadCache(const void *buf);
void StoreCache(); void StoreCache();
bool isProfiling() const { return profiling_; }
void SetProfiling(bool profiling) { profiling_ = profiling; }


private: private:
static OpenCLRuntime *GetInstance(); static OpenCLRuntime *GetInstance();
@@ -158,6 +163,7 @@ class OpenCLRuntime {
static size_t instance_count_; static size_t instance_count_;
static OpenCLRuntime *ocl_runtime_instance_; static OpenCLRuntime *ocl_runtime_instance_;
cl::CommandQueue *default_command_queue_{nullptr}; cl::CommandQueue *default_command_queue_{nullptr};
cl::CommandQueue *profiling_command_queue_{nullptr};
cl::Context *context_{nullptr}; cl::Context *context_{nullptr};
cl::Device *device_{nullptr}; cl::Device *device_{nullptr};
OpenCLAllocator *allocator_{nullptr}; OpenCLAllocator *allocator_{nullptr};
@@ -181,6 +187,8 @@ class OpenCLRuntime {
const std::string version_{"V0.1"}; const std::string version_{"V0.1"};
bool need_write_{false}; bool need_write_{false};
bool enable_cache_{false}; bool enable_cache_{false};
TuningMode tuning_mode_{TuningMode::DEFAULT};
bool profiling_{false};
}; };


class OpenCLRuntimeWrapper { class OpenCLRuntimeWrapper {


Loading…
Cancel
Save