Browse Source

optimize conv2d tuning

tags/v1.1.0
chenzupeng 5 years ago
parent
commit
ce4b3e743f
4 changed files with 63 additions and 2 deletions
  1. +39
    -0
      mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.cc
  2. +2
    -0
      mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.h
  3. +2
    -2
      mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc
  4. +20
    -0
      mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h

+ 39
- 0
mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.cc View File

@@ -363,6 +363,45 @@ void Conv2DOpenCLKernel::SetGlobalLocal() {
}
}

std::vector<BaseTuningParameter> Conv2DOpenCLKernel::GenerateTuningParam() {
// don't need to tune local_c
std::vector<BaseTuningParameter> tuning_params = {};
if (use_winograd_) {
return tuning_params;
}
BaseTuningParameter default_tuning_param = BaseTuningParameter();
default_tuning_param.local_size = local_size_;
tuning_params.push_back(default_tuning_param);

std::vector<size_t> max_work_items = ocl_runtime_->GetWorkItemSize();
size_t max_workgroup_size = ocl_runtime_->GetMaxWorkGroupSize(kernel_);
std::set<size_t> candidate_x = GenerateLocalByGlobal(global_size_[0]);
std::set<size_t> candidate_y = GenerateLocalByGlobal(global_size_[1]);
for (auto x : candidate_x) {
if (x <= max_work_items[0]) {
for (auto y : candidate_y) {
if (y <= max_work_items[1]) {
auto group_size = x * y * local_size_[2];
if (group_size <= max_workgroup_size) {
BaseTuningParameter tuning_param = BaseTuningParameter();
tuning_param.local_size = {x, y, local_size_[2]};
tuning_params.push_back(tuning_param);
}
}
}
}
}
return tuning_params;
}

// Cache key for tuned parameters: the base kernel key extended with the conv
// hyper-parameters (kernel size, stride, dilation) that influence which local
// size is optimal. Two convs with identical keys can share a tuning result.
std::string Conv2DOpenCLKernel::Key() {
  std::string key = OpenCLKernel::Key();
  key.append("_").append(std::to_string(KH_));
  key.append("_").append(std::to_string(KW_));
  key.append("_").append(std::to_string(param_->stride_h_));
  key.append("_").append(std::to_string(param_->stride_w_));
  key.append("_").append(std::to_string(param_->dilation_h_));
  key.append("_").append(std::to_string(param_->dilation_w_));
  return key;
}

void Conv2DOpenCLKernel::SetConstArgs() {
cl_int4 input_shape = {batch_size_, IH_, IW_, CI_SLICES_};
cl_int4 output_shape = {batch_size_, OH_, OW_, CO_SLICES_};


+ 2
- 0
mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.h View File

@@ -43,6 +43,8 @@ class Conv2DOpenCLKernel : public OpenCLKernel {

int Run() override;
int Tune() override;
std::vector<BaseTuningParameter> GenerateTuningParam() override;
std::string Key() override;

// for opencl fusion: Conv2D + PReLU(weight is scalar) -> param_.act_type=ActivationType_LEAKY_RELU
float alpha_{0.0f};


+ 2
- 2
mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc View File

@@ -31,8 +31,8 @@ using mindspore::schema::PrimitiveType_Squeeze;
namespace mindspore::kernel {

int ReshapeOpenCLKernel::CheckSpecs() {
if (in_tensors_.size() != 1 && out_tensors_.size() != 1) {
MS_LOG(ERROR) << "Reshape in size: " << in_tensors_.size() << ", out size: " << out_tensors_.size();
if ((in_tensors_.size() != 1 && in_tensors_.size() != 2) || out_tensors_.size() != 1) {
MS_LOG(ERROR) << "Reshape input output size unsupported.";
return RET_ERROR;
}
if (in_tensors_[0]->data_type() != kNumberTypeFloat32 && in_tensors_[0]->data_type() != kNumberTypeFloat16) {


+ 20
- 0
mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h View File

@@ -20,6 +20,8 @@

#include <vector>
#include <set>
#include <map>
#include <string>
#include "src/lite_kernel.h"
#include "include/errorcode.h"
#include "src/runtime/opencl/opencl_runtime.h"
@@ -287,6 +289,13 @@ class OpenCLKernel : public LiteKernel {
if (mode == lite::opencl::TuningMode::FAST && FAST_MODE_OPS.find(op_parameter_->type_) == FAST_MODE_OPS.end()) {
return RET_OK;
}
auto key = Key();
auto finded = tuned_param_cache_.find(key);
if (finded != tuned_param_cache_.end()) {
auto cache_param = finded->second;
MS_LOG(INFO) << "Tuning " << name() << ", found cached param(" << cache_param << ")";
return RET_OK;
}
auto tuning_params = GenerateTuningParam();
if (tuning_params.empty()) {
MS_LOG(WARNING) << "Tuning param size is 0.";
@@ -312,6 +321,7 @@ class OpenCLKernel : public LiteKernel {
MS_LOG(INFO) << "Tuning " << name() << " result: param (" << tuning_params[index] << ") exectime " << min_time
<< "ms";
AssignTuningParam(tuning_params[index]);
tuned_param_cache_[key] = tuning_params[index];
} else {
MS_LOG(WARNING) << "Cannot find suitable param.";
}
@@ -330,6 +340,15 @@ class OpenCLKernel : public LiteKernel {
return static_cast<double>(time_ns) * 1e-6;
}

// Identifier used to look up cached tuning results: the kernel's type string
// plus its global work size. Subclasses override this to append whatever
// extra state affects the tuned local size (see Conv2DOpenCLKernel::Key).
virtual std::string Key() {
  std::string key = type_str();
  key.append("_global");
  for (const auto dim : global_size_) {
    key += "_";
    key += std::to_string(dim);
  }
  return key;
}

protected:
lite::opencl::OpenCLRuntime *ocl_runtime_;
lite::opencl::MemType out_mem_type_{lite::opencl::MemType::IMG};
@@ -356,6 +375,7 @@ class OpenCLKernel : public LiteKernel {

private:
lite::opencl::OpenCLRuntimeWrapper ocl_runtime_wrap_;
static inline std::map<std::string, BaseTuningParameter> tuned_param_cache_;
};
template <class T>
kernel::LiteKernel *OpenCLKernelCreator(const std::vector<lite::Tensor *> &inputs,


Loading…
Cancel
Save