@@ -70,7 +70,7 @@ int ArithmeticOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) {
   img_size->clear();
   std::vector<size_t> vec{im_dst_x, im_dst_y, img_dtype};
   *img_size = vec;
-  return 0;
+  return RET_OK;
 }
 int ArithmeticOpenCLKernel::Init() {
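These hunks replace bare numeric returns with the named codes from include/errorcode.h. For orientation, a minimal sketch of the convention they adopt; the values below are illustrative assumptions, not copied from the actual header:

```cpp
// Sketch of the return-code convention assumed by these hunks.
#include <iostream>

namespace mindspore::lite {
constexpr int RET_OK = 0;      // assumed success value
constexpr int RET_ERROR = -1;  // assumed generic-failure value
}  // namespace mindspore::lite

int main() {
  // Callers compare against a named constant instead of a magic 0 or 1.
  int status = mindspore::lite::RET_OK;
  std::cout << (status == mindspore::lite::RET_OK ? "ok" : "error") << "\n";
  return 0;
}
```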
@@ -59,7 +59,7 @@ int Conv2dTransposeOpenCLKernel::Init() {
   return RET_OK;
 }
-int Conv2dTransposeOpenCLKernel::ReSize() { return 0; }
+int Conv2dTransposeOpenCLKernel::ReSize() { return RET_OK; }
 void Conv2dTransposeOpenCLKernel::PadWeight() {
   ConvParameter *param = reinterpret_cast<ConvParameter *>(op_parameter_);
@@ -67,10 +67,10 @@ int MatMulOpenCLKernel::Init() {
     in_tensors_[0]->SetFormat(schema::Format_NC4);
   }
   MS_LOG(DEBUG) << kernel_name << " Init Done!";
-  return 0;
+  return RET_OK;
 }
-int MatMulOpenCLKernel::ReSize() { return 0; }
+int MatMulOpenCLKernel::ReSize() { return RET_OK; }
 void MatMulOpenCLKernel::PadWeight() {
   auto allocator = lite::opencl::OpenCLRuntime::GetInstance()->GetAllocator();
@@ -147,7 +147,7 @@ int MatMulOpenCLKernel::Run() {
   ocl_runtime->SetKernelArg(kernel_, arg_count++, sizeCO);
   ocl_runtime->SetKernelArg(kernel_, arg_count++, hasBias_ ? 1 : 0);
   ocl_runtime->RunKernel(kernel_, global, local, nullptr);
-  return 0;
+  return RET_OK;
 }
 kernel::LiteKernel *OpenCLMatMulKernelCreator(const std::vector<lite::tensor::Tensor *> &inputs,
@@ -63,7 +63,7 @@ int ReshapeOpenCLKernel::Init() {
   return RET_OK;
 }
-int ReshapeOpenCLKernel::ReSize() { return 0; }
+int ReshapeOpenCLKernel::ReSize() { return RET_OK; }
 int ReshapeOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) {
   size_t im_dst_x, im_dst_y;
@@ -64,7 +64,7 @@ int TransposeOpenCLKernel::Init() {
   return RET_OK;
 }
-int TransposeOpenCLKernel::ReSize() { return 0; }
+int TransposeOpenCLKernel::ReSize() { return RET_OK; }
 int TransposeOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) {
   size_t im_dst_x, im_dst_y;
@@ -100,7 +100,7 @@ int TransposeOpenCLKernel::Run() {
   ocl_runtime->SetKernelArg(kernel_, 2, HW);
   ocl_runtime->SetKernelArg(kernel_, 3, C);
   ocl_runtime->RunKernel(kernel_, global, local, nullptr);
-  return 0;
+  return RET_OK;
 }
 kernel::LiteKernel *OpenCLTransposeKernelCreator(const std::vector<lite::tensor::Tensor *> &inputs,
@@ -19,6 +19,7 @@
 #include <vector>
 #include "src/lite_kernel.h"
+#include "include/errorcode.h"
 namespace mindspore::kernel {
@@ -37,15 +38,15 @@ class OpenCLKernel : public LiteKernel {
               const std::vector<lite::tensor::Tensor *> &outputs)
       : LiteKernel(parameter, inputs, outputs, nullptr, nullptr) {}
-  virtual int Init() { return -1; }
-  virtual int Prepare() { return -1; }
-  virtual int InferShape() { return -1; }
-  virtual int ReSize() { return -1; }
-  virtual int Run() { return -1; }
-  virtual int GetImageSize(size_t idx, std::vector<size_t> *img_size) { return -1; }
-  virtual int GetGlobalSize(size_t idx, std::vector<size_t> *global_size) { return -1; }
+  virtual int Init() { return RET_ERROR; }
+  virtual int Prepare() { return RET_ERROR; }
+  virtual int InferShape() { return RET_ERROR; }
+  virtual int ReSize() { return RET_ERROR; }
+  virtual int Run() { return RET_ERROR; }
+  virtual int GetImageSize(size_t idx, std::vector<size_t> *img_size) { return RET_ERROR; }
+  virtual int GetGlobalSize(size_t idx, std::vector<size_t> *global_size) { return RET_ERROR; }
   virtual int GetLocalSize(size_t idx, const std::vector<size_t> &global_size, std::vector<size_t> *local_size) {
-    return -1;
+    return RET_ERROR;
   }
   OpenCLMemType GetMemType() { return out_mem_type_; }
   void SetMemType(OpenCLMemType mem_type) { out_mem_type_ = mem_type; }
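The base class now reports unimplemented hooks with RET_ERROR instead of a bare -1. A self-contained sketch of the override pattern; Base and MyKernel are stand-ins for illustration, not classes from the tree:

```cpp
// Minimal stand-in for the OpenCLKernel hook pattern above.
constexpr int RET_OK = 0;
constexpr int RET_ERROR = -1;  // assumed errorcode.h values

struct Base {
  virtual ~Base() = default;
  // Defaults mean "not implemented", now spelled with a named code.
  virtual int Init() { return RET_ERROR; }
  virtual int Run() { return RET_ERROR; }
};

struct MyKernel : Base {
  int Init() override { return RET_OK; }  // set up state, report success
  int Run() override { return RET_OK; }   // enqueue work, report success
};

int main() {
  MyKernel k;
  return (k.Init() == RET_OK && k.Run() == RET_OK) ? 0 : 1;
}
```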
@@ -91,15 +91,15 @@ void *OpenCLAllocator::Malloc(size_t size, const std::vector<size_t> &img_size) {
     cl::Buffer *buffer = new (std::nothrow)
       cl::Buffer(*ocl_runtime->Context(), CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, size, NULL, &ret);
     if (buffer == nullptr || ret != CL_SUCCESS) {
-      MS_LOG(ERROR) << "Create OpenCL buffer failed! (ERROR CODE: " << ret << ")";
       UnLock();
+      MS_LOG(ERROR) << "Create OpenCL buffer failed! (ERROR CODE: " << ret << ")";
       return nullptr;
     }
     device_ptr = static_cast<void *>(buffer);
     host_ptr = ocl_runtime->MapBuffer(*buffer, CL_MAP_READ | CL_MAP_WRITE, size);
     if (host_ptr == nullptr) {
-      MS_LOG(ERROR) << "Map buffer failed, can not found buffer :" << device_ptr << ", host_ptr=" << host_ptr;
       UnLock();
+      MS_LOG(ERROR) << "Map buffer failed, cannot find buffer: " << device_ptr << ", host_ptr=" << host_ptr;
       return nullptr;
     }
     cl::Memory *mem = buffer;
@@ -199,12 +199,15 @@ void OpenCLAllocator::Free(void *buf) {
   Lock();
   auto iter = allocated_list_.find(buf);
   if (iter != allocated_list_.end()) {
+    if (iter->second->map_flags) {
+      UnmapBuffer(buf);
+      iter->second->map_flags = false;
+    }
     auto mem_buf = iter->second;
     allocated_list_.erase(iter);
     free_list_.insert(std::make_pair(mem_buf->size_, mem_buf));
     UnLock();
     buf = nullptr;
-    MS_LOG(DEBUG) << "Free a new Image2D. size: " << mem_buf->size_ << ", host addr: " << mem_buf->host_ptr_
+    MS_LOG(DEBUG) << "Free device buffer. size: " << mem_buf->size_ << ", host addr: " << mem_buf->host_ptr_
                   << ", device addr: " << mem_buf->device_ptr_ << ", image addr: " << mem_buf->image_ptr_
                   << ", free list size: " << free_list_.size();
     return;
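Free() now unmaps a still-mapped buffer before recycling it, so blocks re-enter the size-keyed free list in an unmapped state. A simplified, self-contained sketch of that flow; the containers are stand-ins for allocated_list_ and free_list_:

```cpp
#include <cstdio>
#include <map>
#include <unordered_map>

struct MemBuf {
  size_t size = 0;
  bool map_flags = false;  // mirrors the new MemBuf::map_flags member
};

std::unordered_map<void *, MemBuf *> allocated_list;
std::multimap<size_t, MemBuf *> free_list;  // keyed by size for reuse

void Free(void *buf) {
  auto iter = allocated_list.find(buf);
  if (iter == allocated_list.end()) return;
  MemBuf *mem_buf = iter->second;
  if (mem_buf->map_flags) {
    // The real code calls UnmapBuffer(buf) here before recycling.
    mem_buf->map_flags = false;
  }
  allocated_list.erase(iter);
  free_list.insert({mem_buf->size, mem_buf});
  std::printf("free list size: %zu\n", free_list.size());
}

int main() {
  MemBuf b{1024, true};
  allocated_list[&b] = &b;
  Free(&b);  // unmapped first, then recycled
  return 0;
}
```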
@@ -291,10 +294,16 @@ void *OpenCLAllocator::MapBuffer(void *host_ptr, int flags, void *command_queue, bool sync) {
   Lock();
   auto it = allocated_list_.find(host_ptr);
   if (it == allocated_list_.end()) {
-    MS_LOG(ERROR) << "Map buffer failed, can not found buffer :" << host_ptr;
     UnLock();
+    MS_LOG(ERROR) << "Map buffer failed, cannot find buffer: " << host_ptr;
     return nullptr;
   }
+  if (it->second->map_flags) {
+    UnLock();
+    MS_LOG(WARNING) << "Host ptr " << host_ptr << " has already been mapped";
+    return host_ptr;
+  }
   MemBuf *mem_buf = it->second;
   void *new_host_ptr{nullptr};
   if (mem_buf->img_size.empty()) {
@@ -307,11 +316,13 @@ void *OpenCLAllocator::MapBuffer(void *host_ptr, int flags, void *command_queue, bool sync) {
     new_host_ptr = ocl_runtime->MapBuffer(*image, 0, CL_MAP_READ | CL_MAP_WRITE, region);
   }
   if (new_host_ptr == nullptr) {
+    UnLock();
     MS_LOG(WARNING) << "Map buffer failed, can not found buffer or already mapped, dev_ptr=" << mem_buf->device_ptr_
                     << ", host_ptr=" << host_ptr;
-    UnLock();
     return nullptr;
   }
+  mem_buf->map_flags = true;
   mem_buf->host_ptr_ = new_host_ptr;
   allocated_list_.erase(it);
   allocated_list_[new_host_ptr] = mem_buf;
@@ -327,16 +338,22 @@ int OpenCLAllocator::UnmapBuffer(void *host_ptr, void *command_queue) {
     if (!(svm_capabilities & CL_DEVICE_SVM_FINE_GRAIN_BUFFER)) {
       return ocl_runtime->UnmapBuffer(host_ptr);
     }
-    return 0;
+    return RET_OK;
   }
   auto it = allocated_list_.find(host_ptr);
   if (it == allocated_list_.end()) {
     MS_LOG(ERROR) << "Map buffer failed, can not found buffer :" << host_ptr;
-    return 1;
+    return RET_ERROR;
   }
+  if (it->second->map_flags) {
+    it->second->map_flags = false;
+    cl::Memory *mem =
+      static_cast<cl::Memory *>(it->second->img_size.empty() ? it->second->device_ptr_ : it->second->image_ptr_);
+    return ocl_runtime->UnmapBuffer(*mem, it->second->host_ptr_, static_cast<cl::CommandQueue *>(command_queue));
+  } else {
+    MS_LOG(WARNING) << "Host ptr " << host_ptr << " is not mapped";
+    return RET_OK;
+  }
-  cl::Memory *mem =
-    static_cast<cl::Memory *>(it->second->img_size.empty() ? it->second->device_ptr_ : it->second->image_ptr_);
-  return ocl_runtime->UnmapBuffer(*mem, it->second->host_ptr_, static_cast<cl::CommandQueue *>(command_queue));
 }
 MEM_TYPE OpenCLAllocator::GetMemType(void *host_ptr) {
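Together with the new map_flags member, MapBuffer/UnmapBuffer become idempotent: a second map returns the existing host pointer with a warning, and an unmap of an unmapped buffer is a warning no-op. A self-contained sketch of that bookkeeping, with simplified stand-ins and no real CL calls:

```cpp
#include <cstdio>

constexpr int RET_OK = 0;  // assumed errorcode.h value

struct MemBuf {
  void *host_ptr = nullptr;
  bool map_flags = false;
};

void *Map(MemBuf *buf) {
  if (buf->map_flags) {
    std::printf("already mapped, returning existing host ptr\n");
    return buf->host_ptr;  // no second driver-level map
  }
  buf->map_flags = true;
  buf->host_ptr = buf;  // stand-in for the driver's mapped pointer
  return buf->host_ptr;
}

int Unmap(MemBuf *buf) {
  if (!buf->map_flags) {
    std::printf("not mapped, nothing to do\n");
    return RET_OK;
  }
  buf->map_flags = false;  // real code forwards to ocl_runtime->UnmapBuffer
  return RET_OK;
}

int main() {
  MemBuf b;
  Map(&b);
  Map(&b);    // second map: warning + same pointer
  Unmap(&b);
  Unmap(&b);  // second unmap: warning + RET_OK
  return 0;
}
```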
@@ -344,8 +361,8 @@ MEM_TYPE OpenCLAllocator::GetMemType(void *host_ptr) {
   Lock();
   auto it = allocated_list_.find(host_ptr);
   if (it == allocated_list_.end()) {
-    MS_LOG(ERROR) << "Can not found buffer :" << host_ptr;
     UnLock();
+    MS_LOG(ERROR) << "Cannot find buffer: " << host_ptr;
     return mem_type;
   }
   MemBuf *mem_buf = it->second;
@@ -362,8 +379,8 @@ int OpenCLAllocator::GetImageSize(void *host_ptr, std::vector<size_t> *img_size) {
   Lock();
   auto it = allocated_list_.find(host_ptr);
   if (it == allocated_list_.end()) {
-    MS_LOG(ERROR) << "Can not found buffer :" << host_ptr;
     UnLock();
+    MS_LOG(ERROR) << "Cannot find buffer: " << host_ptr;
     return RET_OK;
   }
   MemBuf *mem_buf = it->second;
@@ -76,6 +76,7 @@ class OpenCLAllocator : public Allocator {
     void *host_ptr_;
     void *image_ptr_;
     std::vector<size_t> img_size;
+    bool map_flags{false};
   };
   std::mutex lock;
@@ -21,6 +21,9 @@
+#include "include/errorcode.h"
 namespace mindspore::lite::opencl {
+int OpenCLExecutor::Prepare(const std::vector<kernel::LiteKernel *> &kernels) { return RET_OK; }
 int OpenCLExecutor::Run(std::vector<tensor::Tensor *> &inputs, std::vector<tensor::Tensor *> &outputs,
                         std::vector<kernel::LiteKernel *> &kernels, Allocator *allocator,
                         const session::KernelCallBack &before, const session::KernelCallBack &after) {
@@ -71,136 +74,4 @@ int OpenCLExecutor::Run(std::vector<tensor::Tensor *> &inputs, std::vector<tensor::Tensor *> &outputs,
   }
   return RET_OK;
 }
-int OpenCLExecutor::TransformTensorLayout(tensor::Tensor *tensor, schema::Format src_format, schema::Format dst_format,
-                                          bool trans_dir) {
-  MS_ASSERT(nullptr != tensor);
-  MS_ASSERT(4 == tensor->shape().size());
-  auto data_type = tensor->data_type();
-  switch (data_type) {
-    case kNumberTypeInt8:
-      return TransformTensorLayoutUint8(tensor, src_format, dst_format, trans_dir);
-    case kNumberTypeFloat32:
-      return TransformTensorLayoutFp32(tensor, src_format, dst_format, trans_dir);
-    default:
-      MS_LOG(ERROR) << "Unsupported layout transform: " << schema::EnumNameFormat(tensor->GetFormat()) << " to "
-                    << schema::EnumNameFormat(dst_format);
-      return RET_ERROR;
-  }
-}
-int OpenCLExecutor::TransformTensorLayoutFp32(tensor::Tensor *tensor, schema::Format src_format,
-                                              schema::Format dst_format, bool trans_dir) {
-  MS_ASSERT(nullptr != tensor);
-  MS_ASSERT(nullptr != allocator_);
-  MS_ASSERT(4 == tensor->shape().size());
-  if (trans_dir) {
-    if (is_image2d_out_) {
-      return TransformTensorLayoutToImage(tensor, src_format, dst_format);
-    } else {
-      return TransformTensorLayoutToBuffer(tensor, src_format, dst_format);
-    }
-  } else {
-    if (is_image2d_out_) {
-      return TransformTensorLayoutFromImage(tensor, src_format, dst_format);
-    } else {
-      return TransformTensorLayoutToBuffer(tensor, src_format, dst_format);
-    }
-  }
-}
-int OpenCLExecutor::TransformTensorLayoutToBuffer(tensor::Tensor *tensor, schema::Format src_format,
-                                                  schema::Format dst_format) {
-  if (dst_format == schema::Format_NHWC4) {
-    auto *src_data = tensor->Data();
-    size_t C4 = UP_DIV(tensor->Channel(), C4NUM);
-    std::vector<size_t> img_size{tensor->Width() * C4, (size_t)tensor->Height(), CL_FLOAT};
-    if (src_format == schema::Format_NHWC) {
-      auto *dst_data = allocator_->Malloc(tensor->Size(), img_size);
-      if (dst_data == nullptr) {
-        MS_LOG(ERROR) << "Malloc data failed";
-        return RET_ERROR;
-      }
-      dst_data = reinterpret_cast<FLOAT_t *>(allocator_->MapBuffer(dst_data, CL_MAP_WRITE, nullptr, true));
-      PackNHWCToNHWC4Fp32(src_data, dst_data, tensor->Batch(), tensor->Height() * tensor->Width(), tensor->Channel());
-      tensor->SetData(dst_data);
-      allocator_->Free(src_data);
-      allocator_->UnmapBuffer(dst_data);
-    }
-    tensor->SetFormat(dst_format);
-    return RET_OK;
-  } else if (dst_format == schema::Format_NHWC) {
-    return RET_OK;
-  } else {
-    MS_LOG(ERROR) << "Unsupported layout transform: " << schema::EnumNameFormat(tensor->GetFormat()) << " to "
-                  << schema::EnumNameFormat(dst_format) << " in float32";
-    return RET_ERROR;
-  }
-}
-int OpenCLExecutor::TransformTensorLayoutToImage(tensor::Tensor *tensor, schema::Format src_format,
-                                                 schema::Format dst_format) {
-  if (dst_format == schema::Format_NHWC4) {
-    tensor->SetFormat(schema::Format_NHWC4);
-    // convert to nhwc4
-    auto *src_data = tensor->Data();
-    auto *dst_data{src_data};
-    if (src_format == schema::Format_NHWC) {
-      dst_data = allocator_->Malloc(tensor->Size());
-      if (dst_data == nullptr) {
-        MS_LOG(ERROR) << "Malloc data failed";
-        return RET_ERROR;
-      }
-      dst_data = reinterpret_cast<FLOAT_t *>(allocator_->MapBuffer(dst_data, CL_MAP_WRITE, nullptr, true));
-      PackNHWCToNHWC4Fp32(src_data, dst_data, tensor->Batch(), tensor->Height() * tensor->Width(), tensor->Channel());
-      tensor->SetData(dst_data);
-      allocator_->Free(src_data);
-      allocator_->UnmapBuffer(dst_data);
-    }
-    // copy to image2d
-    src_data = dst_data;
-    size_t C4 = UP_DIV(tensor->Channel(), C4NUM);
-    std::vector<size_t> img_size{tensor->Width() * C4, (size_t)tensor->Height(), CL_FLOAT};
-    dst_data = allocator_->CreateImageFromHost(src_data, tensor->Size(), img_size);
-    tensor->SetData(dst_data);
-    allocator_->Free(src_data);
-    return RET_OK;
-  } else {
-    MS_LOG(ERROR) << "Unsupported layout transform: " << schema::EnumNameFormat(tensor->GetFormat()) << " to "
-                  << schema::EnumNameFormat(dst_format) << " in float32";
-    return RET_ERROR;
-  }
-}
-int OpenCLExecutor::TransformTensorLayoutFromImage(tensor::Tensor *tensor, schema::Format src_format,
-                                                   schema::Format dst_format) {
-  if (dst_format == schema::Format_NHWC) {
-    auto src_data = tensor->Data();
-    auto dst_data = allocator_->Malloc(tensor->Size());
-    cl::Image2D *out_mem = reinterpret_cast<cl::Image2D *>(allocator_->GetImage(src_data));
-    std::vector<size_t> img_size;
-    allocator_->GetImageSize(src_data, &img_size);
-    auto origin = cl::array<cl::size_type, 3U>{0, 0, 0};
-    auto region = cl::array<cl::size_type, 3U>{img_size[0], img_size[1], 1};
-    auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
-    ocl_runtime->GetDefaultCommandQueue()->enqueueReadImage(*out_mem, CL_TRUE, origin, region, 0, 0, dst_data);
-    tensor->SetData(dst_data);
-    allocator_->Free(src_data);
-    return RET_OK;
-  } else {
-    MS_LOG(ERROR) << "Unsupported layout transform: " << schema::EnumNameFormat(tensor->GetFormat()) << " to "
-                  << schema::EnumNameFormat(dst_format) << " in float32";
-    return RET_ERROR;
-  }
-}
-int OpenCLExecutor::TransformTensorLayoutUint8(tensor::Tensor *tensor, schema::Format src_format,
-                                               schema::Format dst_format, bool is_image) {
-  MS_ASSERT(nullptr != tensor);
-  MS_ASSERT(4 == tensor->shape().size());
-  // auto src_format = tensor->GetFormat();
-  MS_LOG(ERROR) << "Unsupported layout transform: " << schema::EnumNameFormat(tensor->GetFormat()) << " to "
-                << schema::EnumNameFormat(dst_format) << " in uint8";
-  return RET_ERROR;
-}
 }  // namespace mindspore::lite::opencl
@@ -27,38 +27,17 @@
 namespace mindspore::lite::opencl {
 class OpenCLExecutor : Executor {
  public:
-  OpenCLExecutor() : Executor() {
-    allocator_ = OpenCLRuntime::GetInstance()->GetAllocator();
-  }
+  OpenCLExecutor() : Executor() { allocator_ = OpenCLRuntime::GetInstance()->GetAllocator(); }
-  int Prepare(const std::vector<kernel::LiteKernel *> &kernels) { return 0; }
+  int Prepare(const std::vector<kernel::LiteKernel *> &kernels);
   int Run(std::vector<tensor::Tensor *> &inputs, std::vector<tensor::Tensor *> &outputs,
           std::vector<kernel::LiteKernel *> &kernels, Allocator *allocator = nullptr,
           const session::KernelCallBack &before = nullptr, const session::KernelCallBack &after = nullptr);
- protected:
-  int TransformTensorLayoutFp32(tensor::Tensor *tensor, schema::Format src_format, schema::Format dst_format,
-                                bool trans_dir = false);
-  int TransformTensorLayoutUint8(tensor::Tensor *tensor, schema::Format src_format, schema::Format dst_format,
-                                 bool trans_dir = false);
-  int TransformTensorLayout(tensor::Tensor *tensor, schema::Format src_format, schema::Format dst_format,
-                            bool trans_dir = false);
-  int TransformTensorLayoutToBuffer(tensor::Tensor *tensor, schema::Format src_format, schema::Format dst_format);
-  int TransformTensorLayoutToImage(tensor::Tensor *tensor, schema::Format src_format, schema::Format dst_format);
-  int TransformTensorLayoutFromImage(tensor::Tensor *tensor, schema::Format src_format, schema::Format dst_format);
  protected:
   Context *context = nullptr;
   OpenCLAllocator *allocator_;
   bool is_image2d_out_{true};
 };
 }  // namespace mindspore::lite::opencl
 #endif
@@ -20,6 +20,7 @@
 #ifdef SHARING_MEM_WITH_OPENGL
 #include <EGL/egl.h>
 #endif
+#include "include/errorcode.h"
 #include "src/runtime/kernel/opencl/utils.h"
 #include "src/runtime/opencl/opencl_allocator.h"
 #ifdef PROGRAM_WITH_IL
@@ -80,7 +81,7 @@ int OpenCLRuntime::Init() {
   std::unique_lock<std::mutex> lck(g_init_mtx);
   if (init_done_) {
-    return 0;
+    return RET_OK;
   }
   MS_LOG(INFO) << "OpenCL version: CL_TARGET_OPENCL_VERSION " << CL_TARGET_OPENCL_VERSION;
   MS_LOG(INFO) << "CL_HPP_TARGET_OPENCL_VERSION " << CL_HPP_TARGET_OPENCL_VERSION;
@@ -89,7 +90,7 @@ int OpenCLRuntime::Init() {
 #ifdef USE_OPENCL_WRAPPER
   if (false == OpenCLWrapper::GetInstance()->LoadOpenCLLibrary()) {
     MS_LOG(ERROR) << "Load OpenCL symbols failed!";
-    return 1;
+    return RET_ERROR;
   }
 #endif  // USE_OPENCL_WRAPPER
@@ -97,7 +98,7 @@ int OpenCLRuntime::Init() {
   cl::Platform::get(&platforms);
   if (platforms.size() == 0) {
     MS_LOG(ERROR) << "OpenCL Platform not found!";
-    return 1;
+    return RET_ERROR;
   }
   // search GPU
@@ -119,7 +120,7 @@ int OpenCLRuntime::Init() {
   // not found, return error code.
   if (devices.size() == 0) {
     MS_LOG(ERROR) << "OpenCL Device not found!";
-    return 1;
+    return RET_ERROR;
   }
   device_ = std::make_shared<cl::Device>();
@@ -158,7 +159,7 @@ int OpenCLRuntime::Init() {
 #endif
   if (err != CL_SUCCESS) {
     MS_LOG(ERROR) << "Context create failed: " << CLErrorCode(err);
-    return 1;
+    return RET_ERROR;
   }
   // get cache size, compute units and frequency.
@@ -206,7 +207,7 @@ int OpenCLRuntime::Init() {
   default_command_queue_ = std::make_shared<cl::CommandQueue>(*context_, *device_, properties, &err);
   if (err != CL_SUCCESS) {
     MS_LOG(ERROR) << "Command Queue create failed: " << CLErrorCode(err);
-    return 1;
+    return RET_ERROR;
   }
   allocator_ = std::make_shared<OpenCLAllocator>();
@@ -217,7 +218,7 @@ int OpenCLRuntime::Init() {
   init_done_ = true;
   MS_LOG(INFO) << "OpenCLRuntime init done!";
-  return 0;
+  return RET_OK;
 }
 OpenCLRuntime::~OpenCLRuntime() {
@@ -314,12 +315,12 @@ int OpenCLRuntime::BuildKernel(cl::Kernel &kernel, const std::string &program_name,
     auto status = this->LoadProgram(program_name, &program);
     if (!status) {
       MS_LOG(ERROR) << "load program (" << program_name << ") failed!";
-      return 1;
+      return RET_ERROR;
     }
     status = this->BuildProgram(build_options_str, &program);
     if (!status) {
       MS_LOG(ERROR) << program_name << " build failed!";
-      return 1;
+      return RET_ERROR;
     }
     program_map_.emplace(build_program_key, program);
   }
@@ -328,9 +329,9 @@ int OpenCLRuntime::BuildKernel(cl::Kernel &kernel, const std::string &program_name,
   kernel = cl::Kernel(program, kernel_name.c_str(), &err);
   if (err != CL_SUCCESS) {
     MS_LOG(ERROR) << kernel_name << " Kernel create failed:" << CLErrorCode(err);
-    return 1;
+    return RET_ERROR;
   }
-  return 0;
+  return RET_OK;
 }
 // Run Kernel with 1D, 2D, 3D group size, and local size can be empty.
@@ -365,10 +366,10 @@ int OpenCLRuntime::RunKernel(const cl_kernel &kernel, const std::vector<size_t> &global,
   if (error != CL_SUCCESS) {
     MS_LOG(ERROR) << "Kernel execute failed:" << CLErrorCode(error);
-    return 1;
+    return RET_ERROR;
   }
   MS_LOG(DEBUG) << "RunKernel success!";
-  return 0;
+  return RET_OK;
 }
 // Run Kernel with 1D, 2D, 3D group size, and local size can be empty.
| // Run Kernel with 1D, 2D, 3D group size, and local size can be empty. | |||
| @@ -413,14 +414,14 @@ int OpenCLRuntime::RunKernel(const cl::Kernel &kernel, const std::vector<size_t> | |||
| } | |||
| } else { | |||
| MS_LOG(ERROR) << "Not supported NDRange!"; | |||
| return 1; | |||
| return RET_ERROR; | |||
| } | |||
| err = command_queue->enqueueNDRangeKernel(kernel, cl::NullRange, global_range, local_range, nullptr, &event); | |||
| if (err != CL_SUCCESS) { | |||
| MS_LOG(ERROR) << "Kernel execute failed:" << CLErrorCode(err); | |||
| return 1; | |||
| return RET_ERROR; | |||
| } | |||
| MS_LOG(DEBUG) << "RunKernel success!"; | |||
| #if MS_OPENCL_PROFILE | |||
@@ -432,7 +433,7 @@ int OpenCLRuntime::RunKernel(const cl::Kernel &kernel, const std::vector<size_t> &global,
   double nanoSeconds = time_end - time_start;
   MS_LOG(INFO) << "OpenCl Execution time is: " << nanoSeconds / 1000000.0 << "ms";
 #endif
-  return 0;
+  return RET_OK;
 }
 // get gpu divce type
@@ -534,7 +535,7 @@ void *OpenCLRuntime::MapBuffer(const cl::Buffer buffer, int flags, size_t size, cl::CommandQueue *command_queue) const {
 int OpenCLRuntime::MapBuffer(void *host_ptr, int flags, size_t size, cl::CommandQueue *command_queue, bool sync) const {
   if (svm_capabilities_ & CL_DEVICE_SVM_FINE_GRAIN_BUFFER) {
-    return 0;
+    return RET_OK;
   }
   if (command_queue == nullptr) {
     command_queue = default_command_queue_.get();
@@ -563,7 +564,7 @@ int OpenCLRuntime::UnmapBuffer(const cl::Memory buffer, void *host_ptr, cl::CommandQueue *command_queue) const {
 int OpenCLRuntime::UnmapBuffer(void *host_ptr, cl::CommandQueue *command_queue) const {
   if (svm_capabilities_ & CL_DEVICE_SVM_FINE_GRAIN_BUFFER) {
-    return 0;
+    return RET_OK;
   }
   if (command_queue == nullptr) {
     command_queue = default_command_queue_.get();
@@ -578,7 +579,7 @@ bool OpenCLRuntime::SyncCommandQueue(cl::CommandQueue *command_queue) {
   cl_int ret = command_queue->finish();
   if (ret != CL_SUCCESS) {
     MS_LOG(ERROR) << "Command queue sync failed: " << CLErrorCode(ret);
-    return 1;
+    return false;
   }
   return ret == CL_SUCCESS;
 }
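SyncCommandQueue returns bool, so its failure path returns false here rather than an int error code: a nonzero value such as RET_ERROR would convert to true and report success. A short illustration of the implicit conversion:

```cpp
#include <iostream>

constexpr int RET_ERROR = -1;  // assumed errorcode.h value

int main() {
  // Any nonzero int converts to true, which would mask the failure.
  std::cout << std::boolalpha
            << static_cast<bool>(RET_ERROR) << "\n"  // true
            << static_cast<bool>(0) << "\n";         // false
  return 0;
}
```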
@@ -41,11 +41,14 @@ static const std::vector<std::string> g_opencl_library_paths = {
   "/system/lib64/libOpenCL.so",
 #else
   // Qualcomm Adreno
-  "/system/vendor/lib/libOpenCL.so", "/system/lib/libOpenCL.so",
+  "/system/vendor/lib/libOpenCL.so",
+  "/system/lib/libOpenCL.so",
   // Mali
-  "/system/vendor/lib/egl/libGLES_mali.so", "/system/lib/egl/libGLES_mali.so",
+  "/system/vendor/lib/egl/libGLES_mali.so",
+  "/system/lib/egl/libGLES_mali.so",
   // other
-  "/system/vendor/lib/libPVROCL.so", "/data/data/org.pocl.libs/files/lib/libpocl.so"
+  "/system/vendor/lib/libPVROCL.so",
+  "/data/data/org.pocl.libs/files/lib/libpocl.so",
 #endif
   "libOpenCL.so",
   "libGLES_mali.so",
@@ -680,4 +683,3 @@ cl_int clSetKernelArgSVMPointer(cl_kernel kernel, cl_uint index, const void *host_ptr) {
 #endif
 #endif  // USE_OPENCL_WRAPPER
@@ -237,4 +237,3 @@ class OpenCLWrapper {
 }  // namespace mindspore::lite::opencl
 #endif  // USE_OPENCL_WRAPPER
 #endif  // MINDSPORE_LITE_SRC_OPENCL_WRAPPER_H_
@@ -179,13 +179,13 @@ void TestCase(const std::vector<int> &shape_a, const std::vector<int> &shape_b) {
   memcpy(data_c_ocl, outputs[0]->Data(), sizeof(float) * element_num);
   // ocl_runtime->SyncCommandQueue();
   LogData(data_a, 10, "Data A : ");
   LogData(data_b, tensor_b->shape().empty() ? 1 : 10, "Data B : ");
   LogData(data_c_cpu, 10, "Expect compute : ");
   LogData(outputs[0]->Data(), 10, "OpenCL compute : ");
   bool cmp = DataCompare(data_c_cpu, data_c_ocl, element_num);
   MS_LOG(INFO) << "Compare " << (cmp ? "success!" : "failed!");
   EXPECT_EQ(true, cmp);
   // free
   delete[] data_a;