From: @ddwsky Reviewed-by: @HilbertDavid,@zhanghaibo5 Signed-off-by: @HilbertDavid tags/v1.1.0
| @@ -0,0 +1,38 @@ | |||||
| /** | |||||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| */ | |||||
| namespace mindspore.schema; | |||||
// Tuning parameters stored next to a cached kernel binary (see KernelBin.tune).
// All four fields are vectors of 32-bit ints. The cache writer in this change
// (OpenCLRuntime::StoreCache) currently serializes every one of them as an
// empty vector, so no consumer of these values exists yet.
// NOTE(review): the intended meaning of local/block/shape/opPara is inferred
// from the field names only (work-group sizes, tensor shape, op parameters?) —
// confirm before relying on it.
| table TuneParam { | |||||
| local: [int]; | |||||
| block: [int]; | |||||
| shape: [int]; | |||||
| opPara: [int]; | |||||
| } | |||||
// One cached program binary.
//   name: key identifying the program; OpenCLRuntime::StoreCache writes the
//         binary_map_ key here and OpenCLRuntime::LoadCache uses it as the key
//         when re-registering the program.
//   tune: tuning parameters for this binary (see TuneParam).
//   data: raw program binary bytes; LoadCache skips entries whose data is empty.
| table KernelBin { | |||||
| name: string; | |||||
| tune: TuneParam; | |||||
| data: [ubyte]; | |||||
| } | |||||
// Root table of the GPU program cache file (declared as root_type below).
//   name:    label of the cache; OpenCLRuntime::StoreCache writes "OpenCLCache".
//   version: cache format version string; StoreCache writes the runtime's
//            version_ member.
//   allBins: every cached program binary, one KernelBin per program.
| table GpuCache { | |||||
| name: string; | |||||
| version: string; | |||||
| allBins: [KernelBin]; | |||||
| } | |||||
| root_type GpuCache; | |||||
| @@ -17,6 +17,7 @@ if (PLATFORM_ARM32 OR PLATFORM_ARM64) | |||||
| endif () | endif () | ||||
| set(LITE_SRC | set(LITE_SRC | ||||
| ${CMAKE_CURRENT_SOURCE_DIR}/common/file_utils.cc | |||||
| ${CMAKE_CURRENT_SOURCE_DIR}/common/utils.cc | ${CMAKE_CURRENT_SOURCE_DIR}/common/utils.cc | ||||
| ${CMAKE_CURRENT_SOURCE_DIR}/common/graph_util.cc | ${CMAKE_CURRENT_SOURCE_DIR}/common/graph_util.cc | ||||
| ${CMAKE_CURRENT_SOURCE_DIR}/common/log_adapter.cc | ${CMAKE_CURRENT_SOURCE_DIR}/common/log_adapter.cc | ||||
| @@ -67,7 +67,7 @@ void *OpenCLAllocator::MinimumFit(size_t size, const std::vector<size_t> &img_si | |||||
| void *OpenCLAllocator::CreateBuffer(size_t size, void *data, size_t flags, cl::Buffer **buffer) { | void *OpenCLAllocator::CreateBuffer(size_t size, void *data, size_t flags, cl::Buffer **buffer) { | ||||
| cl_int ret = CL_SUCCESS; | cl_int ret = CL_SUCCESS; | ||||
| MS_ASSERT(buffer); | MS_ASSERT(buffer); | ||||
| *buffer = new (std::nothrow) cl::Buffer(*ocl_runtime_->Context(), flags, size, data, &ret); | |||||
| *buffer = new (std::nothrow) cl::Buffer(*ocl_runtime_->Context(), static_cast<cl_mem_flags>(flags), size, data, &ret); | |||||
| if (*buffer == nullptr) { | if (*buffer == nullptr) { | ||||
| MS_LOG(ERROR) << "Create OpenCL buffer failed! (ERROR CODE: " << ret << ")"; | MS_LOG(ERROR) << "Create OpenCL buffer failed! (ERROR CODE: " << ret << ")"; | ||||
| return nullptr; | return nullptr; | ||||
| @@ -90,6 +90,9 @@ void *OpenCLAllocator::CreateBuffer(size_t size, void *data, size_t flags, cl::B | |||||
| void *OpenCLAllocator::CreateImage2D(size_t size, const std::vector<size_t> &img_size, void *data, size_t flags, | void *OpenCLAllocator::CreateImage2D(size_t size, const std::vector<size_t> &img_size, void *data, size_t flags, | ||||
| bool is_map, cl::Buffer **buffer, cl::Image2D **image) { | bool is_map, cl::Buffer **buffer, cl::Image2D **image) { | ||||
| cl_int ret = CL_SUCCESS; | cl_int ret = CL_SUCCESS; | ||||
| MS_ASSERT(buffer); | |||||
| MS_ASSERT(image); | |||||
| MS_ASSERT(img_size.size() == 3); | |||||
| cl::ImageFormat image_format(CL_RGBA, img_size[2]); | cl::ImageFormat image_format(CL_RGBA, img_size[2]); | ||||
| if (data == nullptr) { | if (data == nullptr) { | ||||
| *image = new (std::nothrow) | *image = new (std::nothrow) | ||||
| @@ -332,7 +335,7 @@ void *OpenCLAllocator::MapBuffer(void *host_ptr, int flags, void *command_queue, | |||||
| } | } | ||||
| MemBuf *mem_buf = it->second; | MemBuf *mem_buf = it->second; | ||||
| MS_ASSERT(mem_buf); | MS_ASSERT(mem_buf); | ||||
| void *new_host_ptr; | |||||
| void *new_host_ptr{nullptr}; | |||||
| if (mem_buf->img_size.empty()) { | if (mem_buf->img_size.empty()) { | ||||
| cl::Buffer *buffer = static_cast<cl::Buffer *>(mem_buf->device_ptr_); | cl::Buffer *buffer = static_cast<cl::Buffer *>(mem_buf->device_ptr_); | ||||
| MS_ASSERT(buffer); | MS_ASSERT(buffer); | ||||
| @@ -17,12 +17,14 @@ | |||||
| #include "src/runtime/opencl/opencl_runtime.h" | #include "src/runtime/opencl/opencl_runtime.h" | ||||
| #include <vector> | #include <vector> | ||||
| #include <numeric> | #include <numeric> | ||||
| #include <utility> | |||||
| #ifdef SHARING_MEM_WITH_OPENGL | #ifdef SHARING_MEM_WITH_OPENGL | ||||
| #include <EGL/egl.h> | #include <EGL/egl.h> | ||||
| #endif | #endif | ||||
| #include "include/errorcode.h" | #include "include/errorcode.h" | ||||
| #include "src/runtime/kernel/opencl/utils.h" | #include "src/runtime/kernel/opencl/utils.h" | ||||
| #include "src/runtime/opencl/opencl_allocator.h" | #include "src/runtime/opencl/opencl_allocator.h" | ||||
| #include "src/common/file_utils.h" | |||||
| #ifdef PROGRAM_WITH_IL | #ifdef PROGRAM_WITH_IL | ||||
| #include "src/backend/opencl/cl/program.inc" | #include "src/backend/opencl/cl/program.inc" | ||||
| #endif | #endif | ||||
| @@ -254,6 +256,9 @@ int OpenCLRuntime::Init() { | |||||
| std::string flag = ""; | std::string flag = ""; | ||||
| binary_program_ = CreateProgramFromIL(g_program_binary, flag); | binary_program_ = CreateProgramFromIL(g_program_binary, flag); | ||||
| #endif | #endif | ||||
| if (enable_cache_) { | |||||
| InitGpuCache(); | |||||
| } | |||||
| init_done_ = true; | init_done_ = true; | ||||
| MS_LOG(INFO) << "OpenCLRuntime init done!"; | MS_LOG(INFO) << "OpenCLRuntime init done!"; | ||||
| @@ -261,6 +266,10 @@ int OpenCLRuntime::Init() { | |||||
| } | } | ||||
| int OpenCLRuntime::Uninit() { | int OpenCLRuntime::Uninit() { | ||||
| if (enable_cache_) { | |||||
| StoreCache(); | |||||
| } | |||||
| binary_map_.clear(); | |||||
| program_map_.clear(); | program_map_.clear(); | ||||
| delete allocator_; | delete allocator_; | ||||
| delete default_command_queue_; | delete default_command_queue_; | ||||
| @@ -374,6 +383,12 @@ int OpenCLRuntime::BuildKernel(cl::Kernel &kernel, const std::string &program_na | |||||
| MS_LOG(ERROR) << program_name << " build failed!"; | MS_LOG(ERROR) << program_name << " build failed!"; | ||||
| return RET_ERROR; | return RET_ERROR; | ||||
| } | } | ||||
| if (enable_cache_) { | |||||
| need_write_ = true; | |||||
| auto bin = GetProgramBinaries(program); | |||||
| MS_ASSERT(bin.size() >= 1); | |||||
| binary_map_.emplace(build_program_key, bin[0]); | |||||
| } | |||||
| program_map_.emplace(build_program_key, program); | program_map_.emplace(build_program_key, program); | ||||
| } | } | ||||
| @@ -673,9 +688,8 @@ cl::Program OpenCLRuntime::CreateProgramFromIL(const std::vector<char> &binary, | |||||
| } | } | ||||
| // build program with binary | // build program with binary | ||||
| cl::Program OpenCLRuntime::CreateProgramFromBinary(const std::vector<std::vector<unsigned char>> &binary, | |||||
| const std::string &flag) { | |||||
| cl::Program program = cl::Program(*context_, {*device_}, binary); | |||||
| cl::Program OpenCLRuntime::CreateProgramFromBinary(const std::vector<unsigned char> &binary, const std::string &flag) { | |||||
| cl::Program program = cl::Program(*context_, {*device_}, {binary}); | |||||
| bool status = BuildProgram(default_build_opts_, program); | bool status = BuildProgram(default_build_opts_, program); | ||||
| if (!status) { | if (!status) { | ||||
| MS_LOG(ERROR) << "Build program with binary failed!"; | MS_LOG(ERROR) << "Build program with binary failed!"; | ||||
| @@ -691,4 +705,75 @@ std::vector<std::vector<unsigned char>> OpenCLRuntime::GetProgramBinaries(const | |||||
| } | } | ||||
| return binary; | return binary; | ||||
| } | } | ||||
| void OpenCLRuntime::InitGpuCache() { | |||||
| size_t len; | |||||
| char *buf = lite::ReadFile(cache_path_.c_str(), &len); | |||||
| if (LoadCache(buf) != RET_OK) { | |||||
| MS_LOG(ERROR) << "Load opencl cache fail"; | |||||
| } | |||||
| delete buf; | |||||
| MS_LOG(INFO) << "Init opencl cache success"; | |||||
| } | |||||
| int OpenCLRuntime::LoadCache(const void *buf) { | |||||
| if (buf == nullptr) { | |||||
| return RET_ERROR; | |||||
| } | |||||
| auto gpu_cache = schema::GetGpuCache(buf); | |||||
| if (gpu_cache == nullptr) { | |||||
| return RET_ERROR; | |||||
| } | |||||
| auto *bins = gpu_cache->allBins(); | |||||
| if (bins == nullptr) { | |||||
| return RET_ERROR; | |||||
| } | |||||
| auto n = bins->size(); | |||||
| for (auto i = 0; i < n; ++i) { | |||||
| auto *kernel_bin = bins->template GetAs<schema::KernelBin>(i); | |||||
| if (kernel_bin == nullptr) { | |||||
| MS_LOG(ERROR) << "kernel_bin[" << i << "] null"; | |||||
| return RET_ERROR; | |||||
| } | |||||
| auto *pdata = kernel_bin->data(); | |||||
| MS_ASSERT(pdata); | |||||
| if (pdata->size() == 0) { | |||||
| continue; | |||||
| } | |||||
| std::vector<unsigned char> bin(pdata->begin(), pdata->end()); | |||||
| auto program = CreateProgramFromBinary(bin, kernel_bin->name()->str()); | |||||
| program_map_.emplace(kernel_bin->name()->str(), program); | |||||
| binary_map_.emplace(kernel_bin->name()->str(), bin); | |||||
| MS_LOG(INFO) << "LoadCache " << kernel_bin->name()->str() << " success, size=" << pdata->size(); | |||||
| } | |||||
| return RET_OK; | |||||
| } | |||||
| void OpenCLRuntime::StoreCache() { | |||||
| if (need_write_) { | |||||
| auto fbb_ = new (std::nothrow) flatbuffers::FlatBufferBuilder; | |||||
| if (fbb_ == nullptr) { | |||||
| MS_LOG(ERROR) << "new opencl FlatBufferBuilder fail"; | |||||
| return; | |||||
| } | |||||
| std::vector<flatbuffers::Offset<schema::KernelBin>> vec_kernel_bin; | |||||
| for (auto iv : binary_map_) { | |||||
| auto name = fbb_->CreateString(iv.first); | |||||
| auto data = fbb_->CreateVector<uint8_t>(iv.second); | |||||
| std::vector<int32_t> shape; | |||||
| auto tune = schema::CreateTuneParam(*fbb_, fbb_->CreateVector<int32_t>(shape), fbb_->CreateVector<int32_t>(shape), | |||||
| fbb_->CreateVector<int32_t>(shape), fbb_->CreateVector<int32_t>(shape)); | |||||
| auto kbin = schema::CreateKernelBin(*fbb_, name, tune, data); | |||||
| vec_kernel_bin.emplace_back(kbin); | |||||
| MS_LOG(INFO) << "StoreCache " << iv.first << " success, size=" << iv.second.size(); | |||||
| } | |||||
| auto data = fbb_->CreateVector<flatbuffers::Offset<schema::KernelBin>>(vec_kernel_bin); | |||||
| auto name = fbb_->CreateString("OpenCLCache"); | |||||
| auto version = fbb_->CreateString(version_); | |||||
| auto gpu_cache = schema::CreateGpuCache(*fbb_, name, version, data); | |||||
| fbb_->Finish(gpu_cache); | |||||
| uint8_t *buf = fbb_->GetBufferPointer(); | |||||
| lite::WriteToBin(cache_path_, reinterpret_cast<void *>(buf), fbb_->GetSize()); | |||||
| MS_LOG(INFO) << "store opencl cache ok, size=" << fbb_->GetSize(); | |||||
| delete fbb_; | |||||
| } | |||||
| } | |||||
| } // namespace mindspore::lite::opencl | } // namespace mindspore::lite::opencl | ||||
| @@ -27,6 +27,7 @@ j* you may not use this file except in compliance with the License. | |||||
| #include "src/common/log_adapter.h" | #include "src/common/log_adapter.h" | ||||
| #include "src/runtime/opencl/opencl_wrapper.h" | #include "src/runtime/opencl/opencl_wrapper.h" | ||||
| #include "src/runtime/opencl/opencl_allocator.h" | #include "src/runtime/opencl/opencl_allocator.h" | ||||
| #include "schema/gpu_cache_generated.h" | |||||
| namespace mindspore::lite::opencl { | namespace mindspore::lite::opencl { | ||||
| @@ -107,7 +108,7 @@ class OpenCLRuntime { | |||||
| } | } | ||||
| cl::Program CreateProgramFromIL(const std::vector<char> &binary, const std::string &flag); | cl::Program CreateProgramFromIL(const std::vector<char> &binary, const std::string &flag); | ||||
| cl::Program CreateProgramFromBinary(const std::vector<std::vector<unsigned char>> &binary, const std::string &flag); | |||||
| cl::Program CreateProgramFromBinary(const std::vector<unsigned char> &binary, const std::string &flag); | |||||
| cl::Kernel GetKernelFromBinary(const std::string &kernel_name); | cl::Kernel GetKernelFromBinary(const std::string &kernel_name); | ||||
| std::vector<std::vector<unsigned char>> GetProgramBinaries(const cl::Program &program); | std::vector<std::vector<unsigned char>> GetProgramBinaries(const cl::Program &program); | ||||
| bool LoadSource(const std::string &program_name, const std::string &source); | bool LoadSource(const std::string &program_name, const std::string &source); | ||||
| @@ -139,6 +140,10 @@ class OpenCLRuntime { | |||||
| */ | */ | ||||
| int GetKernelMaxWorkGroupSize(cl_kernel kernel, cl_device_id device_id); | int GetKernelMaxWorkGroupSize(cl_kernel kernel, cl_device_id device_id); | ||||
| void InitGpuCache(); | |||||
| int LoadCache(const void *buf); | |||||
| void StoreCache(); | |||||
| private: | private: | ||||
| static OpenCLRuntime *GetInstance(); | static OpenCLRuntime *GetInstance(); | ||||
| static void DeleteInstance(); | static void DeleteInstance(); | ||||
| @@ -171,6 +176,11 @@ class OpenCLRuntime { | |||||
| cl_uint image_pitch_align_{0}; | cl_uint image_pitch_align_{0}; | ||||
| std::vector<size_t> max_work_item_sizes_; | std::vector<size_t> max_work_item_sizes_; | ||||
| void *handle_{nullptr}; | void *handle_{nullptr}; | ||||
| std::map<std::string, std::vector<unsigned char>> binary_map_; | |||||
| std::string cache_path_{"/data/local/tmp/opencl_cache"}; | |||||
| const std::string version_{"V0.1"}; | |||||
| bool need_write_{false}; | |||||
| bool enable_cache_{false}; | |||||
| }; | }; | ||||
| class OpenCLRuntimeWrapper { | class OpenCLRuntimeWrapper { | ||||