| @@ -31,6 +31,7 @@ | |||
| #include "proto/node_def.pb.h" | |||
| #include "backend/session/anf_runtime_algorithm.h" | |||
| #include "backend/kernel_compiler/aicpu/aicpu_util.h" | |||
| #include "backend/kernel_compiler/aicpu/aicpu_kernel_load.h" | |||
| #include "backend/session/kernel_graph.h" | |||
| #include "backend/kernel_compiler/common_utils.h" | |||
| #include "backend/kernel_compiler/oplib/oplib.h" | |||
| @@ -423,6 +424,11 @@ KernelModPtr AicpuOpBuild(const std::shared_ptr<AnfNode> &anf_node) { | |||
| if (!SetIOSize(anf_node, kernel_mod_ptr)) { | |||
| MS_LOG(EXCEPTION) << "Set input output size list failed."; | |||
| } | |||
| if (!AicpuOpKernelLoad::GetInstance().LoadAicpuKernelSo(anf_node, kernel_mod_ptr)) { | |||
| MS_LOG(EXCEPTION) << "Aicpu kernel so load failed. task is " << anf_node->fullname_with_scope(); | |||
| } | |||
| return kernel_mod_ptr; | |||
| } | |||
| } // namespace kernel | |||
| @@ -0,0 +1,373 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/aicpu/aicpu_kernel_load.h" | |||
#include <dlfcn.h>
#include <unistd.h>
#include <climits>
#include <fstream>
#include <ios>
#include <map>
#include <string>
#include <utility>
#include <vector>
#include "runtime/kernel.h"
#include "runtime/mem.h"
#include "runtime/context.h"
#include "utils/utils.h"
#include "utils/file_utils.h"
#include "backend/session/anf_runtime_algorithm.h"
| namespace mindspore { | |||
| namespace kernel { | |||
| bool AicpuOpKernelLoad::GetBinaryFileName(const std::string &so_name, const std::string &bin_folder_path, | |||
| std::string *bin_file_path) { | |||
| MS_EXCEPTION_IF_NULL(bin_file_path); | |||
| const auto &iter = so_name_and_realpath_map_.find(so_name); | |||
| if (iter != so_name_and_realpath_map_.end()) { | |||
| *bin_file_path = iter->second; | |||
| MS_LOG(INFO) << "so " << so_name << " has bin file path " << bin_file_path; | |||
| return true; | |||
| } | |||
| std::string bin_file_name(bin_folder_path); | |||
| if (bin_file_name.empty()) { | |||
| bin_file_name = "./"; | |||
| } else if (bin_file_name.back() != '/') { | |||
| bin_file_name.append("/"); | |||
| } | |||
| bin_file_name += so_name; | |||
| auto real_file_path = FileUtils::GetRealPath(bin_file_name.c_str()); | |||
| if (!real_file_path.has_value()) { | |||
| MS_LOG(ERROR) << "Get real path failed, path=" << bin_file_name; | |||
| return false; | |||
| } | |||
| auto real_file_path_value = real_file_path.value(); | |||
| if (access(real_file_path_value.c_str(), F_OK) == -1) { | |||
| MS_LOG(ERROR) << "Kernel so path:" << real_file_path_value << " is not existed!"; | |||
| return false; | |||
| } | |||
| *bin_file_path = real_file_path_value; | |||
| so_name_and_realpath_map_[so_name] = *bin_file_path; | |||
| return true; | |||
| } | |||
| bool AicpuOpKernelLoad::ReadBytesFromBinaryFile(const std::string &file_name, std::vector<char> *buffer) const { | |||
| std::ifstream file(file_name.c_str(), std::ios::binary | std::ios::ate); | |||
| if (!file.is_open()) { | |||
| MS_LOG(ERROR) << "Open file [" << file_name << "] failed"; | |||
| return false; | |||
| } | |||
| std::streamsize size = file.tellg(); | |||
| if (size <= 0) { | |||
| file.close(); | |||
| MS_LOG(ERROR) << "Empty file [" << file_name << "], please check this file."; | |||
| return false; | |||
| } | |||
| if (size > INT_MAX) { | |||
| file.close(); | |||
| MS_LOG(ERROR) << "File [" << file_name << "] size [" << size << "] is out of limit[" << INT_MAX << "]"; | |||
| return false; | |||
| } | |||
| file.seekg(0, std::ios::beg); | |||
| buffer->resize(size); | |||
| file.read(buffer->data(), size); | |||
| file.close(); | |||
| return true; | |||
| } | |||
// Compute the folder from which custom aicpu so files should be loaded.
// Uses dladdr on `this` to find which shared object the current code was
// loaded from, then appends "/lib/" to that object's directory.
// NOTE(review): the so_name parameter is currently unused — presumably kept
// for interface symmetry; confirm before removing.
bool AicpuOpKernelLoad::GetSoNeedLoadPath(const std::string &so_name, std::string *file_path) const {
  MS_EXCEPTION_IF_NULL(file_path);
  Dl_info dl_info;
  // dladdr returns 0 when the address cannot be mapped to a loaded object.
  if (dladdr(reinterpret_cast<void *>(const_cast<AicpuOpKernelLoad *>(this)), &dl_info) == 0) {
    MS_LOG(ERROR) << "Get dladdr failed!";
    return false;
  }
  // dli_fname is the full path of the shared object containing `this`.
  std::string cust_kernel_so_path(dl_info.dli_fname);
  auto pos = cust_kernel_so_path.find_last_of('/');
  if (cust_kernel_so_path.empty() || pos == std::string::npos) {
    MS_LOG(ERROR) << "Current path [" << cust_kernel_so_path << "] is invalid.";
    return false;
  }
  // Kernel so files are expected under <dir of this library>/lib/.
  auto real_cust_kernel_so_path = cust_kernel_so_path.substr(0, pos) + "/lib/";
  if (real_cust_kernel_so_path.size() > PATH_MAX) {
    MS_LOG(ERROR) << "Current path [" << real_cust_kernel_so_path << "] is too long.";
    return false;
  }
  *file_path = real_cust_kernel_so_path;
  return true;
}
| bool AicpuOpKernelLoad::PackageBinaryFile(const std::string &so_name, | |||
| std::map<std::string, OpKernelBinPtr> *so_name_with_bin_info) { | |||
| std::string bin_folder_path; | |||
| bool ret = GetSoNeedLoadPath(so_name, &bin_folder_path); | |||
| if (!ret) { | |||
| MS_LOG(ERROR) << "GetSoNeedLoadPath failed."; | |||
| return false; | |||
| } | |||
| std::string bin_file_path; | |||
| ret = GetBinaryFileName(so_name, bin_folder_path, &bin_file_path); | |||
| if (!ret) { | |||
| MS_LOG(ERROR) << "GetBinaryFileName failed."; | |||
| return false; | |||
| } | |||
| std::vector<char> buffer; | |||
| ret = ReadBytesFromBinaryFile(bin_file_path, &buffer); | |||
| if (!ret) { | |||
| MS_LOG(ERROR) << "ReadBytesFromBinaryFile failed."; | |||
| return false; | |||
| } | |||
| OpKernelBinPtr cust_aicpu_kernel_ptr = std::make_shared<OpKernelBin>(so_name, std::move(buffer)); | |||
| if (cust_aicpu_kernel_ptr == nullptr) { | |||
| MS_LOG(ERROR) << "Create OpKernelBin object failed."; | |||
| return false; | |||
| } | |||
| so_name_with_bin_info->insert({so_name, cust_aicpu_kernel_ptr}); | |||
| return true; | |||
| } | |||
// Stage a custom aicpu kernel so for later device upload (LaunchAicpuKernelSo).
// Host-side only: reads the so bytes and caches them in cust_aicpu_so_, keyed
// by the current runtime context. Returns true for non-custom ops and for
// already-cached so files (nothing to do).
bool AicpuOpKernelLoad::LoadAicpuKernelSo(const AnfNodePtr &node,
                                          const std::shared_ptr<AicpuOpKernelMod> &kernel_mod_ptr) {
  std::lock_guard<std::mutex> lock(cust_aicpu_mutex_);
  MS_EXCEPTION_IF_NULL(node);
  MS_EXCEPTION_IF_NULL(kernel_mod_ptr);
  CNodePtr cnode = node->cast<CNodePtr>();
  MS_EXCEPTION_IF_NULL(cnode);
  // Ops without the cust_aicpu attr are built-in aicpu ops; nothing to load.
  if (!AnfAlgo::HasNodeAttr(kAttrCustAicpu, cnode)) {
    MS_LOG(INFO) << "Current aicpu ops:" << cnode->fullname_with_scope() << " isn't a custom ops.";
    return true;
  }
  // The attr holds the bare library name; wrap it as "lib<name>.so".
  std::string so_name = "lib" + AnfAlgo::GetNodeAttr<std::string>(cnode, kAttrCustAicpu) + ".so";
  // The two default so files ship with the runtime and never need uploading.
  if (so_name == kLibAicpuKernelSoName || so_name == kLibCpuKernelSoName) {
    MS_LOG(INFO) << "Aicpu so:" << so_name << " is default so.";
    return true;
  }
  // Record the so on the kernel mod so Launch/GenTask use the custom path.
  kernel_mod_ptr->SetCustSo(so_name);
  rtContext_t rt_cur_ctx = nullptr;
  auto rt_error = rtCtxGetCurrent(&rt_cur_ctx);
  if (rt_error != RT_ERROR_NONE) {
    MS_LOG(ERROR) << "Call rtCtxGetCurrent failed, ret = 0x" << rt_error;
    return false;
  }
  // use current context as resource key
  uintptr_t resource_id = reinterpret_cast<uintptr_t>(rt_cur_ctx);
  auto it = cust_aicpu_so_.find(resource_id);
  if (it != cust_aicpu_so_.end()) {
    auto it_so_name = it->second.find(so_name);
    if (it_so_name != it->second.end()) {
      MS_LOG(INFO) << "Cust aicpu so:" << so_name << " has been loaded.";
      return true;
    }
  }
  // Read the so bytes from disk into host memory.
  std::map<std::string, OpKernelBinPtr> so_name_with_bin_info;
  if (!PackageBinaryFile(so_name, &so_name_with_bin_info)) {
    MS_LOG(ERROR) << "Package binary file failed.";
    return false;
  }
  // First so for this context: install the whole map.
  // (`it` remains valid: PackageBinaryFile does not touch cust_aicpu_so_.)
  if (it == cust_aicpu_so_.end()) {
    cust_aicpu_so_[resource_id] = so_name_with_bin_info;
    MS_LOG(INFO) << "Load new aicpu so:" << so_name << "success, resource id:" << resource_id << ".";
    return true;
  }
  // Context already known: merge the new so into its map.
  auto it_so_name = it->second.find(so_name);
  if (it_so_name == it->second.end()) {
    it->second.insert(so_name_with_bin_info.begin(), so_name_with_bin_info.end());
    MS_LOG(INFO) << "Load cust aicpu so:" << so_name << "success, resource id:" << resource_id << ".";
    return true;
  }
  return true;
}
// Copy every cached so (bytes + name) for `resource_id` to device memory and
// build the BatchLoadOpFromBufArgs argument block (*batch_args, also on
// device) consumed by the "batchLoadsoFrombuf" device kernel.
// Every rtMalloc'd pointer is appended to *allocated_mem so the caller can
// free it later via FreeDeviceMemory, even on a partial failure.
bool AicpuOpKernelLoad::CacheBinaryFileToDevice(const uintptr_t &resource_id, std::vector<void *> *allocated_mem,
                                                void **batch_args) {
  auto it = cust_aicpu_so_.find(resource_id);
  if (it == cust_aicpu_so_.end()) {
    MS_LOG(ERROR) << "Context id:" << resource_id << " is invalid.";
    return false;
  }
  rtError_t status;
  std::vector<CustAicpuSoBuf> v_cust_so;
  // Stage each so: device buffers for the binary payload and the name string.
  for (const auto &it_so : it->second) {
    const auto &so_name = it_so.first;
    const void *aicpu_data = it_so.second->GetBinData();
    uint32_t aicpu_data_length = it_so.second->GetBinDataSize();
    void *d_aicpu_data = nullptr;
    void *d_so_name = nullptr;
    status = rtMalloc(&d_aicpu_data, aicpu_data_length, RT_MEMORY_HBM);
    if (status != RT_ERROR_NONE) {
      MS_LOG(ERROR) << "Call rtMalloc failed, size:" << aicpu_data_length << ", ret = 0x" << status;
      return false;
    }
    allocated_mem->emplace_back(d_aicpu_data);
    status = rtMalloc(&d_so_name, so_name.size(), RT_MEMORY_HBM);
    if (status != RT_ERROR_NONE) {
      MS_LOG(ERROR) << "Call rtMalloc failed, size:" << so_name.size() << ", ret = 0x" << status;
      return false;
    }
    allocated_mem->emplace_back(d_so_name);
    status = rtMemcpy(d_aicpu_data, aicpu_data_length, aicpu_data, aicpu_data_length, RT_MEMCPY_HOST_TO_DEVICE);
    if (status != RT_ERROR_NONE) {
      MS_LOG(ERROR) << "Call rtMemcpy failed, ret = 0x" << status;
      return false;
    }
    status = rtMemcpy(d_so_name, so_name.size(), reinterpret_cast<const void *>(so_name.c_str()), so_name.size(),
                      RT_MEMCPY_HOST_TO_DEVICE);
    if (status != RT_ERROR_NONE) {
      MS_LOG(ERROR) << "Call rtMemcpy failed, ret = 0x" << status;
      return false;
    }
    // Device pointers travel as uint64_t in the packed ABI struct.
    CustAicpuSoBuf cust_aicpu_so_buf;
    cust_aicpu_so_buf.kernelSoBuf = static_cast<uint64_t>(reinterpret_cast<uintptr_t>(d_aicpu_data));
    cust_aicpu_so_buf.kernelSoBufLen = aicpu_data_length;
    cust_aicpu_so_buf.kernelSoName = static_cast<uint64_t>(reinterpret_cast<uintptr_t>(d_so_name));
    cust_aicpu_so_buf.kernelSoNameLen = so_name.size();
    v_cust_so.emplace_back(cust_aicpu_so_buf);
  }
  // Upload the per-so descriptor array.
  void *args = nullptr;
  uint32_t args_size = sizeof(CustAicpuSoBuf) * v_cust_so.size();
  status = rtMalloc(&args, args_size, RT_MEMORY_HBM);
  if (status != RT_ERROR_NONE) {
    MS_LOG(ERROR) << "Call rtMalloc failed, size:" << args_size << ", ret = 0x" << status;
    return false;
  }
  allocated_mem->emplace_back(args);
  status = rtMemcpy(args, args_size, v_cust_so.data(), args_size, RT_MEMCPY_HOST_TO_DEVICE);
  if (status != RT_ERROR_NONE) {
    MS_LOG(ERROR) << "Call rtMemcpy failed, ret = 0x" << status;
    return false;
  }
  // Upload the top-level batch descriptor pointing at the array above.
  BatchLoadOpFromBufArgs batch_cust_so;
  batch_cust_so.soNum = v_cust_so.size();
  batch_cust_so.args = static_cast<uint64_t>(reinterpret_cast<uintptr_t>(args));
  uint32_t batch_args_size = sizeof(BatchLoadOpFromBufArgs);
  status = rtMalloc(batch_args, batch_args_size, RT_MEMORY_HBM);
  if (status != RT_ERROR_NONE) {
    MS_LOG(ERROR) << "Call rtMalloc failed, size:" << batch_args_size << ", ret = 0x" << status;
    return false;
  }
  allocated_mem->emplace_back(*batch_args);
  status = rtMemcpy(*batch_args, batch_args_size, static_cast<void *>(&batch_cust_so), batch_args_size,
                    RT_MEMCPY_HOST_TO_DEVICE);
  if (status != RT_ERROR_NONE) {
    MS_LOG(ERROR) << "Call rtMemcpy failed, ret = 0x" << status;
    return false;
  }
  return true;
}
// Upload all so files cached for the current runtime context to the device
// and run the "batchLoadsoFrombuf" device kernel so the aicpu side registers
// them. Synchronous: waits for the load kernel to finish.
bool AicpuOpKernelLoad::LaunchAicpuKernelSo() {
  std::lock_guard<std::mutex> lock(cust_aicpu_mutex_);
  if (cust_aicpu_so_.empty()) {
    return true;
  }
  rtContext_t rt_cur_ctx = nullptr;
  rtError_t status = RT_ERROR_NONE;
  status = rtCtxGetCurrent(&rt_cur_ctx);
  if (status != RT_ERROR_NONE) {
    MS_LOG(ERROR) << "Call rtCtxGetCurrent failed, ret = 0x" << status;
    return false;
  }
  // use current context as resource key
  uintptr_t resource_id = reinterpret_cast<uintptr_t>(rt_cur_ctx);
  auto it = cust_aicpu_so_.find(resource_id);
  if (it == cust_aicpu_so_.end()) {
    MS_LOG(INFO) << "Cust aicpu so map is empty, context id:" << resource_id;
    return true;
  }
  std::vector<void *> allocated_mem;
  void *batch_args = nullptr;
  uint32_t batch_args_size = sizeof(BatchLoadOpFromBufArgs);
  bool ret = CacheBinaryFileToDevice(resource_id, &allocated_mem, &batch_args);
  // Record the allocations BEFORE checking ret on purpose: on partial failure
  // the already-allocated device buffers must still be released later by
  // FreeDeviceMemory.
  allocated_mem_list_.emplace_back(std::move(allocated_mem));
  if (!ret) {
    MS_LOG(ERROR) << "CacheBinaryFileToDevice is failed.";
    return false;
  }
  rtStream_t stream = nullptr;
  status = rtStreamCreate(&stream, 0);
  if (status != RT_ERROR_NONE) {
    MS_LOG(ERROR) << "Call rtStreamCreate failed, ret = 0x" << status;
    return false;
  }
  // Streams are destroyed later in FreeDeviceMemory.
  stream_list_.emplace_back(stream);
  // launch "batchLoadsoFrombuf" event to device.
  std::string load_event(kBatchLoadBuf);
  status = rtCpuKernelLaunch(nullptr, load_event.c_str(), 1, batch_args, batch_args_size, nullptr, stream);
  if (status != RT_ERROR_NONE) {
    MS_LOG(ERROR) << "Call rtCpuKernelLaunch failed, ret = 0x" << status;
    return false;
  }
  // Block until the device has finished registering the so files.
  status = rtStreamSynchronize(stream);
  if (status != RT_ERROR_NONE) {
    MS_LOG(ERROR) << "Call rtStreamSynchronize failed, ret = 0x" << status;
    return false;
  }
  MS_LOG(INFO) << "Aicpu kernel so launch success.";
  return true;
}
| void AicpuOpKernelLoad::FreeDeviceMemory() { | |||
| for (auto allocated_mem : allocated_mem_list_) { | |||
| for (auto mem : allocated_mem) { | |||
| if (mem == nullptr) { | |||
| continue; | |||
| } | |||
| auto rt_error = rtFree(mem); | |||
| if (rt_error != RT_ERROR_NONE) { | |||
| MS_LOG(EXCEPTION) << "Call rtFree failed, ret = 0x" << rt_error; | |||
| } | |||
| } | |||
| } | |||
| for (auto stream : stream_list_) { | |||
| if (stream != nullptr) { | |||
| auto rt_error = rtStreamDestroy(stream); | |||
| if (rt_error != RT_ERROR_NONE) { | |||
| MS_LOG(EXCEPTION) << "Call rtStreamDestroy failed, ret = 0x" << rt_error; | |||
| } | |||
| } | |||
| } | |||
| so_name_and_realpath_map_.clear(); | |||
| cust_aicpu_so_.clear(); | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -0,0 +1,78 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_AICPU_AICPU_KERNEL_LOAD_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_AICPU_AICPU_KERNEL_LOAD_H_ | |||
#include <cstdint>
#include <map>
#include <memory>
#include <mutex>
#include <string>
#include <vector>
#include "runtime/base.h"
#include "base/base.h"
#include "ir/anf.h"
#include "backend/kernel_compiler/aicpu/aicpu_util.h"
#include "backend/kernel_compiler/aicpu/aicpu_kernel_mod.h"
| namespace mindspore { | |||
| namespace kernel { | |||
// Name of the device-side kernel that batch-registers uploaded so files.
constexpr auto kBatchLoadBuf = "batchLoadsoFrombuf";
// ABI structs shared with the aicpu device kernel: packed to 1-byte alignment
// so the layout matches the device side exactly. Device addresses are carried
// as uint64_t.
#pragma pack(push, 1)
// One so file staged in device memory: payload buffer + its name string.
struct CustAicpuSoBuf {
  uint64_t kernelSoBuf;      // device address of the so binary
  uint32_t kernelSoBufLen;   // length of the so binary in bytes
  uint64_t kernelSoName;     // device address of the so name string
  uint32_t kernelSoNameLen;  // length of the so name in bytes
};
// Argument block for kBatchLoadBuf: an array of CustAicpuSoBuf on device.
struct BatchLoadOpFromBufArgs {
  uint32_t soNum;  // number of CustAicpuSoBuf entries
  uint64_t args;   // device address of the CustAicpuSoBuf array
};
#pragma pack(pop)
| class AicpuOpKernelLoad { | |||
| public: | |||
| AicpuOpKernelLoad() = default; | |||
| ~AicpuOpKernelLoad() = default; | |||
| static AicpuOpKernelLoad &GetInstance() { | |||
| static AicpuOpKernelLoad instance; | |||
| return instance; | |||
| } | |||
| bool LaunchAicpuKernelSo(); | |||
| bool LoadAicpuKernelSo(const AnfNodePtr &node, const std::shared_ptr<AicpuOpKernelMod> &kernel_mod_ptr); | |||
| void FreeDeviceMemory(); | |||
| private: | |||
| bool GetBinaryFileName(const std::string &so_name, const std::string &bin_folder_path, std::string *bin_file_path); | |||
| bool ReadBytesFromBinaryFile(const std::string &file_name, std::vector<char> *buffer) const; | |||
| bool GetSoNeedLoadPath(const std::string &so_name, std::string *file_path) const; | |||
| bool PackageBinaryFile(const std::string &so_name, std::map<std::string, OpKernelBinPtr> *so_name_with_bin_info); | |||
| bool CacheBinaryFileToDevice(const uintptr_t &resource_id, std::vector<void *> *allocated_mem, void **batch_args); | |||
| std::map<std::string, std::string> so_name_and_realpath_map_; | |||
| std::map<uintptr_t, std::map<std::string, OpKernelBinPtr>> cust_aicpu_so_; | |||
| std::mutex cust_aicpu_mutex_; | |||
| std::vector<rtStream_t> stream_list_; | |||
| std::vector<std::vector<void *>> allocated_mem_list_; | |||
| }; | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_AICPU_AICPU_KERNEL_LOAD_H_ | |||
| @@ -36,9 +36,6 @@ using HostDynamicKernel = mindspore::device::ascend::HostDynamicKernel; | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| constexpr auto AICPU_OPS_SO_NAME = "libaicpu_kernels.so"; | |||
| constexpr auto CUST_AICPU_OPS_SO_NAME = "libcpu_kernels.so"; | |||
| AicpuOpKernelMod::AicpuOpKernelMod() : anf_node_(nullptr) {} | |||
| AicpuOpKernelMod::~AicpuOpKernelMod() { | |||
| @@ -63,6 +60,10 @@ void AicpuOpKernelMod::SetOutputList(const std::vector<int64_t> &outputList) { o | |||
| void AicpuOpKernelMod::SetNodeDef(const std::string &nodeDef) { (void)node_def_str_.assign(nodeDef); } | |||
| void AicpuOpKernelMod::SetExtInfo(const std::string &ext_info) { ext_info_ = ext_info; } | |||
| void AicpuOpKernelMod::SetNodeName(const std::string &node_name) { node_name_ = node_name; } | |||
| void AicpuOpKernelMod::SetCustSo(const std::string &cust_so) { | |||
| node_so_ = cust_so; | |||
| cust_kernel_ = true; | |||
| } | |||
| void AicpuOpKernelMod::SetAnfNode(const mindspore::AnfNodePtr &anf_node) { | |||
| MS_EXCEPTION_IF_NULL(anf_node); | |||
| anf_node_ = anf_node; | |||
| @@ -72,15 +73,17 @@ void AicpuOpKernelMod::CreateCpuKernelInfo(const std::vector<AddressPtr> &inputs | |||
| const std::vector<AddressPtr> &outputs) { | |||
| MS_LOG(INFO) << "CreateCpuKernelInfoOffline start"; | |||
| if (kCustAiCpuKernelOps.find(node_name_) != kCustAiCpuKernelOps.end()) { | |||
| node_so_ = CUST_AICPU_OPS_SO_NAME; | |||
| node_name_ = kCustRunApi; | |||
| } else if (kCacheKernelOps.find(node_name_) != kCacheKernelOps.end()) { | |||
| node_so_ = AICPU_OPS_SO_NAME; | |||
| node_name_ = kCustRunApi; | |||
| } else { | |||
| if (node_so_ != CUST_AICPU_OPS_SO_NAME) { | |||
| node_so_ = AICPU_OPS_SO_NAME; | |||
| if (!cust_kernel_) { | |||
| if (kCpuKernelOps.find(node_name_) != kCpuKernelOps.end()) { | |||
| node_so_ = kLibCpuKernelSoName; | |||
| node_name_ = kCpuRunApi; | |||
| } else if (kCacheKernelOps.find(node_name_) != kCacheKernelOps.end()) { | |||
| node_so_ = kLibAicpuKernelSoName; | |||
| node_name_ = kCpuRunApi; | |||
| } else { | |||
| if (node_so_ != kLibCpuKernelSoName) { | |||
| node_so_ = kLibAicpuKernelSoName; | |||
| } | |||
| } | |||
| } | |||
| // InputOutputAddr | |||
| @@ -149,12 +152,16 @@ bool AicpuOpKernelMod::Launch(const std::vector<AddressPtr> &inputs, const std:: | |||
| if (node_name_ == kStack) { | |||
| node_name_ = kPack; | |||
| } | |||
| auto flag = RT_KERNEL_DEFAULT; | |||
| if (cust_kernel_) { | |||
| flag = RT_KERNEL_CUSTOM_AICPU; | |||
| } | |||
| MS_LOG(INFO) << "Aicpu launch, node_so_:" << node_so_ << ", node name:" << node_name_ | |||
| << ", args_size:" << args_.length(); | |||
| if (rtCpuKernelLaunch(reinterpret_cast<const void *>(node_so_.c_str()), | |||
| reinterpret_cast<const void *>(node_name_.c_str()), 1, | |||
| reinterpret_cast<const void *>(args_.data()), static_cast<uint32_t>(args_.length()), nullptr, | |||
| stream_) != RT_ERROR_NONE) { | |||
| if (rtCpuKernelLaunchWithFlag(reinterpret_cast<const void *>(node_so_.c_str()), | |||
| reinterpret_cast<const void *>(node_name_.c_str()), 1, | |||
| reinterpret_cast<const void *>(args_.data()), static_cast<uint32_t>(args_.length()), | |||
| nullptr, stream_, flag) != RT_ERROR_NONE) { | |||
| MS_LOG(ERROR) << "Aicpu op launch failed!"; | |||
| return false; | |||
| @@ -168,15 +175,17 @@ std::vector<TaskInfoPtr> AicpuOpKernelMod::GenTask(const std::vector<AddressPtr> | |||
| MS_LOG(INFO) << "AicpuOpKernelMod GenTask start"; | |||
| stream_id_ = stream_id; | |||
| if (kCustAiCpuKernelOps.find(node_name_) != kCustAiCpuKernelOps.end()) { | |||
| node_so_ = CUST_AICPU_OPS_SO_NAME; | |||
| node_name_ = kCustRunApi; | |||
| } else if (kCacheKernelOps.find(node_name_) != kCacheKernelOps.end()) { | |||
| node_so_ = AICPU_OPS_SO_NAME; | |||
| node_name_ = kCustRunApi; | |||
| } else { | |||
| if (node_so_ != CUST_AICPU_OPS_SO_NAME) { | |||
| node_so_ = AICPU_OPS_SO_NAME; | |||
| if (!cust_kernel_) { | |||
| if (kCpuKernelOps.find(node_name_) != kCpuKernelOps.end()) { | |||
| node_so_ = kLibCpuKernelSoName; | |||
| node_name_ = kCpuRunApi; | |||
| } else if (kCacheKernelOps.find(node_name_) != kCacheKernelOps.end()) { | |||
| node_so_ = kLibAicpuKernelSoName; | |||
| node_name_ = kCpuRunApi; | |||
| } else { | |||
| if (node_so_ != kLibCpuKernelSoName) { | |||
| node_so_ = kLibAicpuKernelSoName; | |||
| } | |||
| } | |||
| } | |||
| std::vector<void *> input_data_addrs; | |||
| @@ -197,7 +206,7 @@ std::vector<TaskInfoPtr> AicpuOpKernelMod::GenTask(const std::vector<AddressPtr> | |||
| AicpuTaskInfoPtr task_info_ptr = std::make_shared<mindspore::ge::model_runner::AicpuTaskInfo>( | |||
| unique_name_, stream_id, node_so_, node_name_, node_def_str_, ext_info_, input_data_addrs, output_data_addrs, | |||
| NeedDump()); | |||
| NeedDump(), cust_kernel_); | |||
| MS_LOG(INFO) << "AicpuOpKernelMod GenTask end"; | |||
| return {task_info_ptr}; | |||
| @@ -39,6 +39,7 @@ class AicpuOpKernelMod : public AscendKernelMod { | |||
| void SetNodeDef(const std::string &nodeDef); | |||
| void SetExtInfo(const std::string &ext_info); | |||
| void SetNodeName(const std::string &node_name); | |||
| void SetCustSo(const std::string &cust_so); | |||
| /** | |||
| * @brief Build AICPU Engine kernel structure, and allocate device memory for offline task generate | |||
| @@ -56,6 +57,7 @@ class AicpuOpKernelMod : public AscendKernelMod { | |||
| const std::vector<size_t> &GetWorkspaceSizeList() const override; | |||
| private: | |||
| bool cust_kernel_{false}; | |||
| std::string args_; | |||
| std::string node_def_str_; | |||
| std::string node_name_; | |||
| @@ -17,6 +17,8 @@ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_AICPU_AICPU_UTIL_H_ | |||
| #include <cstdint> | |||
| #include <utility> | |||
| #include <memory> | |||
| #include <vector> | |||
| #include <map> | |||
| #include <set> | |||
| @@ -24,6 +26,8 @@ | |||
| #include "backend/kernel_compiler/kernel.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| constexpr auto kLibAicpuKernelSoName = "libaicpu_kernels.so"; | |||
| constexpr auto kLibCpuKernelSoName = "libcpu_kernels.so"; | |||
| constexpr auto kInitDataSetQueue = "InitDataSetQueue"; | |||
| constexpr auto kInitData = "InitData"; | |||
| constexpr auto kGetNext = "GetNext"; | |||
| @@ -55,7 +59,7 @@ constexpr auto kUpdateCache = "UpdateCache"; | |||
| constexpr auto kCacheSwapTable = "CacheSwapTable"; | |||
| constexpr auto kSubAndFilter = "SubAndFilter"; | |||
| constexpr auto kPadAndShift = "PadAndShift"; | |||
| constexpr auto kCustRunApi = "RunCpuKernel"; | |||
| constexpr auto kCpuRunApi = "RunCpuKernel"; | |||
| constexpr auto kDropout2D = "Dropout2D"; | |||
| constexpr auto kDropout3D = "Dropout3D"; | |||
| constexpr auto kMaskedSelect = "MaskedSelect"; | |||
| @@ -65,8 +69,8 @@ constexpr auto kSearchSorted = "SearchSorted"; | |||
| constexpr auto kResizeBilinear = "ResizeBilinear"; | |||
| constexpr auto kResizeBilinearGrad = "ResizeBilinearGrad"; | |||
| constexpr auto kScatterElements = "ScatterElements"; | |||
| const std::set<std::string> kCustAiCpuKernelOps{kIdentity, kMaskedSelect, kMaskedSelectGrad, kDynamicStitch, | |||
| kSearchSorted, kResizeBilinear, kResizeBilinearGrad, kScatterElements}; | |||
| const std::set<std::string> kCpuKernelOps{kIdentity, kMaskedSelect, kMaskedSelectGrad, kDynamicStitch, | |||
| kSearchSorted, kResizeBilinear, kResizeBilinearGrad, kScatterElements}; | |||
| const std::set<std::string> kCacheKernelOps{kUpdateCache, kCacheSwapTable, kSubAndFilter, | |||
| kPadAndShift, kDropout3D, kDropout2D}; | |||
| const std::set<std::string> kDynamicInputOps{ | |||
| @@ -118,6 +122,24 @@ class AicpuOpUtil { | |||
| // kernel id | |||
| static uint64_t KernelId_; | |||
| }; | |||
// Immutable holder for one kernel so file: its name and raw binary bytes.
// Non-copyable: the binary payload may be large, so it is moved in once.
class OpKernelBin {
 public:
  OpKernelBin(std::string name, std::vector<char> &&data) : name_(std::move(name)), data_(std::move(data)) {}
  ~OpKernelBin() = default;
  OpKernelBin(const OpKernelBin &) = delete;
  const OpKernelBin &operator=(const OpKernelBin &) = delete;

  const std::string &GetName() const { return name_; }
  // Fix: named cast instead of a C-style cast for the char* -> uint8_t* view.
  const uint8_t *GetBinData() const { return reinterpret_cast<const uint8_t *>(data_.data()); }
  size_t GetBinDataSize() const { return data_.size(); }

 private:
  std::string name_;        // so file name, e.g. "libfoo.so"
  std::vector<char> data_;  // raw so file contents
};
| using OpKernelBinPtr = std::shared_ptr<OpKernelBin>; | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -34,6 +34,7 @@ | |||
| #include "runtime/device/ascend/tasksink/task_generator.h" | |||
| #include "backend/session/anf_runtime_algorithm.h" | |||
| #include "backend/session/kernel_build_client.h" | |||
| #include "backend/kernel_compiler/aicpu/aicpu_kernel_load.h" | |||
| #ifndef ENABLE_SECURITY | |||
| #include "runtime/device/ascend/profiling/profiling_manager.h" | |||
| #include "runtime/device/ascend/profiling/profiling_utils.h" | |||
| @@ -286,6 +287,7 @@ void AscendKernelRuntime::ReleaseDeviceRes() { | |||
| if (mem_manager_ != nullptr) { | |||
| mem_manager_->FreeDeviceMemory(); | |||
| } | |||
| mindspore::kernel::AicpuOpKernelLoad::GetInstance().FreeDeviceMemory(); | |||
| auto rt_ret = rtRegTaskFailCallbackByModule(kModuleName, nullptr); | |||
| if (rt_ret != RT_ERROR_NONE) { | |||
| @@ -438,6 +440,9 @@ bool AscendKernelRuntime::Load(const session::KernelGraph &graph, bool is_task_s | |||
| if (!LoadTask(graph)) { | |||
| return false; | |||
| } | |||
| if (!mindspore::kernel::AicpuOpKernelLoad::GetInstance().LaunchAicpuKernelSo()) { | |||
| return false; | |||
| } | |||
| return true; | |||
| } | |||
| @@ -73,13 +73,14 @@ void AicpuTask::Distribute() { | |||
| // for data dump | |||
| input_output_addr_ = reinterpret_cast<void *>(reinterpret_cast<uint8_t *>(args_) + io_addr_offset); | |||
| auto dump_flag = task_info_->dump_flag() ? RT_KERNEL_DUMPFLAG : RT_KERNEL_DEFAULT; | |||
| auto cpu_flag = task_info_->cust_aicpu() ? RT_KERNEL_CUSTOM_AICPU : dump_flag; | |||
| MS_LOG(INFO) << "Distribute AicpuTask start, args_size = " << args_size << ", io_addrs_num =" << io_addrs_num | |||
| << ", so_name = " << task_info_->so_name() << ", kernel_name = " << task_info_->kernel_name() | |||
| << ", dump_flag = " << dump_flag; | |||
| rt_ret = rtCpuKernelLaunchWithFlag(reinterpret_cast<const void *>(task_info_->so_name().data()), | |||
| reinterpret_cast<const void *>(task_info_->kernel_name().data()), 1, args_, | |||
| args_size, nullptr, stream_, dump_flag); | |||
| args_size, nullptr, stream_, cpu_flag); | |||
| if (rt_ret != RT_ERROR_NONE) { | |||
| MS_LOG(EXCEPTION) << "Call rt api rtCpuKernelLaunchWithFlag failed, ret: " << rt_ret; | |||
| } | |||
| @@ -119,14 +119,15 @@ class AicpuTaskInfo : public TaskInfo { | |||
| AicpuTaskInfo(const std::string &op_name, uint32_t stream_id, const std::string &so_name, | |||
| const std::string &kernel_name, const std::string &node_def, const std::string &ext_info, | |||
| const std::vector<void *> &input_data_addrs, const std::vector<void *> &output_data_addrs, | |||
| bool dump_flag) | |||
| bool dump_flag, bool cust_aicpu = false) | |||
| : TaskInfo(op_name, stream_id, TaskInfoType::AICPU, dump_flag), | |||
| so_name_(so_name), | |||
| kernel_name_(kernel_name), | |||
| node_def_(node_def), | |||
| ext_info_(ext_info), | |||
| input_data_addrs_(input_data_addrs), | |||
| output_data_addrs_(output_data_addrs) {} | |||
| output_data_addrs_(output_data_addrs), | |||
| cust_aicpu_(cust_aicpu) {} | |||
| ~AicpuTaskInfo() override {} | |||
| const std::string &so_name() const { return so_name_; } | |||
| @@ -135,6 +136,7 @@ class AicpuTaskInfo : public TaskInfo { | |||
| const std::vector<void *> &input_data_addrs() const { return input_data_addrs_; } | |||
| const std::vector<void *> &output_data_addrs() const { return output_data_addrs_; } | |||
| const std::string &ext_info() const { return ext_info_; } | |||
| const bool &cust_aicpu() const { return cust_aicpu_; } | |||
| private: | |||
| std::string so_name_; | |||
| @@ -143,6 +145,7 @@ class AicpuTaskInfo : public TaskInfo { | |||
| std::string ext_info_; | |||
| std::vector<void *> input_data_addrs_; | |||
| std::vector<void *> output_data_addrs_; | |||
| bool cust_aicpu_; | |||
| }; | |||
| class LabelSetTaskInfo : public TaskInfo { | |||
| @@ -492,6 +492,7 @@ constexpr auto kAttrDstType = "dst_type"; | |||
| constexpr auto kAttrDump = "dump"; | |||
| constexpr auto kAttrSkipNopOpAddr = "skip_nop_op_addr"; | |||
| constexpr auto kAttrFuncType = "func_type"; | |||
| constexpr auto kAttrCustAicpu = "cust_aicpu"; | |||
| // custom operator func type | |||
| constexpr auto kCustomTypeAOT = "aot"; | |||
| @@ -631,6 +631,38 @@ def prim_attr_register(fn): | |||
| return deco | |||
def custom_aicpu_register(custom_aicpu_so="mindspore_aicpu_kernels"):
    """Register custom aicpu attribute.

    Decorates a primitive's ``__init__`` so the primitive carries a
    ``cust_aicpu`` attribute naming the dynamic library its aicpu kernel
    lives in.

    Args:
        custom_aicpu_so (str): Name of the dynamic library loaded by the
            aicpu ops. Default: "mindspore_aicpu_kernels".

    Returns:
        function, the decorator to apply to the primitive's ``__init__``.

    Raises:
        ValueError: If `custom_aicpu_so` is not a str.
    """
    # Fix: validate at decoration time instead of on every instantiation,
    # so a bad argument fails fast where the decorator is written.
    if not isinstance(custom_aicpu_so, str):
        raise ValueError(f"custom_aicpu_so must be a str, but got {custom_aicpu_so}")

    def deco(fn):
        def wrapper(self, *args, **kwargs):
            # Honor an optional substitute_name override on the class.
            class_name = self.__class__.__name__
            if hasattr(self.__class__, "substitute_name"):
                class_name = self.__class__.substitute_name
            if isinstance(self, PrimitiveWithInfer):
                PrimitiveWithInfer.__init__(self, class_name)
            elif isinstance(self, PrimitiveWithCheck):
                PrimitiveWithCheck.__init__(self, class_name)
            else:
                # Fix: use class_name here too; the original passed
                # self.__class__.__name__, silently ignoring substitute_name
                # for plain Primitive subclasses.
                Primitive.__init__(self, class_name)
            attr_name = "cust_aicpu"
            self.add_prim_attr(attr_name, custom_aicpu_so)
            self.init_attrs[attr_name] = custom_aicpu_so
            return fn(self, *args, **kwargs)
        return wrapper
    return deco
| def constexpr(fn=None, get_instance=True, name=None): | |||
| """ | |||
| Creates a PrimitiveWithInfer operator that can infer the value at compile time. We can use it to define a function | |||
| @@ -14,6 +14,7 @@ | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/kernel.h" | |||
| #include "backend/kernel_compiler/aicpu/aicpu_kernel_load.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| @@ -22,5 +23,8 @@ namespace kernel { | |||
| * @brief build op and return a callable mod | |||
| */ | |||
// Stub build: AICPU kernels are unavailable in this build configuration,
// so no kernel mod can be produced.
KernelModPtr AicpuOpBuild(const AnfNodePtr &anf_node) { return nullptr; }
// Stubs so callers link without the Ascend runtime: nothing to launch or free.
bool AicpuOpKernelLoad::LaunchAicpuKernelSo() { return true; }
void AicpuOpKernelLoad::FreeDeviceMemory() {}
| } // namespace kernel | |||
| } // namespace mindspore | |||