Merge pull request !28310 from TuDouNi/dynamic_shape_stage1tags/v1.6.0
| @@ -1,5 +1,6 @@ | |||
| file(GLOB_RECURSE KERNEL_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} | |||
| "kernel_build_info.cc" | |||
| "kernel.cc" | |||
| "kash/*.cc" | |||
| "common_utils.cc" | |||
| "oplib/*.cc" | |||
| @@ -12,6 +13,7 @@ endif() | |||
| if(ENABLE_D) | |||
| file(GLOB_RECURSE D_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} | |||
| "ascend_kernel_mod.cc" | |||
| "kernel_query.cc" | |||
| "tbe/*.cc" | |||
| "host/*.cc" | |||
| @@ -36,13 +36,12 @@ using HostDynamicKernel = mindspore::device::ascend::HostDynamicKernel; | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| AicpuOpKernelMod::AicpuOpKernelMod() : anf_node_(nullptr) {} | |||
| AicpuOpKernelMod::AicpuOpKernelMod() {} | |||
| AicpuOpKernelMod::~AicpuOpKernelMod() { | |||
| args_.clear(); | |||
| inputList_.clear(); | |||
| outputList_.clear(); | |||
| anf_node_ = nullptr; | |||
| input_list_.clear(); | |||
| output_list_.clear(); | |||
| input_size_list_.clear(); | |||
| output_size_list_.clear(); | |||
| workspace_size_list_.clear(); | |||
| @@ -55,9 +54,9 @@ void AicpuOpKernelMod::SetOutputSizeList(const std::vector<size_t> &size_list) { | |||
| const std::vector<size_t> &AicpuOpKernelMod::GetOutputSizeList() const { return output_size_list_; } | |||
| void AicpuOpKernelMod::SetWorkspaceSizeList(const std::vector<size_t> &size_list) { workspace_size_list_ = size_list; } | |||
| const std::vector<size_t> &AicpuOpKernelMod::GetWorkspaceSizeList() const { return workspace_size_list_; } | |||
| void AicpuOpKernelMod::SetInputList(const std::vector<int64_t> &inputList) { inputList_ = inputList; } | |||
| void AicpuOpKernelMod::SetOutputList(const std::vector<int64_t> &outputList) { outputList_ = outputList; } | |||
| void AicpuOpKernelMod::SetNodeDef(const std::string &nodeDef) { (void)node_def_str_.assign(nodeDef); } | |||
| void AicpuOpKernelMod::SetInputList(const std::vector<int64_t> &input_list) { input_list_ = input_list; } | |||
| void AicpuOpKernelMod::SetOutputList(const std::vector<int64_t> &output_list) { output_list_ = output_list; } | |||
| void AicpuOpKernelMod::SetNodeDef(const std::string &node_def) { (void)node_def_str_.assign(node_def); } | |||
| void AicpuOpKernelMod::SetExtInfo(const std::string &ext_info) { ext_info_ = ext_info; } | |||
| void AicpuOpKernelMod::SetNodeName(const std::string &node_name) { node_name_ = node_name; } | |||
| void AicpuOpKernelMod::SetCustSo(const std::string &cust_so) { | |||
| @@ -85,11 +84,18 @@ void AicpuOpKernelMod::CreateCpuKernelInfo(const std::vector<AddressPtr> &inputs | |||
| node_so_ = kLibAicpuKernelSoName; | |||
| } | |||
| } | |||
| } else { | |||
| if (kCpuKernelBaseOps.find(node_name_) == kCpuKernelBaseOps.end()) { | |||
| node_name_ = kCpuRunApi; | |||
| } | |||
| } else if (kCpuKernelBaseOps.find(node_name_) == kCpuKernelBaseOps.end()) { | |||
| node_name_ = kCpuRunApi; | |||
| } | |||
| if (node_name_ == kTopK) { | |||
| node_name_ = kTopKV2; | |||
| } | |||
| if (node_name_ == kStack) { | |||
| node_name_ = kPack; | |||
| } | |||
| // InputOutputAddr | |||
| vector<void *> io_addrs; | |||
| (void)std::transform(std::begin(inputs), std::end(inputs), std::back_inserter(io_addrs), | |||
| @@ -120,6 +126,8 @@ void AicpuOpKernelMod::CreateCpuKernelInfo(const std::vector<AddressPtr> &inputs | |||
| aicpu_param_head.extInfoAddr = 0; | |||
| } else { | |||
| MS_LOG(INFO) << "Dynamic Kernel Ext Info size:" << ext_info_.size(); | |||
| aicpu_param_head.extInfoLength = SizeToUint(ext_info_.size()); | |||
| aicpu_param_head.extInfoAddr = reinterpret_cast<uint64_t>(ext_info_addr_dev_); | |||
| } | |||
| args_.clear(); | |||
| @@ -162,6 +170,8 @@ bool AicpuOpKernelMod::Launch(const std::vector<AddressPtr> &inputs, const std:: | |||
| } | |||
| MS_LOG(INFO) << "Aicpu launch, node_so_:" << node_so_ << ", node name:" << node_name_ | |||
| << ", args_size:" << args_.length(); | |||
| // cppcheck-suppress unreadVariable | |||
| auto lock = AscendKernelMod::LockRuntime(); | |||
| if (rtCpuKernelLaunchWithFlag(reinterpret_cast<const void *>(node_so_.c_str()), | |||
| reinterpret_cast<const void *>(node_name_.c_str()), 1, | |||
| reinterpret_cast<const void *>(args_.data()), static_cast<uint32_t>(args_.length()), | |||
| @@ -25,6 +25,8 @@ namespace kernel { | |||
| class AicpuOpKernelMod : public AscendKernelMod { | |||
| public: | |||
| AicpuOpKernelMod(); | |||
| explicit AicpuOpKernelMod(const AnfNodePtr &anf_node_ptr) : AscendKernelMod(anf_node_ptr) {} | |||
| ~AicpuOpKernelMod() override; | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs, void *stream_ptr) override; | |||
| @@ -33,10 +35,10 @@ class AicpuOpKernelMod : public AscendKernelMod { | |||
| const std::vector<AddressPtr> &outputs, uint32_t stream_id) override; | |||
| device::DynamicKernelPtr GenDynamicKernel(const CNodePtr &cnode_ptr, void *stream_ptr) override; | |||
| void SetInputList(const std::vector<int64_t> &inputList); | |||
| void SetOutputList(const std::vector<int64_t> &outputList); | |||
| void SetInputList(const std::vector<int64_t> &input_list); | |||
| void SetOutputList(const std::vector<int64_t> &output_list); | |||
| void SetAnfNode(const AnfNodePtr &anf_node); | |||
| void SetNodeDef(const std::string &nodeDef); | |||
| void SetNodeDef(const std::string &node_def); | |||
| void SetExtInfo(const std::string &ext_info); | |||
| void SetNodeName(const std::string &node_name); | |||
| void SetCustSo(const std::string &cust_so); | |||
| @@ -56,16 +58,18 @@ class AicpuOpKernelMod : public AscendKernelMod { | |||
| const std::vector<size_t> &GetOutputSizeList() const override; | |||
| const std::vector<size_t> &GetWorkspaceSizeList() const override; | |||
| private: | |||
| bool cust_kernel_{false}; | |||
| protected: | |||
| std::string args_; | |||
| std::string node_def_str_; | |||
| std::string ext_info_; | |||
| std::string node_name_; | |||
| std::string node_so_; | |||
| std::string ext_info_; | |||
| std::vector<int64_t> inputList_; | |||
| std::vector<int64_t> outputList_; | |||
| AnfNodePtr anf_node_; | |||
| bool cust_kernel_{false}; | |||
| std::string node_def_str_; | |||
| void *ext_info_addr_dev_ = nullptr; | |||
| private: | |||
| std::vector<int64_t> input_list_; | |||
| std::vector<int64_t> output_list_; | |||
| std::vector<size_t> input_size_list_; | |||
| std::vector<size_t> output_size_list_; | |||
| @@ -0,0 +1,231 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/aicpu/dynamic_aicpu_kernel_mod.h" | |||
| #include <memory> | |||
| #include <vector> | |||
| #include <string> | |||
| #include <algorithm> | |||
| #include "runtime/mem.h" | |||
| #include "acl/acl_rt.h" | |||
| #include "utils/convert_utils.h" | |||
| #include "backend/kernel_compiler/aicpu/aicpu_util.h" | |||
| #include "utils/ms_context.h" | |||
| #include "runtime/device/kernel_runtime.h" | |||
| #include "runtime/kernel.h" | |||
| #include "utils/utils.h" | |||
| #include "backend/session/anf_runtime_algorithm.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| DynamicAicpuOpKernelMod::DynamicAicpuOpKernelMod(const AnfNodePtr &anf_node_ptr) : AicpuOpKernelMod(anf_node_ptr) { | |||
| unknow_type_ = device::ascend::UnknowShapeOpType::DEPEND_IN_SHAPE; | |||
| auto cnode = anf_node_ptr->cast<CNodePtr>(); | |||
| if (cnode != nullptr) { | |||
| auto op_name = AnfAlgo::GetCNodeName(cnode); | |||
| if (kComputeDepend.find(op_name) != kComputeDepend.end()) { | |||
| unknow_type_ = device::ascend::UnknowShapeOpType::DEPEND_COMPUTE; | |||
| } | |||
| } | |||
| } | |||
| DynamicAicpuOpKernelMod::~DynamicAicpuOpKernelMod() { | |||
| // free dev ptr | |||
| if (ext_info_addr_dev_ == nullptr) { | |||
| return; | |||
| } | |||
| auto ret = rtFree(ext_info_addr_dev_); | |||
| if (ret != RT_ERROR_NONE) { | |||
| MS_LOG(ERROR) << "rtFree failed"; | |||
| } | |||
| } | |||
| void DynamicAicpuOpKernelMod::InferOp() { | |||
| auto node = anf_node_.lock(); | |||
| MS_EXCEPTION_IF_NULL(node); | |||
| if (!AnfAlgo::IsDynamicShape(node)) { | |||
| MS_LOG(EXCEPTION) << "The node is not dynamic shape."; | |||
| } | |||
| KernelMod::InferShape(); | |||
| } | |||
| void DynamicAicpuOpKernelMod::InitOp() { | |||
| auto node = anf_node_.lock(); | |||
| MS_EXCEPTION_IF_NULL(node); | |||
| auto cnode = node->cast<CNodePtr>(); | |||
| MS_EXCEPTION_IF_NULL(cnode); | |||
| if (!AnfAlgo::IsDynamicShape(cnode)) { | |||
| MS_LOG(EXCEPTION) << "The node is not dynamic shape: " << cnode->fullname_with_scope(); | |||
| } | |||
| MS_LOG(INFO) << "UpdateExtInfo of " << cnode->fullname_with_scope() << " start"; | |||
| auto input_num = AnfAlgo::GetInputTensorNum(cnode); | |||
| auto output_num = AnfAlgo::GetOutputTensorNum(cnode); | |||
| if (input_num == 0 && output_num == 0) { | |||
| MS_LOG(INFO) << "Node:" << cnode->fullname_with_scope() << " no need to update output shape"; | |||
| return; | |||
| } | |||
| // Parse aicpu ext info | |||
| ext_info_handler_ = std::make_shared<device::ascend::AicpuExtInfoHandler>( | |||
| cnode->fullname_with_scope(), static_cast<uint32_t>(input_num), static_cast<uint32_t>(output_num), unknow_type_); | |||
| MS_EXCEPTION_IF_NULL(ext_info_handler_); | |||
| if (!ext_info_handler_->Parse(ext_info_)) { | |||
| MS_LOG(EXCEPTION) << "Parse AiCpu ext_info_handler failed"; | |||
| } | |||
| if (ext_info_.empty()) { | |||
| MS_LOG(INFO) << "No need to copy to device, ext_info_ is empty. "; | |||
| return; | |||
| } | |||
| for (size_t i = 0; i < input_num; ++i) { | |||
| if (!ext_info_handler_->UpdateInputShapeAndType(i, NOT_NULL(cnode))) { | |||
| MS_LOG(EXCEPTION) << "Update input shape failed, cnode:" << cnode->fullname_with_scope() << " input:" << i; | |||
| } | |||
| } | |||
| if (unknow_type_ != device::ascend::UnknowShapeOpType::DEPEND_COMPUTE) { | |||
| for (size_t i = 0; i < output_num; ++i) { | |||
| if (!ext_info_handler_->UpdateOutputShapeAndType(i, NOT_NULL(cnode))) { | |||
| MS_LOG(EXCEPTION) << "Update output shape failed, cnode:" << cnode->fullname_with_scope() << " output:" << i; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| void DynamicAicpuOpKernelMod::AllocateExtInfoDeviceAddr(const CNodePtr &cnode) { | |||
| MS_EXCEPTION_IF_NULL(cnode); | |||
| if (ext_info_addr_dev_ != nullptr) { | |||
| return; | |||
| } | |||
| // Allocate ext info addr in device | |||
| if (ext_info_.size() != 0) { | |||
| auto ret = rtMalloc(&ext_info_addr_dev_, ext_info_.size(), RT_MEMORY_HBM); | |||
| if (ret != RT_ERROR_NONE) { | |||
| MS_LOG(EXCEPTION) << "Call rtMalloc ext_info_addr_dev_ failed. Op name: " << cnode->fullname_with_scope(); | |||
| } | |||
| } | |||
| ext_info_size_ = ext_info_.size(); | |||
| } | |||
| bool DynamicAicpuOpKernelMod::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &, | |||
| const std::vector<AddressPtr> &outputs, void *stream_ptr) { | |||
| if (stream_ptr == nullptr) { | |||
| MS_LOG(ERROR) << "stream_ptr should not be nullptr."; | |||
| return false; | |||
| } | |||
| if (stream_ == nullptr) { | |||
| stream_ = stream_ptr; | |||
| } | |||
| auto node = anf_node_.lock(); | |||
| MS_EXCEPTION_IF_NULL(node); | |||
| auto cnode = node->cast<CNodePtr>(); | |||
| MS_EXCEPTION_IF_NULL(cnode); | |||
| MS_LOG(INFO) << "Start launch of node: " << cnode->fullname_with_scope(); | |||
| // is dynamic shape | |||
| if (!AnfAlgo::IsDynamicShape(cnode)) { | |||
| MS_LOG(EXCEPTION) << "The cnode is not dynamic shape:" << cnode->fullname_with_scope(); | |||
| } | |||
| // copy extinfo to device | |||
| AllocateExtInfoDeviceAddr(cnode); | |||
| MS_EXCEPTION_IF_NULL(ext_info_handler_); | |||
| auto ret = aclrtMemcpy(ext_info_addr_dev_, ext_info_size_, ext_info_handler_->GetExtInfo(), | |||
| ext_info_handler_->GetExtInfoLen(), ACL_MEMCPY_HOST_TO_DEVICE); | |||
| if (ret != RT_ERROR_NONE) { | |||
| MS_LOG(ERROR) << "UpdateExtInfo aclrtMemcpy failed. Node info: " << cnode->fullname_with_scope(); | |||
| return false; | |||
| } | |||
| AicpuOpKernelMod::CreateCpuKernelInfo(inputs, outputs); | |||
| MS_LOG(INFO) << "Aicpu launch, node_so_:" << node_so_ << ", node name:" << node_name_ | |||
| << ", args_size:" << args_.length(); | |||
| // cppcheck-suppress unreadVariable | |||
| auto lock = AscendKernelMod::LockRuntime(); | |||
| ret = rtCpuKernelLaunchWithFlag(reinterpret_cast<const void *>(node_so_.c_str()), | |||
| reinterpret_cast<const void *>(node_name_.c_str()), 1, | |||
| reinterpret_cast<const void *>(args_.data()), static_cast<uint32_t>(args_.length()), | |||
| nullptr, stream_, RT_KERNEL_DEFAULT); | |||
| if (ret != RT_ERROR_NONE) { | |||
| MS_LOG(ERROR) << "Aicpu op launch failed!"; | |||
| return false; | |||
| } | |||
| if (unknow_type_ == device::ascend::UnknowShapeOpType::DEPEND_COMPUTE) { | |||
| ret = aclrtMemcpyAsync(ext_info_handler_->GetExtInfo(), ext_info_handler_->GetExtInfoLen(), ext_info_addr_dev_, | |||
| ext_info_size_, ACL_MEMCPY_DEVICE_TO_HOST, stream_); | |||
| if (ret != RT_ERROR_NONE) { | |||
| MS_LOG(ERROR) << "aclrtMemcpyAsync output shape failed. Op name: " << cnode->fullname_with_scope(); | |||
| return false; | |||
| } | |||
| } | |||
| return true; | |||
| } | |||
| void DynamicAicpuOpKernelMod::UpdateOp() { | |||
| auto node = anf_node_.lock(); | |||
| MS_EXCEPTION_IF_NULL(node); | |||
| auto cnode = node->cast<CNodePtr>(); | |||
| MS_EXCEPTION_IF_NULL(cnode); | |||
| MS_LOG(INFO) << "Aicpu " << cnode->fullname_with_scope() << " PostExecute"; | |||
| // is dynamic shape | |||
| if (!AnfAlgo::IsDynamicShape(cnode)) { | |||
| MS_LOG(EXCEPTION) << "The cnode is not dynamic shape:" << cnode->fullname_with_scope(); | |||
| } | |||
| if (unknow_type_ != device::ascend::UnknowShapeOpType::DEPEND_COMPUTE) { | |||
| MS_LOG(INFO) << "Node " << node->fullname_with_scope() << " update op skip."; | |||
| return; | |||
| } | |||
| // cppcheck-suppress unreadVariable | |||
| auto lock = AscendKernelMod::LockRuntime(); | |||
| auto ret = rtStreamSynchronize(stream_); | |||
| if (ret != RT_ERROR_NONE) { | |||
| MS_LOG(EXCEPTION) << "Call runtime rtStreamSynchronize failed. Op name: " << cnode->fullname_with_scope(); | |||
| } | |||
| MS_LOG(INFO) << "Update aicpu kernel output shape from ext_info. Op name: " << cnode->fullname_with_scope(); | |||
| UpdateOutputShapeFromExtInfo(cnode); | |||
| } | |||
| bool DynamicAicpuOpKernelMod::UpdateOutputShapeFromExtInfo(const CNodePtr &cnode) { | |||
| MS_EXCEPTION_IF_NULL(cnode); | |||
| MS_LOG(INFO) << "UpdateOutputShapeFromExtInfo start. Op name " << cnode->fullname_with_scope(); | |||
| MS_EXCEPTION_IF_NULL(ext_info_handler_); | |||
| std::vector<TypeId> type_ids; | |||
| std::vector<std::vector<size_t>> shapes; | |||
| auto output_num = AnfAlgo::GetOutputTensorNum(cnode); | |||
| for (size_t i = 0; i < output_num; ++i) { | |||
| MS_LOG(INFO) << "Get output:" << output_num << " Shape"; | |||
| std::vector<int64_t> shape; | |||
| TypeId type_id; | |||
| (void)ext_info_handler_->GetOutputShapeAndType(SizeToUint(i), NOT_NULL(&shape), NOT_NULL(&type_id)); | |||
| type_ids.emplace_back(type_id); | |||
| std::vector<size_t> size_t_shape; | |||
| std::transform(shape.begin(), shape.end(), std::back_inserter(size_t_shape), LongToSize); | |||
| shapes.emplace_back(size_t_shape); | |||
| } | |||
| AnfAlgo::SetOutputInferTypeAndShape(type_ids, shapes, cnode.get()); | |||
| return true; | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -0,0 +1,54 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_AICPU_DYNAMIC_AICPU_KERNEL_MOD_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_AICPU_DYNAMIC_AICPU_KERNEL_MOD_H_ | |||
| #include <vector> | |||
| #include <memory> | |||
| #include <string> | |||
| #include "backend/kernel_compiler/aicpu/aicpu_kernel_mod.h" | |||
| #include "backend/kernel_compiler/aicpu/aicpu_util.h" | |||
| #include "runtime/device/ascend/executor/aicpu_ext_info_handle.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| class DynamicAicpuOpKernelMod : public AicpuOpKernelMod { | |||
| public: | |||
| DynamicAicpuOpKernelMod() : unknow_type_(device::ascend::UnknowShapeOpType::DEPEND_IN_SHAPE) {} | |||
| explicit DynamicAicpuOpKernelMod(const AnfNodePtr &anf_node_ptr); | |||
| ~DynamicAicpuOpKernelMod() override; | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs, void *stream_ptr) override; | |||
| void InferOp() override; | |||
| void InitOp() override; | |||
| void UpdateOp() override; | |||
| private: | |||
| void AllocateExtInfoDeviceAddr(const CNodePtr &cnode); | |||
| bool UpdateOutputShapeFromExtInfo(const CNodePtr &cnode); | |||
| std::shared_ptr<device::ascend::AicpuExtInfoHandler> ext_info_handler_ = nullptr; | |||
| size_t ext_info_size_ = 0; | |||
| device::ascend::UnknowShapeOpType unknow_type_; | |||
| }; | |||
| using DynamicAicpuOpKernelModPtr = std::shared_ptr<DynamicAicpuOpKernelMod>; | |||
| using DynamicAicputOpKernelModPtrList = std::vector<DynamicAicpuOpKernelModPtr>; | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_AICPU_DYNAMIC_AICPU_KERNEL_MOD_H_ | |||
| @@ -0,0 +1,35 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/ascend_kernel_mod.h" | |||
| #include "runtime/rt.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| void AscendKernelMod::UpdateOp() { | |||
| MS_EXCEPTION_IF_NULL(stream_); | |||
| // cppcheck-suppress unreadVariable | |||
| auto lock = LockRuntime(); | |||
| if (RT_ERROR_NONE != rtStreamSynchronize(stream_)) { | |||
| MS_LOG(EXCEPTION) << "Call runtime rtStreamSynchronize failed."; | |||
| } | |||
| } | |||
| std::lock_guard<std::mutex> AscendKernelMod::LockRuntime() { | |||
| static std::mutex mutex; | |||
| return std::lock_guard<std::mutex>(mutex); | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -31,6 +31,8 @@ namespace mindspore { | |||
| namespace kernel { | |||
| class AscendKernelMod : public KernelMod { | |||
| public: | |||
| AscendKernelMod() {} | |||
| explicit AscendKernelMod(const AnfNodePtr &anf_node_ptr) : KernelMod(anf_node_ptr) {} | |||
| virtual std::vector<TaskInfoPtr> GenTask(const std::vector<AddressPtr> &, const std::vector<AddressPtr> &, | |||
| const std::vector<AddressPtr> &, uint32_t) = 0; | |||
| uint32_t block_dim() { return block_dim_; } | |||
| @@ -44,6 +46,7 @@ class AscendKernelMod : public KernelMod { | |||
| return false; | |||
| #endif | |||
| } | |||
| void UpdateOp() override; | |||
| void InitDynamicKernel(const CNodePtr &cnode_ptr, void *stream) { | |||
| if (dynamic_kernel_ == nullptr) { | |||
| @@ -54,6 +57,8 @@ class AscendKernelMod : public KernelMod { | |||
| } | |||
| device::DynamicKernelPtr DynamicKernel() const { return dynamic_kernel_; } | |||
| static std::lock_guard<std::mutex> LockRuntime(); | |||
| protected: | |||
| uint32_t block_dim_{1}; | |||
| uint32_t stream_id_{0}; | |||
| @@ -66,7 +66,13 @@ HcclKernelFactory &HcclKernelFactory::Get() { | |||
| HcclKernel::HcclKernel() | |||
| : hccl_count_(0), op_type_(::HcclReduceOp::HCCL_REDUCE_SUM), root_id_(0), src_rank_(0), dest_rank_(0) {} | |||
| HcclKernel::HcclKernel(const AnfNodePtr &anf_node) | |||
| : AscendKernelMod(), | |||
| hccl_count_(0), | |||
| op_type_(::HcclReduceOp::HCCL_REDUCE_SUM), | |||
| root_id_(0), | |||
| src_rank_(0), | |||
| dest_rank_(0) {} | |||
| HcclKernel::~HcclKernel() { | |||
| hccl_kernel_input_shape_list_.clear(); | |||
| hccl_kernel_output_shape_list_.clear(); | |||
| @@ -294,5 +300,99 @@ device::DynamicKernelPtr HcclKernel::GenDynamicKernel(const CNodePtr &cnode_ptr, | |||
| hccl_type, input_data_addr, output_data_addr, hccl_count_, data_type, op_type_, root_id_, stream_ptr, cnode_ptr); | |||
| return executor; | |||
| } | |||
| void HcclKernel::InferOp() { | |||
| if (AnfAlgo::IsDynamicShape(anf_node_.lock())) { | |||
| KernelMod::InferShape(); | |||
| } | |||
| } | |||
| bool HcclKernel::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &, | |||
| const std::vector<AddressPtr> &outputs, void *stream_ptr) { | |||
| auto node = anf_node_.lock(); | |||
| MS_EXCEPTION_IF_NULL(node); | |||
| if (!node->isa<CNode>()) { | |||
| MS_LOG(EXCEPTION) << "anfnode is not a cnode"; | |||
| } | |||
| auto cnode = node->cast<CNodePtr>(); | |||
| MS_EXCEPTION_IF_NULL(cnode); | |||
| if (inputs.empty() && outputs.empty()) { | |||
| MS_LOG(ERROR) << "Hccl kernel input or output is empty"; | |||
| return false; | |||
| } | |||
| if (hccl_data_type_list_.empty()) { | |||
| MS_LOG(ERROR) << "Hccl data type list is empty"; | |||
| return false; | |||
| } | |||
| MS_EXCEPTION_IF_NULL(stream_ptr); | |||
| MS_LOG(INFO) << "Start Execute: " << cnode->DebugString(); | |||
| std::string hccl_type = MsOpNameToHcomOpType(AnfAlgo::GetCNodeName(anf_node_.lock())); | |||
| HcclDataType data_type = hccl_data_type_list_[0]; | |||
| ::HcomOperation op_info; | |||
| op_info.hcclType = hccl_type; | |||
| op_info.inputPtr = inputs[0]->addr; | |||
| op_info.outputPtr = outputs[0]->addr; | |||
| op_info.dataType = static_cast<HcclDataType>(data_type); | |||
| op_info.opType = static_cast<HcclReduceOp>(op_type_); | |||
| op_info.root = IntToUint(root_id_); | |||
| op_info.count = hccl_count_; | |||
| auto callback = [this](HcclResult status) { | |||
| if (status != HCCL_SUCCESS) { | |||
| MS_LOG(ERROR) << "HcomExcutorInitialize failed, ret:" << status; | |||
| } | |||
| std::lock_guard<std::mutex> lock(this->hccl_mutex_); | |||
| this->cond_.notify_all(); | |||
| MS_LOG(INFO) << "hccl callback success."; | |||
| }; | |||
| auto hccl_ret = hccl::HcclAdapter::GetInstance().HcclExecEnqueueOp(op_info, callback); | |||
| if (hccl_ret != HCCL_SUCCESS) { | |||
| MS_LOG(EXCEPTION) << "Call EnqueueHcomOperation failed, node info: " << cnode->DebugString(); | |||
| return false; | |||
| } | |||
| std::unique_lock<std::mutex> ulock(hccl_mutex_); | |||
| cond_.wait(ulock); | |||
| MS_LOG(INFO) << "Execute " << cnode->DebugString() << " success"; | |||
| return true; | |||
| } | |||
| void HcclKernel::InitOp() { | |||
| auto node = anf_node_.lock(); | |||
| MS_EXCEPTION_IF_NULL(node); | |||
| if (!node->isa<CNode>()) { | |||
| MS_LOG(EXCEPTION) << "anfnode is not a cnode"; | |||
| } | |||
| auto cnode = node->cast<CNodePtr>(); | |||
| MS_EXCEPTION_IF_NULL(cnode); | |||
| if (!AnfAlgo::IsDynamicShape(cnode)) { | |||
| MS_LOG(DEBUG) << "The node is not dynamic shape: " << cnode->fullname_with_scope(); | |||
| return; | |||
| } | |||
| MS_LOG(INFO) << "Start to InitOp. Node info: " << cnode->DebugString(); | |||
| std::vector<std::vector<size_t>> hccl_kernel_input_shape_list; | |||
| if (!HcomUtil::GetKernelInputShape(cnode, &hccl_kernel_input_shape_list)) { | |||
| MS_LOG(EXCEPTION) << "GetKernelInputShape fail! Node info: " << cnode->DebugString(); | |||
| } | |||
| std::vector<HcclDataType> hccl_data_type_list; | |||
| if (!HcomUtil::GetHcomDataType(cnode, &hccl_data_type_list)) { | |||
| MS_LOG(EXCEPTION) << "GetHcomDataType fail! Node info: " << cnode->DebugString(); | |||
| } | |||
| // Update Hccl count | |||
| if (!HcomUtil::GetHcomCount(cnode, hccl_data_type_list, hccl_kernel_input_shape_list, &hccl_count_)) { | |||
| MS_LOG(EXCEPTION) << "GetHcomCount fail! Node info: " << cnode->DebugString(); | |||
| } | |||
| MS_LOG(INFO) << "Update Hccl count:" << hccl_count_; | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -34,6 +34,7 @@ namespace kernel { | |||
| class HcclKernel : public AscendKernelMod { | |||
| public: | |||
| HcclKernel(); | |||
| explicit HcclKernel(const AnfNodePtr &anf_node); | |||
| ~HcclKernel() override; | |||
| virtual bool Init(const AnfNodePtr &anf_node); | |||
| const std::vector<size_t> &GetInputSizeList() const override; | |||
| @@ -43,6 +44,12 @@ class HcclKernel : public AscendKernelMod { | |||
| const std::vector<AddressPtr> &outputs, uint32_t stream_id) override; | |||
| device::DynamicKernelPtr GenDynamicKernel(const CNodePtr &cnode_ptr, void *stream_ptr) override; | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs, void *stream_ptr) override; | |||
| void InferOp() override; | |||
| void InitOp() override; | |||
| protected: | |||
| std::vector<std::vector<size_t>> hccl_kernel_input_shape_list_; | |||
| std::vector<std::vector<size_t>> hccl_kernel_output_shape_list_; | |||
| @@ -56,9 +63,10 @@ class HcclKernel : public AscendKernelMod { | |||
| mutable std::vector<size_t> input_size_list_; | |||
| mutable std::vector<size_t> output_size_list_; | |||
| mutable std::vector<size_t> workspace_size_list_; | |||
| AnfNodeWeakPtr anf_node_; | |||
| std::string op_name_; | |||
| std::string group_; | |||
| std::mutex hccl_mutex_; | |||
| std::condition_variable cond_; | |||
| }; | |||
| using HcclKernelCreater = std::function<std::shared_ptr<HcclKernel>()>; | |||
| @@ -16,6 +16,7 @@ | |||
| #include "backend/kernel_compiler/host/dynamic_broadcast_gradient_args_kernel.h" | |||
| #include "backend/session/anf_runtime_algorithm.h" | |||
| #include "runtime/device/ascend/ascend_kernel_runtime.h" | |||
| #include "utils/trace_base.h" | |||
| namespace mindspore { | |||
| @@ -195,6 +196,15 @@ void DynamicBroadcastGradientArgsKernel::Execute() { | |||
| input_shapes[1] = GetInputShape(cnode, 1); | |||
| auto grad_reduce_idx = CalculateOutput(input_shapes); | |||
| auto runtime_instance = device::KernelRuntimeManager::Instance().GetCurrentKernelRuntime(); | |||
| MS_EXCEPTION_IF_NULL(runtime_instance); | |||
| // cppcheck-suppress unreadVariable | |||
| auto lock = AscendKernelMod::LockRuntime(); | |||
| auto ret = runtime_instance->SyncStream(); | |||
| if (!ret) { | |||
| MS_LOG(EXCEPTION) << "Sync stream error!"; | |||
| } | |||
| auto r0_size = SetOutputValue(cnode, grad_reduce_idx, 0, input_shapes[0].size()); | |||
| auto r1_size = SetOutputValue(cnode, grad_reduce_idx, 1, input_shapes[1].size()); | |||
| @@ -209,5 +219,26 @@ device::DynamicKernelPtr DynamicBroadcastGradientArgsKernelMod::GenDynamicKernel | |||
| void *stream_ptr) { | |||
| return std::make_shared<DynamicBroadcastGradientArgsKernel>(stream_ptr, cnode_ptr); | |||
| } | |||
| bool DynamicBroadcastGradientArgsKernelMod::Launch(const std::vector<AddressPtr> &, const std::vector<AddressPtr> &, | |||
| const std::vector<AddressPtr> &, void *stream_ptr) { | |||
| auto node = anf_node_.lock(); | |||
| MS_EXCEPTION_IF_NULL(node); | |||
| if (!node->isa<CNode>()) { | |||
| MS_LOG(EXCEPTION) << "anfnode is not a cnode"; | |||
| } | |||
| auto cnode = node->cast<CNodePtr>(); | |||
| MS_EXCEPTION_IF_NULL(cnode); | |||
| stream_ = stream_ptr; | |||
| auto broadcast_grad_kernel = std::make_shared<DynamicBroadcastGradientArgsKernel>(stream_ptr, cnode); | |||
| try { | |||
| broadcast_grad_kernel->Execute(); | |||
| } catch (const std::exception &e) { | |||
| MS_LOG(ERROR) << "DynamicBroadcastGradientArgsKernel Launch failed. node: " << cnode->fullname_with_scope() | |||
| << ", Error message is " << e.what(); | |||
| return false; | |||
| } | |||
| return true; | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -36,6 +36,8 @@ class DynamicBroadcastGradientArgsKernelMod : public HostKernelMod { | |||
| DynamicBroadcastGradientArgsKernelMod() = default; | |||
| ~DynamicBroadcastGradientArgsKernelMod() override = default; | |||
| device::DynamicKernelPtr GenDynamicKernel(const CNodePtr &cnode_ptr, void *stream_ptr) override; | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs, void *stream_ptr) override; | |||
| }; | |||
| MS_HOST_REG_KERNEL(DynamicBroadcastGradientArgs, DynamicBroadcastGradientArgsKernelMod); | |||
| } // namespace kernel | |||
| @@ -114,5 +114,26 @@ void DynamicReshapeKernel::Execute() { | |||
| device::DynamicKernelPtr DynamicReshapeKernelMod::GenDynamicKernel(const CNodePtr &cnode_ptr, void *stream_ptr) { | |||
| return std::make_shared<DynamicReshapeKernel>(stream_ptr, cnode_ptr); | |||
| } | |||
| bool DynamicReshapeKernelMod::Launch(const std::vector<AddressPtr> &, const std::vector<AddressPtr> &, | |||
| const std::vector<AddressPtr> &, void *stream_ptr) { | |||
| auto node = anf_node_.lock(); | |||
| MS_EXCEPTION_IF_NULL(node); | |||
| if (!node->isa<CNode>()) { | |||
| MS_LOG(EXCEPTION) << "anfnode is not a cnode"; | |||
| } | |||
| auto cnode = node->cast<CNodePtr>(); | |||
| MS_EXCEPTION_IF_NULL(cnode); | |||
| stream_ = stream_ptr; | |||
| auto reshape_kernel = std::make_shared<DynamicReshapeKernel>(stream_ptr, cnode); | |||
| try { | |||
| reshape_kernel->Execute(); | |||
| } catch (const std::exception &e) { | |||
| MS_LOG(ERROR) << "DynamicReshapeKernel Launch failed. node: " << cnode->fullname_with_scope() | |||
| << ", Error message is " << e.what(); | |||
| return false; | |||
| } | |||
| return true; | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -35,6 +35,9 @@ class DynamicReshapeKernelMod : public HostKernelMod { | |||
| DynamicReshapeKernelMod() = default; | |||
| ~DynamicReshapeKernelMod() override = default; | |||
| device::DynamicKernelPtr GenDynamicKernel(const CNodePtr &cnode_ptr, void *stream_ptr) override; | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs, void *stream_ptr) override; | |||
| void UpdateOp() override { AscendKernelMod::UpdateOp(); } | |||
| }; | |||
| MS_HOST_REG_KERNEL(DynamicReshape, DynamicReshapeKernelMod); | |||
| } // namespace kernel | |||
| @@ -57,6 +57,8 @@ void DynamicShapeKernel::Execute() { | |||
| } else { | |||
| auto runtime_instance = device::KernelRuntimeManager::Instance().GetCurrentKernelRuntime(); | |||
| MS_EXCEPTION_IF_NULL(runtime_instance); | |||
| // cppcheck-suppress unreadVariable | |||
| auto lock = AscendKernelMod::LockRuntime(); | |||
| auto ret = runtime_instance->SyncStream(); | |||
| if (!ret) { | |||
| MS_LOG(EXCEPTION) << "Sync stream error!"; | |||
| @@ -106,5 +108,23 @@ void DynamicShapeKernel::Execute(const std::vector<AddressPtr> &inputs, const st | |||
// Creates the host-side dynamic kernel object that computes the shape output for this node.
device::DynamicKernelPtr DynamicShapeKernelMod::GenDynamicKernel(const CNodePtr &cnode_ptr, void *stream_ptr) {
  return std::make_shared<DynamicShapeKernel>(stream_ptr, cnode_ptr);
}
| bool DynamicShapeKernelMod::Launch(const std::vector<AddressPtr> &, const std::vector<AddressPtr> &, | |||
| const std::vector<AddressPtr> &, void *stream_ptr) { | |||
| auto node = anf_node_.lock(); | |||
| MS_EXCEPTION_IF_NULL(node); | |||
| auto cnode = node->cast<CNodePtr>(); | |||
| MS_EXCEPTION_IF_NULL(cnode); | |||
| stream_ = stream_ptr; | |||
| auto shape_kernel = std::make_shared<DynamicShapeKernel>(stream_ptr, cnode); | |||
| try { | |||
| shape_kernel->Execute(); | |||
| } catch (const std::exception &e) { | |||
| MS_LOG(ERROR) << "DynamicShapeKernelMod Launch failed. node: " << cnode->fullname_with_scope() | |||
| << ", Error message is " << e.what(); | |||
| return false; | |||
| } | |||
| return true; | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -38,18 +38,7 @@ class DynamicShapeKernelMod : public HostKernelMod { | |||
| ~DynamicShapeKernelMod() override = default; | |||
| device::DynamicKernelPtr GenDynamicKernel(const CNodePtr &cnode_ptr, void *stream_ptr) override; | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs, void *stream_ptr) override { | |||
| if (kernel_ == nullptr) { | |||
| kernel_ = | |||
| std::dynamic_pointer_cast<DynamicShapeKernel>(GenDynamicKernel(anf_node_->cast<CNodePtr>(), stream_ptr)); | |||
| kernel_->Initialize(); | |||
| } | |||
| kernel_->Execute(inputs, outputs); | |||
| return true; | |||
| } | |||
| private: | |||
| std::shared_ptr<DynamicShapeKernel> kernel_; | |||
| const std::vector<AddressPtr> &outputs, void *stream_ptr) override; | |||
| }; | |||
| MS_HOST_REG_KERNEL(DynamicShape, DynamicShapeKernelMod); | |||
| } // namespace kernel | |||
| @@ -77,6 +77,16 @@ bool HostKernelMod::Launch(const std::vector<AddressPtr> &, const std::vector<Ad | |||
| const std::vector<AddressPtr> &, void *) { | |||
| return true; | |||
| } | |||
// Re-infers the bound node's output shape/type before launch.
// Host kernel mods are only meaningful for dynamic-shape nodes, so a
// static-shape node reaching here indicates a scheduling error.
void HostKernelMod::InferOp() {
  auto node = anf_node_.lock();
  MS_EXCEPTION_IF_NULL(node);
  if (!AnfAlgo::IsDynamicShape(node)) {
    MS_LOG(EXCEPTION) << "The node is not dynamic shape.";
  }
  KernelMod::InferShape();
}
| std::vector<TaskInfoPtr> HostKernelMod::GenTask(const std::vector<AddressPtr> &, const std::vector<AddressPtr> &, | |||
| const std::vector<AddressPtr> &, uint32_t) { | |||
| return {}; | |||
| @@ -36,9 +36,10 @@ class HostKernelMod : public AscendKernelMod { | |||
| const std::vector<AddressPtr> &, uint32_t) override; | |||
| device::DynamicKernelPtr GenDynamicKernel(const CNodePtr &cnode_ptr, void *stream_ptr) override = 0; | |||
| bool Init(const AnfNodePtr &anf_node); | |||
| void InferOp() override; | |||
| void UpdateOp() override {} | |||
| protected: | |||
| AnfNodePtr anf_node_; | |||
| std::string op_name_; | |||
| std::vector<size_t> input_size_list_; | |||
| std::vector<size_t> output_size_list_; | |||
| @@ -0,0 +1,184 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/kernel.h" | |||
| #include <algorithm> | |||
| #include <stack> | |||
| #include <utility> | |||
| #include "utils/ms_context.h" | |||
| #include "utils/anf_utils.h" | |||
| #include "utils/ms_device_shape_transfer.h" | |||
| #include "backend/session/anf_runtime_algorithm.h" | |||
| #include "backend/optimizer/common/helper.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| constexpr int64_t kInvalidShape = -2; | |||
| void KernelMod::InferShape() { | |||
| auto node = anf_node_.lock(); | |||
| MS_EXCEPTION_IF_NULL(node); | |||
| if (!node->isa<CNode>()) { | |||
| MS_LOG(EXCEPTION) << "anfnode is not a cnode"; | |||
| } | |||
| auto cnode = node->cast<CNodePtr>(); | |||
| MS_EXCEPTION_IF_NULL(cnode); | |||
| MS_LOG(INFO) << "InferShape start, node:" << cnode->fullname_with_scope(); | |||
| GetDepndLists(cnode); | |||
| auto ret = InferShapeForDefiniteOutputNode(cnode); | |||
| if (ret) { | |||
| return; | |||
| } | |||
| depend_tensor_map_.clear(); | |||
| auto inputs = cnode->inputs(); | |||
| if (inputs.empty()) { | |||
| MS_LOG(EXCEPTION) << "Invalid inputs"; | |||
| } | |||
| auto context = MsContext::GetInstance(); | |||
| MS_EXCEPTION_IF_NULL(context); | |||
| AbstractBasePtrList args_spec_list; | |||
| auto primitive = GetValueNode<PrimitivePtr>(inputs[0]); | |||
| auto input_size = AnfAlgo::GetInputTensorNum(cnode); | |||
| std::vector<AnfNodePtr> input_nodes; | |||
| for (size_t i = 0; i < input_size; i++) { | |||
| auto input_node_with_index = AnfAlgo::GetPrevNodeOutput(cnode, i); | |||
| auto real_input = input_node_with_index.first; | |||
| MS_EXCEPTION_IF_NULL(real_input); | |||
| auto cnode_input = cnode->input(i + 1); | |||
| MS_EXCEPTION_IF_NULL(cnode_input); | |||
| InferShapeForNopNode(&real_input); | |||
| if (depend_list_.find(i) != depend_list_.end()) { | |||
| auto pre_node_with_index = AnfAlgo::GetPrevNodeOutput(cnode, i); | |||
| bool skip_nop_node = !context->get_param<bool>(MS_CTX_ENABLE_MINDRT); | |||
| auto output_addr = AnfAlgo::GetPrevNodeMutableOutputAddr(cnode, i, skip_nop_node); | |||
| std::vector<int64_t> shapes = | |||
| trans::GetRuntimePaddingShape(pre_node_with_index.first, pre_node_with_index.second); | |||
| auto host_type = AnfAlgo::GetOutputInferDataType(pre_node_with_index.first, pre_node_with_index.second); | |||
| auto out_tensor = std::make_shared<tensor::Tensor>(host_type, shapes); | |||
| MS_EXCEPTION_IF_NULL(out_tensor); | |||
| // The second parameter must be false, otherwise the device address cannot be released and allocated, and the | |||
| // address size will be wrong in the dynamic shape scenario. | |||
| out_tensor->set_device_address(output_addr, false); | |||
| auto ret2 = depend_tensor_map_.try_emplace(i, out_tensor); | |||
| if (!ret2.second) { | |||
| MS_LOG(EXCEPTION) << "Insert map failed"; | |||
| } | |||
| out_tensor->data_sync(); | |||
| auto lock = AnfUtils::GetAbstractLock(real_input.get()); | |||
| MS_EXCEPTION_IF_NULL(real_input->abstract()); | |||
| auto real_abs = real_input->abstract()->Clone(); | |||
| if (real_abs->isa<abstract::AbstractTensor>()) { | |||
| real_abs->set_value(out_tensor); | |||
| } else if (real_abs->isa<abstract::AbstractTuple>()) { | |||
| auto tuple_get_item_index = AnfAlgo::GetTupleGetItemOutIndex(cnode_input->cast<CNodePtr>()); | |||
| auto abstract_tuple = real_abs->cast<abstract::AbstractTuplePtr>(); | |||
| MS_EXCEPTION_IF_NULL(abstract_tuple); | |||
| auto tuple_elements = abstract_tuple->elements()[tuple_get_item_index]; | |||
| tuple_elements->set_value(out_tensor); | |||
| } | |||
| real_input->set_abstract(real_abs); | |||
| } | |||
| bool is_cnode_input = AnfAlgo::AddArgList(&args_spec_list, cnode_input, real_input, i); | |||
| if (is_cnode_input) { | |||
| input_nodes.push_back(cnode_input); | |||
| } else { | |||
| input_nodes.push_back(real_input); | |||
| } | |||
| } | |||
| std::vector<AbstractScope> locks; | |||
| std::transform(input_nodes.begin(), input_nodes.end(), std::back_inserter(locks), | |||
| [](const AnfNodePtr &input) { return AnfUtils::GetAbstractLock(input.get()); }); | |||
| auto eval_result = opt::CppInferShape(primitive, args_spec_list); | |||
| locks.clear(); | |||
| // cppcheck-suppress unreadVariable | |||
| auto lock = AnfUtils::GetAbstractLock(cnode.get()); | |||
| cnode->set_abstract(eval_result); | |||
| } | |||
| bool KernelMod::InferShapeForDefiniteOutputNode(const CNodePtr &cnode) { | |||
| MS_EXCEPTION_IF_NULL(cnode); | |||
| if (!AnfAlgo::CheckPrimitiveType(cnode, prim::kPrimShape)) { | |||
| return false; | |||
| } | |||
| auto input_size = AnfAlgo::GetInputTensorNum(cnode); | |||
| if (input_size != 1) { | |||
| MS_LOG(EXCEPTION) << "Node only has one input: " << cnode->fullname_with_scope(); | |||
| } | |||
| auto cur_shape = dynamic_cast<mindspore::abstract::Shape *>(cnode->Shape().get())->shape(); | |||
| if (std::any_of(cur_shape.begin(), cur_shape.end(), [](int64_t x) { return x == kInvalidShape; })) { | |||
| return false; | |||
| } | |||
| std::vector<int64_t> output_shape = {static_cast<int64_t>(cur_shape.size())}; | |||
| mindspore::abstract::BaseShapePtr shape = std::make_shared<mindspore::abstract::Shape>(output_shape); | |||
| auto lock = AnfUtils::GetAbstractLock(cnode.get()); | |||
| auto abstract = cnode->abstract()->Clone(); | |||
| MS_EXCEPTION_IF_NULL(abstract); | |||
| abstract->set_shape(shape); | |||
| cnode->set_abstract(abstract); | |||
| return true; | |||
| } | |||
// Walks backwards through a chain of nop nodes feeding *input_node and re-infers
// their shapes from the producer side down, so the nop chain's abstracts are up
// to date before the consumer's own inference runs.
// Side effect: *input_node is advanced to the first non-nop producer.
void KernelMod::InferShapeForNopNode(AnfNodePtr *input_node) {
  MS_EXCEPTION_IF_NULL(*input_node);
  if (!opt::IsNopNode(*input_node) || !AnfAlgo::IsDynamicShape(*input_node)) {
    MS_LOG(INFO) << "Input node is not a nop node, no need infer.";
    return;
  }
  MS_LOG(INFO) << "Infer shape for nop node.";
  // Collect the nop chain on a stack so inference below runs producer-first.
  std::stack<AnfNodePtr> nop_road;
  nop_road.push(*input_node);
  /*lint -e716*/
  while (true) {
    auto input_node_with_idx = AnfAlgo::GetPrevNodeOutput(*input_node, 0);
    auto in_node = input_node_with_idx.first;
    MS_EXCEPTION_IF_NULL(in_node);
    if (opt::IsNopNode(in_node)) {
      nop_road.push(in_node);
      *input_node = in_node;
    } else {
      break;
    }
  }
  /*lint +e716*/
  // Popping the stack visits the chain from the deepest producer outward.
  while (!nop_road.empty()) {
    auto nop_node = nop_road.top();
    MS_EXCEPTION_IF_NULL(nop_node);
    AnfAlgo::InferShape(nop_node->cast<CNodePtr>());
    nop_road.pop();
  }
}
| void KernelMod::GetDepndLists(const CNodePtr &cnode) { | |||
| MS_EXCEPTION_IF_NULL(cnode); | |||
| if (depend_list_.size() != 0) { | |||
| return; | |||
| } | |||
| auto ret = abstract::GetDependsFormMap(cnode); | |||
| if (ret.empty()) { | |||
| MS_LOG(DEBUG) << "No dynamic_shape_depends found"; | |||
| return; | |||
| } | |||
| MS_LOG(INFO) << "Have depends"; | |||
| (void)std::transform(ret.begin(), ret.end(), std::inserter(depend_list_, depend_list_.begin()), | |||
| [](const int64_t &value) { return static_cast<int>(value); }); | |||
| MS_LOG(INFO) << "Init End"; | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -18,6 +18,8 @@ | |||
| #include <vector> | |||
| #include <string> | |||
| #include <memory> | |||
| #include <map> | |||
| #include <set> | |||
| #include "nlohmann/json.hpp" | |||
| #include "ir/anf.h" | |||
| #include "ir/dtype.h" | |||
| @@ -180,6 +182,8 @@ struct KernelLaunchInfo { | |||
| class KernelMod { | |||
| public: | |||
| KernelMod() {} | |||
| explicit KernelMod(const AnfNodePtr &anf_node_ptr) : anf_node_(anf_node_ptr) {} | |||
| virtual const std::vector<size_t> &GetInputSizeList() const = 0; | |||
| virtual const std::vector<size_t> &GetOutputSizeList() const = 0; | |||
| virtual const std::vector<size_t> &GetWorkspaceSizeList() const = 0; | |||
| @@ -193,6 +197,10 @@ class KernelMod { | |||
| virtual std::vector<size_t> GenParameters() { return {}; } | |||
| virtual void ReleaseResource() {} | |||
| virtual void InferOp() {} | |||
| virtual void InitOp() {} | |||
| virtual void UpdateOp() {} | |||
| virtual ~KernelMod() = default; | |||
| void set_unique_name(const std::string &unique_name) { unique_name_ = unique_name; } | |||
| void set_fullname(const std::string &fullname) { fullname_ = fullname; } | |||
| @@ -205,18 +213,29 @@ class KernelMod { | |||
| const std::vector<AddressPtr> &GetOutputsAddr() { return outputs_addr_; } | |||
| void SetStream(void *stream) { stream_ = stream; } | |||
| void *GetStream() const { return stream_; } | |||
| void SetAtomicCleanNodes(const std::vector<CNodePtr> &atomic_clean_node) { atomic_clean_nodes_ = atomic_clean_node; } | |||
| protected: | |||
| void InferShape(); | |||
| std::string kernel_name_; | |||
| std::string unique_name_; | |||
| std::string fullname_; | |||
| bool is_monad_{false}; | |||
| void *stream_{nullptr}; | |||
| AnfNodeWeakPtr anf_node_; | |||
| std::map<uint32_t, tensor::TensorPtr> depend_tensor_map_; | |||
| std::vector<CNodePtr> atomic_clean_nodes_; | |||
| private: | |||
| void InferShapeForNopNode(AnfNodePtr *input_node); | |||
| void GetDepndLists(const CNodePtr &cnode); | |||
| bool InferShapeForDefiniteOutputNode(const CNodePtr &cnode); | |||
| std::vector<AddressPtr> inputs_addr_; | |||
| std::vector<AddressPtr> workspaces_addr_; | |||
| std::vector<AddressPtr> outputs_addr_; | |||
| std::set<uint32_t> depend_list_; | |||
| }; | |||
| using KernelModPtr = std::shared_ptr<KernelMod>; | |||
| } // namespace kernel | |||
| @@ -0,0 +1,298 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/tbe/dynamic_tbe_kernel_mod.h" | |||
| #include <algorithm> | |||
| #include <stack> | |||
| #include "acl/acl_rt.h" | |||
| #include "utils/ms_context.h" | |||
| #include "backend/session/anf_runtime_algorithm.h" | |||
| #include "runtime/device/kernel_runtime.h" | |||
| #include "backend/optimizer/common/helper.h" | |||
| #include "framework/common/debug/log.h" | |||
| #include "utils/log_adapter.h" | |||
| #include "utils/convert_utils_base.h" | |||
| #include "runtime/device/kernel_runtime_manager.h" | |||
| #include "runtime/kernel.h" | |||
| #include "runtime/mem.h" | |||
| #include "pipeline/jit/static_analysis/static_analysis.h" | |||
| #include "runtime/device/ascend/executor/tiling/op_tiling_adapter.h" | |||
| #include "utils/ms_device_shape_transfer.h" | |||
| #include "utils/utils.h" | |||
| #include "register/op_tiling.h" | |||
| #include "nlohmann/json.hpp" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| using TbeTaskInfoPtr = std::shared_ptr<mindspore::ge::model_runner::TbeTaskInfo>; | |||
| using tbe::KernelManager; | |||
| using AddressPtrList = std::vector<mindspore::kernel::AddressPtr>; | |||
// Binds the kernel mod to anf_node_ptr and eagerly loads the op's compile info
// from the kernel_meta json; InitOp() needs it later for tiling calculation.
DynamicTbeKernelMod::DynamicTbeKernelMod(KernelPackPtr kernel_pack, const AnfNodePtr &anf_node_ptr)
    : TbeKernelMod(std::move(kernel_pack), anf_node_ptr) {
  MS_EXCEPTION_IF_NULL(anf_node_ptr);
  // Non-CNode (e.g. Parameter) bindings simply have no compile info to parse.
  auto cnode = anf_node_ptr->cast<CNodePtr>();
  if (cnode != nullptr) {
    op_compile_info_ = ParseCompileJson(cnode);
  }
}
// Frees the device-side tiling buffer lazily allocated by InitTilingDataPtr().
DynamicTbeKernelMod::~DynamicTbeKernelMod() {
  if (tiling_data_ptr_ != nullptr) {
    (void)rtFree(tiling_data_ptr_);
  }
}
| void DynamicTbeKernelMod::InferOp() { | |||
| if (AnfAlgo::IsDynamicShape(anf_node_.lock())) { | |||
| auto node = anf_node_.lock(); | |||
| MS_EXCEPTION_IF_NULL(node); | |||
| auto cnode = node->cast<CNodePtr>(); | |||
| MS_EXCEPTION_IF_NULL(cnode); | |||
| need_skip_execute_ = NeedSkipExecute(cnode); | |||
| if (need_skip_execute_) { | |||
| std::vector<TypeId> dtypes{AnfAlgo::GetOutputInferDataType(cnode, 0)}; | |||
| AnfAlgo::SetOutputInferTypeAndShape(dtypes, {AnfAlgo::GetInputDeviceShape(cnode, 0)}, cnode.get()); | |||
| } else { | |||
| KernelMod::InferShape(); | |||
| } | |||
| } | |||
| } | |||
| void DynamicTbeKernelMod::InitOp() { | |||
| auto node = anf_node_.lock(); | |||
| MS_EXCEPTION_IF_NULL(node); | |||
| auto cnode = node->cast<CNodePtr>(); | |||
| MS_EXCEPTION_IF_NULL(cnode); | |||
| if (!AnfAlgo::IsDynamicShape(cnode)) { | |||
| MS_LOG(EXCEPTION) << "The node is not dynamic shape: " << cnode->fullname_with_scope(); | |||
| } | |||
| if (!atomic_clean_nodes_.empty()) { | |||
| for (const auto &atomic_clean_node : atomic_clean_nodes_) { | |||
| AnfAlgo::GetKernelMod(atomic_clean_node)->InitOp(); | |||
| } | |||
| } | |||
| if (need_skip_execute_) { | |||
| return; | |||
| } | |||
| // gen FuncStub | |||
| if (handle_ == nullptr) { | |||
| auto func_stub = KernelManager::GenFuncStub(*kernel_pack_, false, &block_dim_, true, &handle_, &origin_key_); | |||
| if (func_stub != 1) { | |||
| MS_LOG(EXCEPTION) << "GenFuncStub failed."; | |||
| } | |||
| } | |||
| // start compute tiling | |||
| MS_LOG(INFO) << "Start compute tiling of: " << cnode->fullname_with_scope(); | |||
| optiling::utils::OpRunInfo op_run_info_v2(-1, true, 0); | |||
| device::tiling::OpTilingCalculateAdapter converter; | |||
| ::ge::ComputeGraphPtr ge_graph = std::make_shared<::ge::ComputeGraph>("default"); | |||
| auto ge_node = converter.AnfNodeToGeNodeAdapter(cnode, &ge_graph, depend_tensor_map_, op_compile_info_); | |||
| (void)optiling::OpParaCalculateV2(ge_node, op_run_info_v2); | |||
| block_dim_ = op_run_info_v2.GetBlockDim(); | |||
| std::vector<int64_t> workspace_size_list; | |||
| op_run_info_v2.GetAllWorkspaces(workspace_size_list); | |||
| tiling_data_ = op_run_info_v2.GetAllTilingData().str(); | |||
| tiling_key_ = op_run_info_v2.GetTilingKey(); | |||
| workspace_size_list_.clear(); | |||
| workspace_size_list_.resize(workspace_size_list.size()); | |||
| std::transform(workspace_size_list.begin(), workspace_size_list.end(), workspace_size_list_.begin(), | |||
| [](int64_t size) { return static_cast<size_t>(size); }); | |||
| } | |||
// Loads the op's compile info string from the kernel_meta json produced at
// build time. Throws if the entry is missing (e.g. the file was deleted).
std::string DynamicTbeKernelMod::ParseCompileJson(const CNodePtr &cnode) {
  MS_EXCEPTION_IF_NULL(cnode);
  bool get_flag = true;
  std::string op_compile_info = "";
  TbeUtils::GetCompileInfo(cnode, &op_compile_info, &get_flag);
  if (!get_flag) {
    MS_LOG(EXCEPTION) << "Get compile_info failed. The compile result of [" << cnode->fullname_with_scope()
                      << "] maybe not in the json file(kernel_meta/) or the file had been deleted.";
  }
  MS_LOG(INFO) << "Node: " << cnode->fullname_with_scope() << " get compile_info: " << op_compile_info;
  return op_compile_info;
}
| void DynamicTbeKernelMod::InitTilingDataPtr() { | |||
| if (tiling_data_ptr_ != nullptr) { | |||
| return; | |||
| } | |||
| auto kernel_json_info = kernel_pack_->kernel_json_info(); | |||
| auto op_para_size = kernel_json_info.op_para_size; | |||
| if (op_para_size > 0) { | |||
| auto ret = rtMalloc(&tiling_data_ptr_, op_para_size, RT_MEMORY_HBM); | |||
| if (ret != RT_ERROR_NONE) { | |||
| MS_LOG(EXCEPTION) << "rtMalloc tiling data failed"; | |||
| } | |||
| } | |||
| } | |||
// Asynchronously copies the tiling data computed in InitOp() into the device
// buffer on the given stream. Returns true when nothing needs copying
// (empty tiling); throws if the data exceeds the buffer or the copy fails.
bool DynamicTbeKernelMod::CopyTilingToDevice(void *stream_ptr) {
  InitTilingDataPtr();
  MS_EXCEPTION_IF_NULL(kernel_pack_);
  auto kernel_json_info = kernel_pack_->kernel_json_info();
  auto op_para_size = kernel_json_info.op_para_size;
  // The device buffer was sized with op_para_size; overflow means the tiling
  // function produced more data than the kernel was built to accept.
  if (tiling_data_.size() > op_para_size) {
    MS_LOG(EXCEPTION) << "Compute tiling size:" << tiling_data_.size()
                      << " larger than tbe build op_para_size:" << op_para_size;
  }
  if (tiling_data_.empty() || tiling_data_ptr_ == nullptr) {
    MS_LOG(INFO) << "Tiling size is 0, skip aclrtMemcpyAsync";
    return true;
  }
  // Serialize runtime access with other kernel mods for the async copy.
  // cppcheck-suppress unreadVariable
  auto lock = AscendKernelMod::LockRuntime();
  auto ret = aclrtMemcpyAsync(tiling_data_ptr_, op_para_size, tiling_data_.c_str(), tiling_data_.size(),
                              ACL_MEMCPY_HOST_TO_DEVICE, stream_ptr);
  if (ret != RT_ERROR_NONE) {
    MS_LOG(EXCEPTION) << "Tiling aclrtMemcpyAsync failed, ret:" << ret;
  }
  return true;
}
// Skip run ReduceSum when axis is a Empty Tensor
// Returns true only for a ReduceSum whose axis input is a tensor with a
// zero-sized dimension; Launch() then degenerates to a device-to-device copy.
bool DynamicTbeKernelMod::NeedSkipExecute(const CNodePtr &cnode) {
  MS_EXCEPTION_IF_NULL(cnode);
  auto op_name = AnfAlgo::GetCNodeName(cnode);
  if (op_name != kReduceSumOpName) {
    return false;
  }
  // Input 1 of ReduceSum is the axis; +1 skips the primitive at input(0).
  const size_t axes_index = 1;
  if (cnode->inputs().size() <= axes_index + 1) {
    return false;
  }
  auto input_axes = cnode->input(axes_index + 1);
  // Take the abstract lock and work on a clone so concurrent inference on the
  // axis node cannot race with this read.
  // cppcheck-suppress unreadVariable
  auto lock = AnfUtils::GetAbstractLock(input_axes.get());
  auto axes_abs = input_axes->abstract()->Clone();
  MS_EXCEPTION_IF_NULL(axes_abs);
  auto axes_shape = AnfAlgo::GetInputDeviceShape(cnode, axes_index);
  if (axes_abs->isa<abstract::AbstractTensor>()) {
    // Any zero dimension means the axis tensor holds no elements.
    if (std::any_of(axes_shape.begin(), axes_shape.end(), [](ssize_t shape) { return shape == 0; })) {
      return true;
    }
  }
  return false;
}
| bool DynamicTbeKernelMod::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs, void *stream_ptr) { | |||
| if (stream_ptr == nullptr) { | |||
| MS_LOG(ERROR) << "stream_ptr should not be nullptr."; | |||
| return false; | |||
| } | |||
| if (kernel_pack_ == nullptr) { | |||
| MS_LOG(ERROR) << "kernel pack should not be nullptr."; | |||
| return false; | |||
| } | |||
| if (stream_ == nullptr) { | |||
| stream_ = stream_ptr; | |||
| } | |||
| auto node = anf_node_.lock(); | |||
| MS_EXCEPTION_IF_NULL(node); | |||
| if (!node->isa<CNode>()) { | |||
| MS_LOG(EXCEPTION) << "anfnode is not a cnode"; | |||
| } | |||
| auto cnode = node->cast<CNodePtr>(); | |||
| MS_EXCEPTION_IF_NULL(cnode); | |||
| // is dynamic shape | |||
| if (!AnfAlgo::IsDynamicShape(cnode)) { | |||
| MS_LOG(EXCEPTION) << "The cnode is not dynamic shape:" << cnode->fullname_with_scope(); | |||
| } | |||
| if (!atomic_clean_nodes_.empty()) { | |||
| for (auto atomic_clean_node : atomic_clean_nodes_) { | |||
| KernelLaunchInfo kernel_launch_info; | |||
| auto kernel_mod = AnfAlgo::GetKernelMod(atomic_clean_node); | |||
| MS_EXCEPTION_IF_NULL(kernel_mod); | |||
| device::KernelRuntime::GenLaunchArgs(*kernel_mod, atomic_clean_node, &kernel_launch_info); | |||
| auto atomic_inputs = kernel_launch_info.inputs_; | |||
| std::vector<AddressPtr> atomic_outputs; | |||
| std::vector<AddressPtr> atomic_workspace; | |||
| kernel_mod->Launch(atomic_inputs, atomic_workspace, atomic_outputs, stream_ptr); | |||
| } | |||
| } | |||
| // need skip, for reducesum empty input axis | |||
| if (need_skip_execute_) { | |||
| // Skip reduce if axis is a empty Tensor (shape = 0) | |||
| MS_LOG(INFO) << "The node " << cnode->fullname_with_scope() << "Need Skip."; | |||
| // cppcheck-suppress unreadVariable | |||
| auto lock = AscendKernelMod::LockRuntime(); | |||
| rtError_t status = aclrtMemcpyAsync(outputs[0]->addr, inputs[0]->size, inputs[0]->addr, inputs[0]->size, | |||
| ACL_MEMCPY_DEVICE_TO_DEVICE, stream_ptr); | |||
| if (status != RT_ERROR_NONE) { | |||
| MS_LOG(EXCEPTION) << "aclrtMemcpyAsync failed for " << cnode->fullname_with_scope(); | |||
| } | |||
| MS_LOG(INFO) << "Execute node:" << cnode->fullname_with_scope() << " success."; | |||
| return true; | |||
| } | |||
| // copy tiling to device | |||
| if (!CopyTilingToDevice(stream_ptr)) { | |||
| MS_LOG(EXCEPTION) << "Copy tiling to device failed. op name: " << cnode->fullname_with_scope(); | |||
| } | |||
| // pack all addresses into a vector. | |||
| std::vector<void *> runtimeargs; | |||
| (void)std::transform(std::begin(inputs), std::end(inputs), std::back_inserter(runtimeargs), | |||
| [](const AddressPtr &input) -> void * { return input->addr; }); | |||
| (void)std::transform(std::begin(outputs), std::end(outputs), std::back_inserter(runtimeargs), | |||
| [](const AddressPtr &output) -> void * { return output->addr; }); | |||
| if (!workspace.empty()) { | |||
| (void)std::transform(std::begin(workspace), std::end(workspace), std::back_inserter(runtimeargs), | |||
| [](const AddressPtr &addr) -> void * { return addr->addr; }); | |||
| } | |||
| if (!tiling_data_.empty() && tiling_data_ptr_ != nullptr) { | |||
| runtimeargs.push_back(tiling_data_ptr_); | |||
| } | |||
| rtL2Ctrl_t *l2ctrl = nullptr; | |||
| auto args_size = static_cast<uint32_t>(UlongToUint(sizeof(void *)) * runtimeargs.size()); | |||
| auto node_info = cnode->fullname_with_scope(); | |||
| const auto dev_func = | |||
| origin_key_.find("kernel0") != origin_key_.npos ? origin_key_ : origin_key_ + "_" + std::to_string(tiling_key_); | |||
| const auto kernel_info = node_info + "/" + std::to_string(tiling_key_); | |||
| // cppcheck-suppress unreadVariable | |||
| auto lock = AscendKernelMod::LockRuntime(); | |||
| auto ret = rtKernelLaunchWithHandle(handle_, dev_func.c_str(), block_dim_, runtimeargs.data(), args_size, l2ctrl, | |||
| stream_ptr, kernel_info.c_str()); | |||
| if (ret != RT_ERROR_NONE) { | |||
| MS_LOG(ERROR) << "Call runtime rtKernelLaunchWithHandle error. Node info: " << node_info; | |||
| return false; | |||
| } | |||
| return true; | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -0,0 +1,65 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_TBE_DYNAMIC_TBE_KERNEL_MOD_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_TBE_DYNAMIC_TBE_KERNEL_MOD_H_ | |||
| #include <memory> | |||
| #include <string> | |||
| #include <vector> | |||
| #include <utility> | |||
| #include <map> | |||
| #include "backend/kernel_compiler/tbe/tbe_kernel_mod.h" | |||
| #include "backend/kernel_compiler/tbe/tbe_utils.h" | |||
| #include "runtime/device/device_address.h" | |||
| #include "ir/tensor.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
// Kernel mod for dynamic-shape TBE (Ascend) kernels. Per launch it re-infers
// shapes (InferOp), recomputes tiling (InitOp), copies the tiling data to
// device and launches the compiled kernel by handle (Launch).
class DynamicTbeKernelMod : public TbeKernelMod {
 public:
  explicit DynamicTbeKernelMod(KernelPackPtr kernel_pack) : TbeKernelMod(kernel_pack) {}  // maybe delete later
  DynamicTbeKernelMod(KernelPackPtr kernel_pack, const AnfNodePtr &anf_node_ptr);
  ~DynamicTbeKernelMod() override;

  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs, void *stream_ptr) override;

  void InferOp() override;
  void InitOp() override;

 private:
  void InferShapeRecursive();
  void InferShapeForNopNode(AnfNodePtr *input_node);
  // Reads the op's compile info from the kernel_meta json; throws if absent.
  std::string ParseCompileJson(const CNodePtr &cnode);
  // Lazily allocates the device tiling buffer (freed in the destructor).
  void InitTilingDataPtr();
  // Async-copies tiling_data_ into tiling_data_ptr_ on the given stream.
  bool CopyTilingToDevice(void *stream_ptr);
  // True for ReduceSum with an empty axis tensor; Launch becomes a plain copy.
  bool NeedSkipExecute(const CNodePtr &cnode);

  uint32_t block_dim_ = 1;          // launch block dim from GenFuncStub/tiling
  std::string tiling_data_;         // host-side tiling bytes from InitOp()
  void *tiling_data_ptr_ = nullptr; // device buffer for tiling data
  uint32_t tiling_key_{0};          // selects the kernel variant at launch
  void *handle_ = nullptr;          // kernel binary handle, resolved once
  std::string origin_key_{""};      // base function name from GenFuncStub
  std::string op_compile_info_{};   // compile info parsed in the constructor
  bool need_skip_execute_ = false;  // set by InferOp(), consumed by InitOp/Launch
};
using DynamicTbeKernelModPtr = std::shared_ptr<DynamicTbeKernelMod>;
| using DynamicTbeKernelModPtr = std::shared_ptr<DynamicTbeKernelMod>; | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_TBE_DYNAMIC_TBE_KERNEL_MOD_H_
| @@ -15,6 +15,8 @@ | |||
| */ | |||
| #include "backend/kernel_compiler/tbe/tbe_kernel_mod.h" | |||
| #include <algorithm> | |||
| #include "runtime/rt.h" | |||
| #include "utils/ms_context.h" | |||
| #include "runtime/device/ascend/ge_runtime/task_info.h" | |||
| @@ -41,6 +43,20 @@ bool TbeKernelMod::Launch(const std::vector<mindspore::kernel::AddressPtr> &inpu | |||
| if (stream_ == nullptr) { | |||
| stream_ = stream_ptr; | |||
| } | |||
| // launch atomic_cleans first | |||
| if (!atomic_clean_nodes_.empty()) { | |||
| for (const auto &atomic_clean_node : atomic_clean_nodes_) { | |||
| KernelLaunchInfo kernel_launch_info; | |||
| auto kernel_mod = AnfAlgo::GetKernelMod(atomic_clean_node); | |||
| MS_EXCEPTION_IF_NULL(kernel_mod); | |||
| device::KernelRuntime::GenLaunchArgs(*kernel_mod, atomic_clean_node, &kernel_launch_info); | |||
| auto atomic_inputs = kernel_launch_info.inputs_; | |||
| std::vector<AddressPtr> atomic_outputs; | |||
| std::vector<AddressPtr> atomic_workspace; | |||
| kernel_mod->Launch(atomic_inputs, atomic_workspace, atomic_outputs, stream_ptr); | |||
| } | |||
| } | |||
| uint32_t blockdim = 1; // default blockdim equal to 1. | |||
| auto func_stub = KernelManager::GenFuncStub(*kernel_pack_, false, &blockdim); | |||
| if (func_stub == 0) { | |||
| @@ -61,6 +77,7 @@ bool TbeKernelMod::Launch(const std::vector<mindspore::kernel::AddressPtr> &inpu | |||
| rtL2Ctrl_t *l2ctrl = nullptr; | |||
| const void *stubFunc = reinterpret_cast<void *>(func_stub); | |||
| auto argsSize = static_cast<uint32_t>(UlongToUint(sizeof(void *)) * runtimeargs.size()); | |||
| auto lock = AscendKernelMod::LockRuntime(); | |||
| auto ret = rtKernelLaunch(stubFunc, blockdim, runtimeargs.data(), argsSize, l2ctrl, stream_); | |||
| if (ret != RT_ERROR_NONE) { | |||
| MS_LOG(ERROR) << "Call runtime rtKernelLaunch error."; | |||
| @@ -29,6 +29,8 @@ namespace kernel { | |||
| class TbeKernelMod : public AscendKernelMod { | |||
| public: | |||
| explicit TbeKernelMod(KernelPackPtr kernel_pack) : kernel_pack_(std::move(kernel_pack)) {} | |||
| TbeKernelMod(KernelPackPtr kernel_pack, const AnfNodePtr &anf_node_ptr) | |||
| : AscendKernelMod(anf_node_ptr), kernel_pack_(std::move(kernel_pack)) {} | |||
| ~TbeKernelMod() override = default; | |||
| void SetInputSizeList(const std::vector<size_t> &size_list) { input_size_list_ = size_list; } | |||
| @@ -45,7 +47,7 @@ class TbeKernelMod : public AscendKernelMod { | |||
| device::DynamicKernelPtr GenDynamicKernel(const CNodePtr &cnode_ptr, void *stream_ptr) override; | |||
| std::vector<size_t> GenParameters() override; | |||
| private: | |||
| protected: | |||
| KernelPackPtr kernel_pack_; | |||
| std::vector<size_t> input_size_list_; | |||
| std::vector<size_t> output_size_list_; | |||
| @@ -729,9 +729,8 @@ KernelWithIndex AnfRuntimeAlgorithm::GetPrevNodeOutput(const AnfNodePtr &anf_nod | |||
| auto kernel_info = anf_node->kernel_info(); | |||
| if (kernel_info) { | |||
| auto runtime_cache = kernel_info->runtime_cache(); | |||
| MS_EXCEPTION_IF_NULL(runtime_cache); | |||
| if (runtime_cache->is_valid()) { | |||
| auto output = runtime_cache->get_prev_node_output(input_idx); | |||
| if (runtime_cache.runtime_cache().is_valid()) { | |||
| auto output = runtime_cache.runtime_cache().get_prev_node_output(input_idx); | |||
| if (output.first != nullptr) { | |||
| return output; | |||
| } | |||
| @@ -747,9 +746,8 @@ KernelWithIndex AnfRuntimeAlgorithm::GetPrevNodeOutput(const AnfNodePtr &anf_nod | |||
| } | |||
| if (kernel_info) { | |||
| auto runtime_cache = kernel_info->runtime_cache(); | |||
| MS_EXCEPTION_IF_NULL(runtime_cache); | |||
| if (runtime_cache->is_valid()) { | |||
| runtime_cache->set_prev_node_output(input_idx, res); | |||
| if (runtime_cache.runtime_cache().is_valid()) { | |||
| runtime_cache.runtime_cache().set_prev_node_output(input_idx, res); | |||
| } | |||
| } | |||
| return res; | |||
| @@ -2065,7 +2063,7 @@ std::vector<int64_t> AnfRuntimeAlgorithm::GetOutputMinShape(const AnfNodePtr &an | |||
| } | |||
| } | |||
| bool IsNodeInputDynamicShape(const CNodePtr &anf_node_ptr) { | |||
| bool AnfRuntimeAlgorithm::IsNodeInputDynamicShape(const CNodePtr &anf_node_ptr) { | |||
| MS_EXCEPTION_IF_NULL(anf_node_ptr); | |||
| auto input_num = AnfAlgo::GetInputTensorNum(anf_node_ptr); | |||
| for (size_t i = 0; i < input_num; ++i) { | |||
| @@ -2274,6 +2272,7 @@ void AnfRuntimeAlgorithm::InferShape(const CNodePtr &node, std::map<uint32_t, te | |||
| AbstractBasePtrList args_spec_list; | |||
| auto primitive = GetValueNode<PrimitivePtr>(inputs[0]); | |||
| auto input_size = AnfAlgo::GetInputTensorNum(node); | |||
| std::vector<AnfNodePtr> input_nodes; | |||
| for (size_t i = 0; i < input_size; ++i) { | |||
| auto input_with_index = AnfAlgo::GetPrevNodeOutput(node, i); | |||
| auto real_input = input_with_index.first; | |||
| @@ -2289,9 +2288,12 @@ void AnfRuntimeAlgorithm::InferShape(const CNodePtr &node, std::map<uint32_t, te | |||
| // sync data from device to host | |||
| tensor_ptr->data_sync(); | |||
| } | |||
| auto real_abs = real_input->abstract(); | |||
| // cppcheck-suppress unreadVariable | |||
| auto lock = AnfUtils::GetAbstractLock(real_input.get()); | |||
| MS_EXCEPTION_IF_NULL(real_input->abstract()); | |||
| auto real_abs = real_input->abstract()->Clone(); | |||
| if (real_abs->isa<abstract::AbstractTensor>()) { | |||
| real_input->abstract()->set_value(tensor_ptr); | |||
| real_abs->set_value(tensor_ptr); | |||
| } else if (real_abs->isa<abstract::AbstractTuple>()) { | |||
| auto tuple_get_item_index = AnfAlgo::GetTupleGetItemOutIndex(cnode_input->cast<CNodePtr>()); | |||
| auto abstract_tuple = real_abs->cast<abstract::AbstractTuplePtr>(); | |||
| @@ -2299,15 +2301,27 @@ void AnfRuntimeAlgorithm::InferShape(const CNodePtr &node, std::map<uint32_t, te | |||
| auto tuple_elements = abstract_tuple->elements()[tuple_get_item_index]; | |||
| tuple_elements->set_value(tensor_ptr); | |||
| } | |||
| real_input->set_abstract(real_abs); | |||
| } | |||
| } | |||
| AddArgList(&args_spec_list, cnode_input, real_input, i); | |||
| bool is_cnode_input = AddArgList(&args_spec_list, cnode_input, real_input, i); | |||
| if (is_cnode_input) { | |||
| input_nodes.push_back(cnode_input); | |||
| } else { | |||
| input_nodes.push_back(real_input); | |||
| } | |||
| } | |||
| std::vector<AbstractScope> locks; | |||
| std::transform(input_nodes.begin(), input_nodes.end(), std::back_inserter(locks), | |||
| [](const AnfNodePtr &input) { return AnfUtils::GetAbstractLock(input.get()); }); | |||
| auto eval_result = opt::CppInferShape(primitive, args_spec_list); | |||
| locks.clear(); | |||
| // cppcheck-suppress unreadVariable | |||
| auto lock = AnfUtils::GetAbstractLock(node.get()); | |||
| node->set_abstract(eval_result); | |||
| } | |||
| void AnfRuntimeAlgorithm::AddArgList(AbstractBasePtrList *args_spec_list, const AnfNodePtr &cnode_input, | |||
| bool AnfRuntimeAlgorithm::AddArgList(AbstractBasePtrList *args_spec_list, const AnfNodePtr &cnode_input, | |||
| const AnfNodePtr &real_input, size_t index) { | |||
| if (AnfAlgo::CheckPrimitiveType(cnode_input, prim::kPrimTupleGetItem)) { | |||
| auto base_shape = real_input->Shape(); | |||
| @@ -2315,15 +2329,24 @@ void AnfRuntimeAlgorithm::AddArgList(AbstractBasePtrList *args_spec_list, const | |||
| MS_LOG(EXCEPTION) << "Node input is a tuple_get_item but real input node shape is not a TupleShape. trace: " | |||
| << trace::DumpSourceLines(real_input); | |||
| } | |||
| // cppcheck-suppress unreadVariable | |||
| auto lock = AnfUtils::GetAbstractLock(real_input.get()); | |||
| auto abs = real_input->abstract()->cast<abstract::AbstractTuplePtr>(); | |||
| MS_EXCEPTION_IF_NULL(abs); | |||
| auto tuple_get_item_indexk = AnfAlgo::GetTupleGetItemOutIndex(cnode_input->cast<CNodePtr>()); | |||
| auto abs_i = abs->elements()[tuple_get_item_indexk]; | |||
| (void)args_spec_list->emplace_back(abs_i); | |||
| return false; | |||
| } else if (cnode_input->isa<CNode>() && AnfAlgo::GetCNodeName(cnode_input) == prim::kPrimReshape->name()) { | |||
| // cppcheck-suppress unreadVariable | |||
| auto lock = AnfUtils::GetAbstractLock(cnode_input.get()); | |||
| (void)args_spec_list->emplace_back(cnode_input->abstract()); | |||
| return true; | |||
| } else { | |||
| // cppcheck-suppress unreadVariable | |||
| auto lock = AnfUtils::GetAbstractLock(real_input.get()); | |||
| (void)args_spec_list->emplace_back(real_input->abstract()); | |||
| return false; | |||
| } | |||
| } | |||
| @@ -288,6 +288,7 @@ class AnfRuntimeAlgorithm { | |||
| static TypeId GetCNodeOutputPrecision(const AnfNodePtr &node); | |||
| // get fix output precision from prev node, input_idx is the input index of current node related to prev node. | |||
| static TypeId GetPrevNodeOutputPrecision(const AnfNodePtr &node, size_t input_idx); | |||
| static bool IsNodeInputDynamicShape(const CNodePtr &anf_node_ptr); | |||
| static bool IsDynamicShape(const AnfNodePtr &node); | |||
| static bool HasDynamicShapeFlag(const PrimitivePtr &prim); | |||
| static bool IsCondControlKernel(const CNodePtr &node); | |||
| @@ -302,7 +303,8 @@ class AnfRuntimeAlgorithm { | |||
| static bool IsNodeDynamicShape(const AnfNodePtr &node); | |||
| static bool IsHostKernel(const CNodePtr &node); | |||
| static void InferShape(const CNodePtr &node, std::map<uint32_t, tensor::TensorPtr> *depend_tensors = nullptr); | |||
| static void AddArgList(AbstractBasePtrList *args_spec_list, const AnfNodePtr &cnode_input, | |||
| // return true if use cnode_input's abstract, false if use real_input's abstract | |||
| static bool AddArgList(AbstractBasePtrList *args_spec_list, const AnfNodePtr &cnode_input, | |||
| const AnfNodePtr &real_input, size_t index); | |||
| static std::vector<size_t> GetInputRealDeviceShapeIfExist(const AnfNodePtr &anf_node, size_t index); | |||
| static std::vector<size_t> GetOutputRealDeviceShapeIfExist(const AnfNodePtr &anf_node, size_t index); | |||
| @@ -123,8 +123,7 @@ void AscendEnableDynamicRuntimeCache(const KernelGraph *graph) { | |||
| } | |||
| MS_EXCEPTION_IF_NULL(kernel_info); | |||
| auto runtime_cache = kernel_info->runtime_cache(); | |||
| MS_EXCEPTION_IF_NULL(runtime_cache); | |||
| runtime_cache->set_valid(); | |||
| runtime_cache.runtime_cache().set_valid(); | |||
| } | |||
| } | |||
| } // namespace | |||
| @@ -37,21 +37,21 @@ class OpTilingCalculateAdapter { | |||
| OpTilingCalculateAdapter() = default; | |||
| ~OpTilingCalculateAdapter() = default; | |||
| ge::Operator AnfNodeToGeNodeAdapter(const CNodePtr &node, ge::ComputeGraphPtr *ge_graph, | |||
| const std::map<uint32_t, tensor::TensorPtr> &depend_tensor_map, | |||
| const std::string &op_compile_info); | |||
| ::ge::Operator AnfNodeToGeNodeAdapter(const CNodePtr &node, ::ge::ComputeGraphPtr *ge_graph, | |||
| const std::map<uint32_t, tensor::TensorPtr> &depend_tensor_map, | |||
| const std::string &op_compile_info); | |||
| private: | |||
| void ConvertInputShapeAndType(const CNodePtr &node, ge::OpDescPtr *op_desc); | |||
| void ConvertOutputShapeAndType(const CNodePtr &node, ge::OpDescPtr *op_desc); | |||
| void ConvertCompileInfo(const CNodePtr &node, ge::OpDescPtr *op_desc); | |||
| void ConvertAttrs(const CNodePtr &node, ge::OpDescPtr *op_desc); | |||
| std::vector<std::tuple<std::size_t, ge::NodePtr>> ConvertDepends( | |||
| const CNodePtr &node, const std::map<uint32_t, tensor::TensorPtr> &depend_tensor_map, ge::OpDescPtr *op_desc, | |||
| ge::ComputeGraphPtr *ge_graph); | |||
| ge::NodePtr NewConstantOp(const CNodePtr &node, const std::string &name, const tensor::TensorPtr &tensor_data, | |||
| ge::ComputeGraphPtr *ge_graph, size_t index); | |||
| void AddEdge(const ge::NodePtr &ge_node, const std::vector<std::tuple<std::size_t, ge::NodePtr>> &constant_ops); | |||
| void ConvertInputShapeAndType(const CNodePtr &node, ::ge::OpDescPtr *op_desc); | |||
| void ConvertOutputShapeAndType(const CNodePtr &node, ::ge::OpDescPtr *op_desc); | |||
| void ConvertCompileInfo(const CNodePtr &node, ::ge::OpDescPtr *op_desc); | |||
| void ConvertAttrs(const CNodePtr &node, ::ge::OpDescPtr *op_desc); | |||
| std::vector<std::tuple<std::size_t, ::ge::NodePtr>> ConvertDepends( | |||
| const CNodePtr &node, const std::map<uint32_t, tensor::TensorPtr> &depend_tensor_map, ::ge::OpDescPtr *op_desc, | |||
| ::ge::ComputeGraphPtr *ge_graph); | |||
| ::ge::NodePtr NewConstantOp(const CNodePtr &node, const std::string &name, const tensor::TensorPtr &tensor_data, | |||
| ::ge::ComputeGraphPtr *ge_graph, size_t index); | |||
| void AddEdge(const ::ge::NodePtr &ge_node, const std::vector<std::tuple<std::size_t, ::ge::NodePtr>> &constant_ops); | |||
| std::string GetRealOpType(const std::string &op_type); | |||
| std::string GetInputName(const CNodePtr &node, size_t index); | |||
| std::string GetOutputName(const CNodePtr &node, size_t index); | |||
| @@ -103,7 +103,7 @@ void DynamicKernel::InferShape() { | |||
| tuple_elements->set_value(out_tensor); | |||
| } | |||
| } | |||
| AnfAlgo::AddArgList(&args_spec_list, cnode_input, real_input, i); | |||
| (void)AnfAlgo::AddArgList(&args_spec_list, cnode_input, real_input, i); | |||
| } | |||
| auto eval_result = opt::CppInferShape(primitive, args_spec_list); | |||
| cnode->set_abstract(eval_result); | |||
| @@ -164,8 +164,7 @@ class DeviceContext { | |||
| } | |||
| MS_EXCEPTION_IF_NULL(kernel_info); | |||
| auto runtime_cache = kernel_info->runtime_cache(); | |||
| MS_EXCEPTION_IF_NULL(runtime_cache); | |||
| runtime_cache->set_valid(); | |||
| runtime_cache.runtime_cache().set_valid(); | |||
| } | |||
| } | |||
| @@ -28,8 +28,21 @@ | |||
| #include "ir/func_graph.h" | |||
| #include "ir/primitive.h" | |||
| #include "utils/ms_context.h" | |||
| #include "utils/anf_utils.h" | |||
| namespace mindspore { | |||
| const AbstractBasePtr &AnfNode::abstract() const { | |||
| // cppcheck-suppress unreadVariable | |||
| auto lock = AnfUtils::GetAbstractLock(this); | |||
| return abstract_; | |||
| } | |||
| void AnfNode::set_abstract(const AbstractBasePtr &abs) { | |||
| // cppcheck-suppress unreadVariable | |||
| auto lock = AnfUtils::GetAbstractLock(this); | |||
| abstract_ = abs; | |||
| } | |||
| // namespace to support intermediate representation definition | |||
| CNode::CNode(const std::vector<AnfNodePtr> &inputs, const FuncGraphPtr &func_graph) | |||
| : AnfNode(func_graph), | |||
| @@ -574,9 +587,8 @@ std::string GetCNodeTarget(const AnfNodePtr &node) { | |||
| auto kernel_info = node->kernel_info(); | |||
| if (kernel_info != nullptr) { | |||
| auto runtime_cache = kernel_info->runtime_cache(); | |||
| MS_EXCEPTION_IF_NULL(runtime_cache); | |||
| if (runtime_cache->is_valid()) { | |||
| auto tmp_target = runtime_cache->device_target(); | |||
| if (runtime_cache.runtime_cache().is_valid()) { | |||
| auto tmp_target = runtime_cache.runtime_cache().device_target(); | |||
| if (!tmp_target.empty()) { | |||
| return tmp_target; | |||
| } | |||
| @@ -595,9 +607,8 @@ std::string GetCNodeTarget(const AnfNodePtr &node) { | |||
| if (kernel_info != nullptr) { | |||
| auto runtime_cache = kernel_info->runtime_cache(); | |||
| MS_EXCEPTION_IF_NULL(runtime_cache); | |||
| if (runtime_cache->is_valid()) { | |||
| runtime_cache->set_device_target(target); | |||
| if (runtime_cache.runtime_cache().is_valid()) { | |||
| runtime_cache.runtime_cache().set_device_target(target); | |||
| } | |||
| } | |||
| return target; | |||
| @@ -178,12 +178,12 @@ class MS_CORE_API AnfNode : public Base { | |||
| /// \brief Obtain the inferred abstract value of this AnfNode. | |||
| /// | |||
| /// \return The inferred abstract value. | |||
| const AbstractBasePtr &abstract() const { return abstract_; } | |||
| const AbstractBasePtr &abstract() const; | |||
| /// \brief Set the abstract value of this AnfNode. | |||
| /// | |||
| /// \param[in] abs New abstract value. | |||
| void set_abstract(const AbstractBasePtr &abs) { abstract_ = abs; } | |||
| void set_abstract(const AbstractBasePtr &abs); | |||
| /// \brief Obtain the intermediate abstract value of this AnfNode. | |||
| /// | |||
| @@ -24,12 +24,21 @@ | |||
| #include "ir/visitor.h" | |||
| #include "ir/func_graph.h" | |||
| #include "base/core_ops.h" | |||
| #include "utils/anf_utils.h" | |||
| namespace mindspore { | |||
| // namespace to support intermediate representation definition | |||
| // Methods of AnfNode | |||
| TypePtr AnfNode::Type() const { return (abstract_ == nullptr) ? nullptr : abstract_->BuildType(); } | |||
| BaseShapePtr AnfNode::Shape() const { return (abstract_ == nullptr) ? nullptr : abstract_->BuildShape(); } | |||
| TypePtr AnfNode::Type() const { | |||
| // cppcheck-suppress unreadVariable | |||
| auto lock = AnfUtils::GetAbstractLock(this); | |||
| return (abstract_ == nullptr) ? nullptr : abstract_->BuildType(); | |||
| } | |||
| BaseShapePtr AnfNode::Shape() const { | |||
| // cppcheck-suppress unreadVariable | |||
| auto lock = AnfUtils::GetAbstractLock(this); | |||
| return (abstract_ == nullptr) ? nullptr : abstract_->BuildShape(); | |||
| } | |||
| std::string AnfNode::ToString() const { | |||
| return mindspore::label_manage::Label(const_cast<AnfNode *>(this)->shared_from_base<AnfNode>()->debug_info()); | |||
| @@ -68,13 +68,26 @@ class RuntimeCache { | |||
| // Interface for device kernel program information. | |||
| class KernelInfoDevice { | |||
| public: | |||
| class RuntimeCacheScope { | |||
| public: | |||
| RuntimeCacheScope(RuntimeCache &base, std::mutex &mu) : runtime_cache_(base), mu_(mu) { mu_.lock(); } | |||
| RuntimeCacheScope(const RuntimeCacheScope &other) = delete; | |||
| RuntimeCacheScope operator=(const RuntimeCacheScope &other) = delete; | |||
| ~RuntimeCacheScope() { mu_.unlock(); } | |||
| RuntimeCache &runtime_cache() { return runtime_cache_; } | |||
| private: | |||
| RuntimeCache &runtime_cache_; | |||
| std::mutex &mu_; | |||
| }; | |||
| // If kernel program was built and build info is set. | |||
| virtual bool has_build_info() const = 0; | |||
| RuntimeCache *runtime_cache() { return &runtime_cache_; } | |||
| RuntimeCacheScope runtime_cache() { return RuntimeCacheScope(runtime_cache_, mu_); } | |||
| private: | |||
| RuntimeCache runtime_cache_; | |||
| std::mutex mu_; | |||
| }; | |||
| using KernelInfoDevicePtr = std::shared_ptr<KernelInfoDevice>; | |||
| } // namespace mindspore | |||
| @@ -15,6 +15,7 @@ | |||
| */ | |||
| #include "utils/anf_utils.h" | |||
| #include <map> | |||
| #include <string> | |||
| #include "base/core_ops.h" | |||
| #include "utils/trace_base.h" | |||
| @@ -23,8 +24,52 @@ | |||
| namespace mindspore { | |||
| namespace { | |||
| const PrimitiveSet follow_first_input_prims = {prim::kPrimDepend, prim::kPrimLoad}; | |||
| class AbstractMutexManager { | |||
| public: | |||
| static AbstractMutexManager &GetInstance() { | |||
| static AbstractMutexManager instance; | |||
| return instance; | |||
| } | |||
| AbstractScope GetAbstractLock(const AnfNode *node) { | |||
| std::lock_guard<std::recursive_mutex> lock(mu_); | |||
| return AbstractScope(&mu_for_nodes_[node]); | |||
| } | |||
| private: | |||
| std::map<const AnfNode *, std::recursive_mutex> mu_for_nodes_; | |||
| std::recursive_mutex mu_; | |||
| }; | |||
| } // namespace | |||
| AbstractScope::AbstractScope(std::recursive_mutex *mu) { | |||
| MS_EXCEPTION_IF_NULL(mu); | |||
| mu_ = mu; | |||
| mu_->lock(); | |||
| } | |||
| AbstractScope::AbstractScope(AbstractScope &&other) { | |||
| mu_ = other.mu_; | |||
| other.mu_ = nullptr; | |||
| } | |||
| AbstractScope &AbstractScope::operator=(AbstractScope &&other) { | |||
| mu_ = other.mu_; | |||
| other.mu_ = nullptr; | |||
| return *this; | |||
| } | |||
| AbstractScope::~AbstractScope() { | |||
| if (mu_ != nullptr) { | |||
| mu_->unlock(); | |||
| } | |||
| } | |||
| AbstractScope AnfUtils::GetAbstractLock(const AnfNode *node) { | |||
| return AbstractMutexManager::GetInstance().GetAbstractLock(node); | |||
| } | |||
| bool AnfUtils::IsDimUnknown(const abstract::ShapePtr &shape) { | |||
| MS_EXCEPTION_IF_NULL(shape); | |||
| return std::any_of(shape->shape().begin(), shape->shape().end(), [](int64_t s) { return s < -1; }); | |||
| @@ -112,20 +157,18 @@ bool AnfUtils::IsRealKernel(const AnfNodePtr &node) { | |||
| auto kernel_info = cnode->kernel_info(); | |||
| if (kernel_info) { | |||
| auto runtime_cache = kernel_info->runtime_cache(); | |||
| MS_EXCEPTION_IF_NULL(runtime_cache); | |||
| if (runtime_cache->is_real_kernel() != CacheBool::UNCACHED) { | |||
| return (runtime_cache->is_real_kernel() == CacheBool::TRUE); | |||
| if (runtime_cache.runtime_cache().is_real_kernel() != CacheBool::UNCACHED) { | |||
| return (runtime_cache.runtime_cache().is_real_kernel() == CacheBool::TRUE); | |||
| } | |||
| } | |||
| bool res = !IsOneOfPrimitive(cnode->input(kAnfPrimitiveIndex), virtual_prims); | |||
| if (kernel_info) { | |||
| auto runtime_cache = kernel_info->runtime_cache(); | |||
| MS_EXCEPTION_IF_NULL(runtime_cache); | |||
| if (res) { | |||
| runtime_cache->set_real_kernel(CacheBool::TRUE); | |||
| runtime_cache.runtime_cache().set_real_kernel(CacheBool::TRUE); | |||
| } else { | |||
| runtime_cache->set_real_kernel(CacheBool::FALSE); | |||
| runtime_cache.runtime_cache().set_real_kernel(CacheBool::FALSE); | |||
| } | |||
| } | |||
| @@ -175,10 +218,15 @@ size_t AnfUtils::GetInputTensorNum(const AnfNodePtr &node) { | |||
| MS_LOG(EXCEPTION) << "Only cnode has real input, but this anf is " << node->DebugString() | |||
| << trace::DumpSourceLines(node); | |||
| } | |||
| ssize_t input_tensor_num = cnode->input_tensor_num(); | |||
| if (input_tensor_num >= 0) { | |||
| return static_cast<size_t>(input_tensor_num); | |||
| { | |||
| // cppcheck-suppress unreadVariable | |||
| auto lock = AnfUtils::GetAbstractLock(node.get()); | |||
| ssize_t input_tensor_num = cnode->input_tensor_num(); | |||
| if (input_tensor_num >= 0) { | |||
| return static_cast<size_t>(input_tensor_num); | |||
| } | |||
| } | |||
| size_t input_num = cnode->inputs().size(); | |||
| if (input_num == 0) { | |||
| MS_LOG(EXCEPTION) << "Cnode inputs size can't be zero" << trace::DumpSourceLines(node); | |||
| @@ -191,6 +239,8 @@ size_t AnfUtils::GetInputTensorNum(const AnfNodePtr &node) { | |||
| auto &inputs = cnode->inputs(); | |||
| // Search monad inputs, backward. | |||
| for (auto iter = inputs.rbegin(); iter != inputs.rend(); ++iter) { | |||
| // cppcheck-suppress unreadVariable | |||
| auto lock = AnfUtils::GetAbstractLock(node.get()); | |||
| if (!HasAbstractMonad(*iter)) { | |||
| // Stop count if we encounter a non-monad input. | |||
| break; | |||
| @@ -198,6 +248,8 @@ size_t AnfUtils::GetInputTensorNum(const AnfNodePtr &node) { | |||
| --input_num; | |||
| } | |||
| } | |||
| // cppcheck-suppress unreadVariable | |||
| auto lock = AnfUtils::GetAbstractLock(node.get()); | |||
| cnode->set_input_tensor_num(static_cast<ssize_t>(input_num)); | |||
| return input_num; | |||
| } | |||
| @@ -207,8 +259,8 @@ size_t AnfUtils::GetOutputTensorNum(const AnfNodePtr &node) { | |||
| auto kernel_info = node->kernel_info(); | |||
| if (kernel_info) { | |||
| auto runtime_cache = kernel_info->runtime_cache(); | |||
| if (runtime_cache->is_valid()) { | |||
| ssize_t output_tensor_num = runtime_cache->output_tensor_num(); | |||
| if (runtime_cache.runtime_cache().is_valid()) { | |||
| ssize_t output_tensor_num = runtime_cache.runtime_cache().output_tensor_num(); | |||
| if (output_tensor_num >= 0) { | |||
| return static_cast<size_t>(output_tensor_num); | |||
| } | |||
| @@ -231,8 +283,8 @@ size_t AnfUtils::GetOutputTensorNum(const AnfNodePtr &node) { | |||
| if (kernel_info) { | |||
| auto runtime_cache = kernel_info->runtime_cache(); | |||
| if (runtime_cache->is_valid()) { | |||
| runtime_cache->set_output_tensor_num(static_cast<ssize_t>(res)); | |||
| if (runtime_cache.runtime_cache().is_valid()) { | |||
| runtime_cache.runtime_cache().set_output_tensor_num(static_cast<ssize_t>(res)); | |||
| } | |||
| } | |||
| return res; | |||
| @@ -25,6 +25,19 @@ | |||
| #include "ir/primitive.h" | |||
| namespace mindspore { | |||
| class AbstractScope { | |||
| public: | |||
| explicit AbstractScope(std::recursive_mutex *mu); | |||
| AbstractScope(const AbstractScope &other) = delete; | |||
| AbstractScope operator=(const AbstractScope &other) = delete; | |||
| AbstractScope(AbstractScope &&other); | |||
| AbstractScope &operator=(AbstractScope &&other); | |||
| ~AbstractScope(); | |||
| private: | |||
| std::recursive_mutex *mu_; | |||
| }; | |||
| class AnfUtils { | |||
| public: | |||
| static bool IsDimUnknown(const abstract::ShapePtr &shape); | |||
| @@ -52,6 +65,7 @@ class AnfUtils { | |||
| static void SetDumpFlag(const AnfNodePtr &node); | |||
| // Get dump flag from CNode's primitive. | |||
| static bool GetDumpFlag(const AnfNodePtr &node); | |||
| static AbstractScope GetAbstractLock(const AnfNode *node); | |||
| }; | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CORE_UTILS_ANF_UTILS_H_ | |||
| @@ -182,6 +182,13 @@ file(GLOB_RECURSE MINDSPORE_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} | |||
| "../../../mindspore/ccsrc/profiler/device/ascend/*.cc" | |||
| "../../../mindspore/ccsrc/profiler/device/profiling.cc" | |||
| "../../../mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/adam_fp32.c" | |||
| "../../../mindspore/ccsrc/backend/kernel_compiler/kernel.cc" | |||
| "../../../mindspore/ccsrc/backend/kernel_compiler/ascend_kernel_mod.cc" | |||
| "../../../mindspore/ccsrc/backend/optimizer/common/helper.cc" | |||
| "../../../mindspore/ccsrc/runtime/device/ascend/executor/tiling/op_tiling_adapter.cc" | |||
| "../../../mindspore/ccsrc/runtime/device/ascend/executor/aicpu_ext_info_handle.cc" | |||
| "../../../mindspore/ccsrc/runtime/device/ascend/ge_types_convert.cc" | |||
| "../../../mindspore/ccsrc/backend/kernel_compiler/aicpu/aicpu_util.cc" | |||
| ) | |||
| if(ENABLE_SECURITY) | |||
| @@ -230,6 +237,24 @@ add_dependencies(_ut_ut_obj engine-cache-server graph) | |||
| add_executable(ut_tests $<TARGET_OBJECTS:_ut_ut_obj> | |||
| $<TARGET_OBJECTS:_ut_mindspore_obj>) | |||
| include_directories("${CMAKE_BINARY_DIR}/backend/kernel_compiler/aicpu") | |||
| file(GLOB_RECURSE PROTO_IN RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} | |||
| "../../../mindspore/ccsrc/backend/kernel_compiler/aicpu/proto/*.proto") | |||
| ms_protobuf_generate(PROTOSRCS PROTOHDRS ${PROTO_IN}) | |||
| file(GLOB_RECURSE PROTO_DUMP RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} | |||
| "../../../mindspore/ccsrc/runtime/device/ascend/dump/proto/*.proto") | |||
| ms_protobuf_generate(DUMP_PROTOSRCS PROTOHDRS ${PROTO_DUMP}) | |||
| list(APPEND MINDSPORE_PROTO_LIST ${PROTOSRCS}) | |||
| list(APPEND MINDSPORE_PROTO_LIST ${PREDICT_PROTOSRCS}) | |||
| list(APPEND MINDSPORE_PROTO_LIST ${DUMP_PROTOSRCS}) | |||
| if(MINDSPORE_PROTO_LIST) | |||
| add_library(proto_input_ut STATIC ${MINDSPORE_PROTO_LIST}) | |||
| set_target_properties(proto_input_ut PROPERTIES COMPILE_FLAGS "-Wno-unused-variable") | |||
| endif() | |||
| if(ENABLE_GE) | |||
| if(ENABLE_TRAIN) | |||
| target_link_libraries(ut_tests PRIVATE graph ge_runner) | |||
| @@ -0,0 +1,75 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "register/op_tiling_info.h" | |||
| #include "register/op_tiling.h" | |||
| namespace optiling { | |||
| using std::make_shared; | |||
| extern "C" ge::graphStatus OpParaCalculateV2(const ge::Operator &op, OpRunInfoV2 &run_info) { | |||
| return ge::GRAPH_SUCCESS; | |||
| } | |||
| namespace utils { | |||
| OpRunInfo::OpRunInfo() {} | |||
| OpRunInfo::OpRunInfo(const uint32_t &block_dim, const bool &clear_atomic, const uint64_t &tiling_key) {} | |||
| OpRunInfo::OpRunInfo(const OpRunInfo &runinfo) {} | |||
| OpRunInfo::OpRunInfo(OpRunInfo &&runinfo) {} | |||
| OpRunInfo &OpRunInfo::operator=(const OpRunInfo &runinfo) { return *this; } | |||
| OpRunInfo &OpRunInfo::operator=(OpRunInfo &&runinfo) { return *this; } | |||
| void OpRunInfo::SetBlockDim(const uint32_t &block_dim) { return; } | |||
| uint32_t OpRunInfo::GetBlockDim() const { return 0; } | |||
| void OpRunInfo::AddWorkspace(const int64_t &workspace) { return; } | |||
| size_t OpRunInfo::GetWorkspaceNum() const { return 0; } | |||
| ge::graphStatus OpRunInfo::GetWorkspace(const size_t &idx, int64_t &workspace) const { return ge::GRAPH_SUCCESS; } | |||
| void OpRunInfo::GetAllWorkspaces(std::vector<int64_t> &workspaces) const { return; } | |||
| void OpRunInfo::SetWorkspaces(const std::vector<int64_t> &workspaces) { return; } | |||
| void OpRunInfo::InternelSetTiling(const ByteBuffer &value) { return; } | |||
| void OpRunInfo::AddTilingData(const char *_value, size_t _size) { return; } | |||
| ByteBuffer &OpRunInfo::GetAllTilingData() { | |||
| std::shared_ptr<ByteBuffer> tiling_data = std::make_shared<ByteBuffer>(); | |||
| return *tiling_data; | |||
| } | |||
| const ByteBuffer &OpRunInfo::GetAllTilingData() const { | |||
| std::shared_ptr<ByteBuffer> tiling_data = std::make_shared<ByteBuffer>(); | |||
| return *tiling_data; | |||
| } | |||
| void OpRunInfo::SetClearAtomic(bool clear_atomic_input) { return; } | |||
| bool OpRunInfo::GetClearAtomic() const { return true; } | |||
| void OpRunInfo::SetTilingKey(const uint64_t &new_tiling_key) { return; } | |||
| uint64_t OpRunInfo::GetTilingKey() const { return 0; } | |||
| } // namespace utils | |||
| } // namespace optiling | |||
| @@ -211,3 +211,9 @@ RTS_API rtError_t rtMemGetInfoEx(rtMemInfoType_t memInfoType, size_t *free, size | |||
| RTS_API rtError_t rtProfRegisterCtrlCallback(uint32_t moduleId, rtProfCtrlHandle callback) { return RT_ERROR_NONE; } | |||
| RTS_API rtError_t rtGetRtCapability(rtFeatureType_t, int32_t, int64_t *) { return RT_ERROR_NONE; } | |||
| RTS_API rtError_t rtKernelLaunchWithHandle(void *handle, const void *devFunc, uint32_t blockDim, void *args, | |||
| uint32_t argsSize, rtSmDesc_t *smDesc, rtStream_t stream, | |||
| const void *kernelInfo) { | |||
| return RT_ERROR_NONE; | |||
| } | |||