From: @lvchangquan Reviewed-by: @kisnwang, @chujinjin Signed-off-by: @chujinjin pull/14052/MERGE
| @@ -27,6 +27,7 @@ | |||
| #include "runtime/device/kernel_runtime_manager.h" | |||
| #include "runtime/device/ascend/ascend_event.h" | |||
| #include "runtime/device/ascend/ascend_launch_mul.h" | |||
| #include "runtime/device/ascend/ascend_launch_atomic_clean.h" | |||
| #include "utils/profile.h" | |||
| #define CHECK_ASCEND_RT_WITH_EXCEPTION(expression, message) \ | |||
| @@ -90,16 +91,18 @@ void AscendBucket::FreeAllDeviceMem() { | |||
| ar_output_addr_ = nullptr; | |||
| } | |||
| // clear launch mul device Memory | |||
| if (launch_kernel != nullptr) { | |||
| launch_kernel->FreeLaunchDeviceMem(); | |||
| if (launch_mul_ != nullptr) { | |||
| launch_mul_->FreeLaunchDeviceMem(); | |||
| } | |||
| // clear launch atomic clean device Memory | |||
| if (launch_atomic_clean_ != nullptr) { | |||
| launch_atomic_clean_->FreeLaunchDeviceMem(); | |||
| } | |||
| } | |||
| void AscendBucket::CopyTensorToContiguousMemory() { | |||
| // Clean input addr | |||
| CHECK_ASCEND_RT_WITH_EXCEPTION(rtMemsetAsync(ar_input_addr_, total_size_, 0, total_size_, compute_stream_), | |||
| "Call rtMemsetAsync failed"); | |||
| // clear allreduce input addr | |||
| CleanAllReduceInputAddr(); | |||
| for (size_t i = 0; i < bucket_size_; ++i) { | |||
| MS_EXCEPTION_IF_NULL(memcpy_input_addrs_[i]); | |||
| MS_EXCEPTION_IF_NULL(memcpy_output_addrs_[i]); | |||
| @@ -151,15 +154,36 @@ void AscendBucket::LaunchAllReduce() { | |||
| } | |||
| } | |||
| std::shared_ptr<LaunchKernel> AscendBucket::CreateLaunchKernel() { | |||
| void AscendBucket::CleanAllReduceInputAddr() { | |||
| if (launch_atomic_clean_ == nullptr) { | |||
| launch_atomic_clean_ = CreateLaunchAtomicClean(); | |||
| MS_EXCEPTION_IF_NULL(launch_atomic_clean_); | |||
| } | |||
| // set atomic clean input addr | |||
| launch_atomic_clean_->SetInputAddr(ar_input_addr_); | |||
| // launch atomic clean | |||
| launch_atomic_clean_->LaunchOpKernel(); | |||
| } | |||
| std::shared_ptr<LaunchKernel> AscendBucket::CreateLaunchMul() { | |||
| if (tensor_type_list_.empty()) { | |||
| MS_LOG(ERROR) << "tensor_type_list_ is empty"; | |||
| } | |||
| auto launch_mul = std::make_shared<AscendLaunchMul>(stream_, tensor_type_list_[0], total_size_, ar_output_addr_); | |||
| auto launch_mul = std::make_shared<AscendLaunchMul>(stream_, tensor_type_list_[0], total_size_); | |||
| MS_EXCEPTION_IF_NULL(launch_mul); | |||
| return launch_mul; | |||
| } | |||
| std::shared_ptr<LaunchKernel> AscendBucket::CreateLaunchAtomicClean() { | |||
| if (tensor_type_list_.empty()) { | |||
| MS_LOG(ERROR) << "tensor_type_list_ is empty"; | |||
| } | |||
| auto launch_atomic_clean = | |||
| std::make_shared<AscendLaunchAtomicClean>(compute_stream_, tensor_type_list_[0], total_size_); | |||
| MS_EXCEPTION_IF_NULL(launch_atomic_clean); | |||
| return launch_atomic_clean; | |||
| } | |||
| void AscendBucket::Init() { | |||
| pre_event_ = std::make_shared<AscendEvent>(); | |||
| post_event_ = std::make_shared<AscendEvent>(); | |||
| @@ -34,7 +34,9 @@ class AscendBucket : public Bucket { | |||
| void FreeDeviceMem(void *dev_ptr) override; | |||
| void CopyTensorToContiguousMemory() override; | |||
| void LaunchAllReduce() override; | |||
| std::shared_ptr<LaunchKernel> CreateLaunchKernel() override; | |||
| std::shared_ptr<LaunchKernel> CreateLaunchMul() override; | |||
| std::shared_ptr<LaunchKernel> CreateLaunchAtomicClean(); | |||
| void CleanAllReduceInputAddr(); | |||
| }; | |||
| } // namespace mindspore::device::ascend | |||
| #endif // MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_ASCEND_BUCKET_H_ | |||
| @@ -0,0 +1,114 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "runtime/device/ascend/ascend_launch_atomic_clean.h" | |||
| #include <memory> | |||
| #include <vector> | |||
| #include "abstract/utils.h" | |||
| #include "backend/session/single_kernel_graph.h" | |||
| #include "backend/session/anf_runtime_algorithm.h" | |||
| #include "debug/anf_ir_dump.h" | |||
| namespace mindspore::device::ascend { | |||
| void AscendLaunchAtomicClean::FreeDeviceMem(void *addr) { AscendLaunchKernel::FreeDeviceMem(addr); } | |||
| size_t AscendLaunchAtomicClean::AlignSizeForLaunchKernel(size_t size) { | |||
| return AscendLaunchKernel::AlignSizeForLaunchKernel(size); | |||
| } | |||
| uint8_t *AscendLaunchAtomicClean::AllocDeviceMem(size_t size) { return AscendLaunchKernel::AllocDeviceMem(size); } | |||
| void AscendLaunchAtomicClean::KernelSelect(std::shared_ptr<session::KernelGraph> kernel_graph) { | |||
| AscendLaunchKernel::KernelSelect(kernel_graph); | |||
| } | |||
| void AscendLaunchAtomicClean::KernelBuild(std::shared_ptr<session::KernelGraph> kernel_graph) { | |||
| AscendLaunchKernel::KernelBuild(kernel_graph); | |||
| } | |||
| void AscendLaunchAtomicClean::LaunchOpKernel() { | |||
| if (atomic_clean_graph_ == nullptr) { | |||
| // construct atomic clean kernel graph and set attr | |||
| ConstructKernelGraphAndSetAttr(); | |||
| // kernel build | |||
| KernelBuild(atomic_clean_graph_); | |||
| } | |||
| // obtain kernel_mod | |||
| if (atomic_clean_graph_->execution_order().size() != 1) { | |||
| MS_LOG(ERROR) << "the execution order of the atomic clean graph should have only one node"; | |||
| } | |||
| kernel_mod_ = AnfAlgo::GetKernelMod(atomic_clean_graph_->execution_order()[0]); | |||
| MS_EXCEPTION_IF_NULL(kernel_mod_); | |||
| // obtain kernel inputs | |||
| std::vector<kernel::AddressPtr> kernel_inputs; | |||
| auto input = std::make_shared<kernel::Address>(); | |||
| MS_EXCEPTION_IF_NULL(input); | |||
| input->addr = input_addr_; | |||
| MS_EXCEPTION_IF_NULL(input->addr); | |||
| input->size = total_size_; | |||
| kernel_inputs.push_back(input); | |||
| // obtain kernel outputs | |||
| auto kernel_outputs = ObtainKernelOutputs(kernel_mod_->GetOutputSizeList()); | |||
| // obtain kernel workspace | |||
| auto kernel_workspaces = ObtainKernelWorkspaces(kernel_mod_->GetWorkspaceSizeList()); | |||
| // launch | |||
| auto ret_status = kernel_mod_->Launch(kernel_inputs, kernel_workspaces, kernel_outputs, stream_); | |||
| if (!ret_status) { | |||
| MS_LOG(ERROR) << "Launch single kernel failed."; | |||
| } | |||
| } | |||
| void AscendLaunchAtomicClean::FreeLaunchDeviceMem() { | |||
| input_addr_ = nullptr; | |||
| FreeOutputAndWorkspaceDeviceMem(); | |||
| } | |||
| std::shared_ptr<session::KernelGraph> AscendLaunchAtomicClean::ObtainAtomicCleanKernelGraph() { | |||
| std::vector<TypeId> input_dtypes = {dtype_}; | |||
| std::vector<TypeId> output_dtypes = {}; | |||
| // obtain input & output shapes | |||
| size_t dtype_size = abstract::TypeIdSize(dtype_); | |||
| int64_t shape = total_size_ / dtype_size; | |||
| std::vector<std::vector<int64_t>> input_shapes = {{shape}}; | |||
| std::vector<std::vector<size_t>> output_shapes = {}; | |||
| auto atomic_clean_graph = session::SingleKernelGraph::ConstructKernelGraphBasedOnSingleOp( | |||
| kAtomicAddrCleanOpName, input_dtypes, input_shapes, output_dtypes, output_shapes); | |||
| MS_EXCEPTION_IF_NULL(atomic_clean_graph); | |||
| return atomic_clean_graph; | |||
| } | |||
| void AscendLaunchAtomicClean::ConstructKernelGraphAndSetAttr() { | |||
| // construct atomic clean kernel graph | |||
| atomic_clean_graph_ = ObtainAtomicCleanKernelGraph(); | |||
| MS_EXCEPTION_IF_NULL(atomic_clean_graph_); | |||
| // set atomic clean attr | |||
| if (!atomic_clean_graph_->execution_order().empty()) { | |||
| auto clean_node = atomic_clean_graph_->execution_order()[0]; | |||
| // set abstract | |||
| AbstractBasePtr abstract = std::make_shared<abstract::AbstractNone>(); | |||
| MS_EXCEPTION_IF_NULL(abstract); | |||
| clean_node->set_abstract(abstract); | |||
| // set build info | |||
| auto builder = std::make_shared<kernel::KernelBuildInfo::KernelBuildInfoBuilder>(); | |||
| builder->SetKernelType(KernelType::TBE_KERNEL); | |||
| AnfAlgo::SetSelectKernelBuildInfo(builder->Build(), clean_node.get()); | |||
| // set attr | |||
| std::vector<size_t> clean_size = {total_size_}; | |||
| AnfAlgo::SetNodeAttr(kAttrAtomicAddMemSize, MakeValue(clean_size), clean_node); | |||
| } | |||
| } | |||
| } // namespace mindspore::device::ascend | |||
| @@ -0,0 +1,57 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_ASCEND_LAUNCH_ATOMIC_CLEAN_H_ | |||
| #define MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_ASCEND_LAUNCH_ATOMIC_CLEAN_H_ | |||
| #include <vector> | |||
| #include <memory> | |||
| #include "runtime/device/ascend/ascend_launch_kernel.h" | |||
| namespace mindspore::device::ascend { | |||
| class AscendLaunchAtomicClean : public AscendLaunchKernel { | |||
| public: | |||
| AscendLaunchAtomicClean(void *stream, TypeId dtype, size_t total_size) | |||
| : AscendLaunchKernel(stream), | |||
| dtype_(dtype), | |||
| total_size_(total_size), | |||
| atomic_clean_graph_(nullptr), | |||
| input_addr_(nullptr) {} | |||
| ~AscendLaunchAtomicClean() override = default; | |||
| void SetInputAddr(uint8_t *input_addr) override { input_addr_ = input_addr; } | |||
| void FreeDeviceMem(void *addr) override; | |||
| size_t AlignSizeForLaunchKernel(size_t size) override; | |||
| uint8_t *AllocDeviceMem(size_t size) override; | |||
| void KernelSelect(std::shared_ptr<session::KernelGraph> kernel_graph) override; | |||
| void KernelBuild(std::shared_ptr<session::KernelGraph> kernel_graph) override; | |||
| void LaunchOpKernel() override; | |||
| void FreeLaunchDeviceMem() override; | |||
| protected: | |||
| TypeId dtype_; | |||
| size_t total_size_; | |||
| std::shared_ptr<session::KernelGraph> atomic_clean_graph_; | |||
| uint8_t *input_addr_; | |||
| private: | |||
| std::shared_ptr<session::KernelGraph> ObtainAtomicCleanKernelGraph(); | |||
| void ConstructKernelGraphAndSetAttr(); | |||
| }; | |||
| } // namespace mindspore::device::ascend | |||
| #endif // MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_ASCEND_LAUNCH_ATOMIC_CLEAN_H_ | |||
| @@ -33,6 +33,7 @@ class AscendLaunchKernel : public LaunchKernel { | |||
| void KernelSelect(std::shared_ptr<session::KernelGraph> kernel_graph) override; | |||
| void KernelBuild(std::shared_ptr<session::KernelGraph> kernel_graph) override; | |||
| void SetInputAddr(uint8_t *input_addr) override = 0; | |||
| void LaunchOpKernel() override = 0; | |||
| void FreeLaunchDeviceMem() override = 0; | |||
| }; | |||
| @@ -25,10 +25,11 @@ | |||
| namespace mindspore::device::ascend { | |||
| class AscendLaunchMul : public AscendLaunchKernel, public LaunchMul { | |||
| public: | |||
| AscendLaunchMul(void *stream, TypeId dtype, size_t total_size, uint8_t *input1_addr) | |||
| : AscendLaunchKernel(stream), LaunchMul(dtype, total_size, input1_addr) {} | |||
| AscendLaunchMul(void *stream, TypeId dtype, size_t total_size) | |||
| : AscendLaunchKernel(stream), LaunchMul(dtype, total_size) {} | |||
| ~AscendLaunchMul() override = default; | |||
| void SetInputAddr(uint8_t *input1_addr) override { input1_addr_ = input1_addr; } | |||
| void FreeDeviceMem(void *addr) override; | |||
| size_t AlignSizeForLaunchKernel(size_t size) override; | |||
| uint8_t *AllocDeviceMem(size_t size) override; | |||
| @@ -94,12 +94,16 @@ void Bucket::CalculateMean() { | |||
| if (!grad_mean) { | |||
| return; | |||
| } | |||
| launch_kernel = CreateLaunchKernel(); | |||
| MS_EXCEPTION_IF_NULL(launch_kernel); | |||
| if (launch_mul_ == nullptr) { | |||
| launch_mul_ = CreateLaunchMul(); | |||
| MS_EXCEPTION_IF_NULL(launch_mul_); | |||
| } | |||
| // set mul input1 addr | |||
| launch_mul_->SetInputAddr(ar_output_addr_); | |||
| // launch mean | |||
| launch_kernel->LaunchOpKernel(); | |||
| launch_mul_->LaunchOpKernel(); | |||
| // store output tensor addr | |||
| auto launch_output = launch_kernel->GetKernelOutputAddr(); | |||
| auto launch_output = launch_mul_->GetKernelOutputAddr(); | |||
| if (launch_output.size() != 1) { | |||
| MS_LOG(ERROR) << "launch mul outputs should have one output"; | |||
| } | |||
| @@ -38,7 +38,8 @@ class Bucket { | |||
| compute_stream_(nullptr), | |||
| pre_event_(nullptr), | |||
| post_event_(nullptr), | |||
| launch_kernel(nullptr), | |||
| launch_mul_(nullptr), | |||
| launch_atomic_clean_(nullptr), | |||
| total_size_(0), | |||
| ar_input_addr_(nullptr), | |||
| ar_output_addr_(nullptr) {} | |||
| @@ -60,7 +61,8 @@ class Bucket { | |||
| std::shared_ptr<DeviceEvent> pre_event_; | |||
| std::shared_ptr<DeviceEvent> post_event_; | |||
| std::shared_ptr<LaunchKernel> launch_kernel; | |||
| std::shared_ptr<LaunchKernel> launch_mul_; | |||
| std::shared_ptr<LaunchKernel> launch_atomic_clean_; | |||
| size_t total_size_; | |||
| uint8_t *ar_input_addr_; | |||
| @@ -77,7 +79,7 @@ class Bucket { | |||
| virtual void AllocateAllReduceAddr() = 0; | |||
| void UpdateTensorAddr(); | |||
| void CalculateMean(); | |||
| virtual std::shared_ptr<LaunchKernel> CreateLaunchKernel() = 0; | |||
| virtual std::shared_ptr<LaunchKernel> CreateLaunchMul() = 0; | |||
| virtual void LaunchAllReduce() = 0; | |||
| virtual void FreeAllDeviceMem() = 0; | |||
| virtual void FreeDeviceMem(void *dev_ptr) = 0; | |||
| @@ -92,8 +92,8 @@ void GPUBucket::FreeAllDeviceMem() { | |||
| ar_output_addr_ = nullptr; | |||
| } | |||
| // clear launch mul device memory | |||
| if (launch_kernel != nullptr) { | |||
| launch_kernel->FreeLaunchDeviceMem(); | |||
| if (launch_mul_ != nullptr) { | |||
| launch_mul_->FreeLaunchDeviceMem(); | |||
| } | |||
| MS_LOG(INFO) << "end"; | |||
| } | |||
| @@ -156,11 +156,11 @@ void GPUBucket::LaunchAllReduce() { | |||
| MS_LOG(INFO) << "end"; | |||
| } | |||
| std::shared_ptr<LaunchKernel> GPUBucket::CreateLaunchKernel() { | |||
| std::shared_ptr<LaunchKernel> GPUBucket::CreateLaunchMul() { | |||
| if (tensor_type_list_.empty()) { | |||
| MS_LOG(ERROR) << "tensor_type_list_ is empty"; | |||
| } | |||
| auto launch_mul = std::make_shared<GPULaunchMul>(stream_, tensor_type_list_[0], total_size_, ar_output_addr_); | |||
| auto launch_mul = std::make_shared<GPULaunchMul>(stream_, tensor_type_list_[0], total_size_); | |||
| MS_EXCEPTION_IF_NULL(launch_mul); | |||
| return launch_mul; | |||
| } | |||
| @@ -34,7 +34,7 @@ class GPUBucket : public Bucket { | |||
| void FreeDeviceMem(void *dev_ptr) override; | |||
| void CopyTensorToContiguousMemory() override; | |||
| void LaunchAllReduce() override; | |||
| std::shared_ptr<LaunchKernel> CreateLaunchKernel() override; | |||
| std::shared_ptr<LaunchKernel> CreateLaunchMul() override; | |||
| const void *collective_handle_; | |||
| }; | |||
| } // namespace mindspore::device::gpu | |||
| @@ -33,6 +33,7 @@ class GPULaunchkernel : public LaunchKernel { | |||
| void KernelSelect(std::shared_ptr<session::KernelGraph> kernel_graph) override; | |||
| void KernelBuild(std::shared_ptr<session::KernelGraph> kernel_graph) override; | |||
| void SetInputAddr(uint8_t *input_addr) override = 0; | |||
| void LaunchOpKernel() override = 0; | |||
| void FreeLaunchDeviceMem() override = 0; | |||
| }; | |||
| @@ -25,10 +25,10 @@ | |||
| namespace mindspore::device::gpu { | |||
| class GPULaunchMul : public GPULaunchkernel, public LaunchMul { | |||
| public: | |||
| GPULaunchMul(void *stream, TypeId dtype, size_t total_size, uint8_t *input1_addr) | |||
| : GPULaunchkernel(stream), LaunchMul(dtype, total_size, input1_addr) {} | |||
| GPULaunchMul(void *stream, TypeId dtype, size_t total_size) : GPULaunchkernel(stream), LaunchMul(dtype, total_size) {} | |||
| ~GPULaunchMul() override = default; | |||
| void SetInputAddr(uint8_t *input1_addr) override { input1_addr_ = input1_addr; } | |||
| void FreeDeviceMem(void *addr) override; | |||
| size_t AlignSizeForLaunchKernel(size_t size) override; | |||
| uint8_t *AllocDeviceMem(size_t size) override; | |||
| @@ -83,7 +83,7 @@ void LaunchKernel::LaunchSingleKernel(const std::vector<uint8_t *> &inputs_addr) | |||
| // launch | |||
| auto ret_status = kernel_mod_->Launch(kernel_inputs, kernel_workspaces, kernel_outputs, stream_); | |||
| if (!ret_status) { | |||
| MS_LOG(ERROR) << "Launch mul kernel failed."; | |||
| MS_LOG(ERROR) << "Launch single kernel failed."; | |||
| } | |||
| } | |||
| @@ -37,6 +37,7 @@ class LaunchKernel { | |||
| virtual void KernelSelect(std::shared_ptr<session::KernelGraph> kernel_graph) = 0; | |||
| virtual void KernelBuild(std::shared_ptr<session::KernelGraph> kernel_graph) = 0; | |||
| virtual void SetInputAddr(uint8_t *input_addr) = 0; | |||
| virtual void LaunchOpKernel() = 0; | |||
| virtual void FreeLaunchDeviceMem() = 0; | |||
| @@ -46,7 +47,6 @@ class LaunchKernel { | |||
| std::vector<uint8_t *> outputs_addr_; | |||
| std::vector<uint8_t *> workspaces_addr_; | |||
| private: | |||
| std::vector<kernel::AddressPtr> ObtainKernelAddress(const std::vector<size_t> &list, std::vector<uint8_t *> *addr); | |||
| std::vector<kernel::AddressPtr> ObtainKernelInputs(const std::vector<size_t> &inputs_list, | |||
| const std::vector<uint8_t *> &inputs_addr); | |||
| @@ -24,10 +24,10 @@ | |||
| namespace mindspore::device { | |||
| class LaunchMul { | |||
| public: | |||
| LaunchMul(TypeId dtype, size_t total_size, uint8_t *input1_addr) | |||
| LaunchMul(TypeId dtype, size_t total_size) | |||
| : dtype_(dtype), | |||
| total_size_(total_size), | |||
| input1_addr_(input1_addr), | |||
| input1_addr_(nullptr), | |||
| input2_addr_(nullptr), | |||
| input2_value_(0), | |||
| mul_graph_(nullptr) {} | |||
| @@ -106,6 +106,7 @@ file(GLOB_RECURSE MINDSPORE_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} | |||
| "../../../mindspore/ccsrc/runtime/device/ascend/kernel_select_ascend.cc" | |||
| "../../../mindspore/ccsrc/runtime/device/ascend/ascend_launch_kernel.cc" | |||
| "../../../mindspore/ccsrc/runtime/device/ascend/ascend_launch_mul.cc" | |||
| "../../../mindspore/ccsrc/runtime/device/ascend/ascend_launch_atomic_clean.cc" | |||
| "../../../mindspore/ccsrc/runtime/device/ascend/kernel_select_graph_kernel.cc" | |||
| "../../../mindspore/ccsrc/runtime/device/convert_tensor_utils.cc" | |||
| "../../../mindspore/ccsrc/runtime/device/ascend/ascend_bucket.cc" | |||