From: @lvchangquan
Reviewed-by: @kisnwang, @chujinjin
Signed-off-by: @chujinjin
@@ -27,6 +27,7 @@
 #include "runtime/device/kernel_runtime_manager.h"
 #include "runtime/device/ascend/ascend_event.h"
 #include "runtime/device/ascend/ascend_launch_mul.h"
+#include "runtime/device/ascend/ascend_launch_atomic_clean.h"
 #include "utils/profile.h"
 #define CHECK_ASCEND_RT_WITH_EXCEPTION(expression, message) \
@@ -90,16 +91,18 @@ void AscendBucket::FreeAllDeviceMem() {
     ar_output_addr_ = nullptr;
   }
   // clear launch mul device Memory
-  if (launch_kernel != nullptr) {
-    launch_kernel->FreeLaunchDeviceMem();
+  if (launch_mul_ != nullptr) {
+    launch_mul_->FreeLaunchDeviceMem();
+  }
+  // clear launch atomic clean device Memory
+  if (launch_atomic_clean_ != nullptr) {
+    launch_atomic_clean_->FreeLaunchDeviceMem();
   }
 }

 void AscendBucket::CopyTensorToContiguousMemory() {
-  // Clean input addr
-  CHECK_ASCEND_RT_WITH_EXCEPTION(rtMemsetAsync(ar_input_addr_, total_size_, 0, total_size_, compute_stream_),
-                                 "Call rtMemsetAsync failed");
+  // clear allreduce input addr
+  CleanAllReduceInputAddr();
   for (size_t i = 0; i < bucket_size_; ++i) {
     MS_EXCEPTION_IF_NULL(memcpy_input_addrs_[i]);
     MS_EXCEPTION_IF_NULL(memcpy_output_addrs_[i]);
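For illustration, the hunk above swaps a raw runtime memset for a launched AtomicAddrClean kernel, but the contract is unchanged: the fused allreduce input buffer must be zeroed before the per-tensor copies are issued into it. A host-side stand-in of that contract (illustrative only, not MindSpore code; `std::memset` stands in for the asynchronous device-side clear):

```cpp
#include <cstddef>
#include <cstdint>
#include <cstring>

// Illustrative stand-in for what CleanAllReduceInputAddr() must guarantee on device.
// The old path was rtMemsetAsync(ar_input_addr_, total_size_, 0, total_size_, compute_stream_);
// the new path launches an AtomicAddrClean kernel over the same region instead.
void ZeroFusedInput(uint8_t *ar_input_addr, size_t total_size) {
  std::memset(ar_input_addr, 0, total_size);
}
```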
@@ -151,15 +154,36 @@ void AscendBucket::LaunchAllReduce() {
   }
 }

-std::shared_ptr<LaunchKernel> AscendBucket::CreateLaunchKernel() {
+void AscendBucket::CleanAllReduceInputAddr() {
+  if (launch_atomic_clean_ == nullptr) {
+    launch_atomic_clean_ = CreateLaunchAtomicClean();
+    MS_EXCEPTION_IF_NULL(launch_atomic_clean_);
+  }
+  // set atomic clean input addr
+  launch_atomic_clean_->SetInputAddr(ar_input_addr_);
+  // launch atomic clean
+  launch_atomic_clean_->LaunchOpKernel();
+}
+
+std::shared_ptr<LaunchKernel> AscendBucket::CreateLaunchMul() {
   if (tensor_type_list_.empty()) {
     MS_LOG(ERROR) << "tensor_type_list_ is empty";
   }
-  auto launch_mul = std::make_shared<AscendLaunchMul>(stream_, tensor_type_list_[0], total_size_, ar_output_addr_);
+  auto launch_mul = std::make_shared<AscendLaunchMul>(stream_, tensor_type_list_[0], total_size_);
   MS_EXCEPTION_IF_NULL(launch_mul);
   return launch_mul;
 }

+std::shared_ptr<LaunchKernel> AscendBucket::CreateLaunchAtomicClean() {
+  if (tensor_type_list_.empty()) {
+    MS_LOG(ERROR) << "tensor_type_list_ is empty";
+  }
+  auto launch_atomic_clean =
+    std::make_shared<AscendLaunchAtomicClean>(compute_stream_, tensor_type_list_[0], total_size_);
+  MS_EXCEPTION_IF_NULL(launch_atomic_clean);
+  return launch_atomic_clean;
+}
+
 void AscendBucket::Init() {
   pre_event_ = std::make_shared<AscendEvent>();
   post_event_ = std::make_shared<AscendEvent>();
@@ -34,7 +34,9 @@ class AscendBucket : public Bucket {
   void FreeDeviceMem(void *dev_ptr) override;
   void CopyTensorToContiguousMemory() override;
   void LaunchAllReduce() override;
-  std::shared_ptr<LaunchKernel> CreateLaunchKernel() override;
+  std::shared_ptr<LaunchKernel> CreateLaunchMul() override;
+  std::shared_ptr<LaunchKernel> CreateLaunchAtomicClean();
+  void CleanAllReduceInputAddr();
 };
 } // namespace mindspore::device::ascend
 #endif // MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_ASCEND_BUCKET_H_
@@ -0,0 +1,114 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "runtime/device/ascend/ascend_launch_atomic_clean.h"
+#include <memory>
+#include <vector>
+#include "abstract/utils.h"
+#include "backend/session/single_kernel_graph.h"
+#include "backend/session/anf_runtime_algorithm.h"
+#include "debug/anf_ir_dump.h"
+
+namespace mindspore::device::ascend {
+void AscendLaunchAtomicClean::FreeDeviceMem(void *addr) { AscendLaunchKernel::FreeDeviceMem(addr); }
+
+size_t AscendLaunchAtomicClean::AlignSizeForLaunchKernel(size_t size) {
+  return AscendLaunchKernel::AlignSizeForLaunchKernel(size);
+}
+
+uint8_t *AscendLaunchAtomicClean::AllocDeviceMem(size_t size) { return AscendLaunchKernel::AllocDeviceMem(size); }
+
+void AscendLaunchAtomicClean::KernelSelect(std::shared_ptr<session::KernelGraph> kernel_graph) {
+  AscendLaunchKernel::KernelSelect(kernel_graph);
+}
+
+void AscendLaunchAtomicClean::KernelBuild(std::shared_ptr<session::KernelGraph> kernel_graph) {
+  AscendLaunchKernel::KernelBuild(kernel_graph);
+}
+
+void AscendLaunchAtomicClean::LaunchOpKernel() {
+  if (atomic_clean_graph_ == nullptr) {
+    // construct atomic clean kernel graph and set attr
+    ConstructKernelGraphAndSetAttr();
+    // kernel build
+    KernelBuild(atomic_clean_graph_);
+  }
+  // obtain kernel_mod
+  if (atomic_clean_graph_->execution_order().size() != 1) {
+    MS_LOG(ERROR) << "the execution order of the atomic clean graph should have only one node";
+  }
+  kernel_mod_ = AnfAlgo::GetKernelMod(atomic_clean_graph_->execution_order()[0]);
+  MS_EXCEPTION_IF_NULL(kernel_mod_);
+  // obtain kernel inputs
+  std::vector<kernel::AddressPtr> kernel_inputs;
+  auto input = std::make_shared<kernel::Address>();
+  MS_EXCEPTION_IF_NULL(input);
+  input->addr = input_addr_;
+  MS_EXCEPTION_IF_NULL(input->addr);
+  input->size = total_size_;
+  kernel_inputs.push_back(input);
+  // obtain kernel outputs
+  auto kernel_outputs = ObtainKernelOutputs(kernel_mod_->GetOutputSizeList());
+  // obtain kernel workspace
+  auto kernel_workspaces = ObtainKernelWorkspaces(kernel_mod_->GetWorkspaceSizeList());
+  // launch
+  auto ret_status = kernel_mod_->Launch(kernel_inputs, kernel_workspaces, kernel_outputs, stream_);
+  if (!ret_status) {
+    MS_LOG(ERROR) << "Launch single kernel failed.";
+  }
+}
+
+void AscendLaunchAtomicClean::FreeLaunchDeviceMem() {
+  input_addr_ = nullptr;
+  FreeOutputAndWorkspaceDeviceMem();
+}
+
+std::shared_ptr<session::KernelGraph> AscendLaunchAtomicClean::ObtainAtomicCleanKernelGraph() {
+  std::vector<TypeId> input_dtypes = {dtype_};
+  std::vector<TypeId> output_dtypes = {};
+  // obtain input & output shapes
+  size_t dtype_size = abstract::TypeIdSize(dtype_);
+  int64_t shape = total_size_ / dtype_size;
+  std::vector<std::vector<int64_t>> input_shapes = {{shape}};
+  std::vector<std::vector<size_t>> output_shapes = {};
+  auto atomic_clean_graph = session::SingleKernelGraph::ConstructKernelGraphBasedOnSingleOp(
+    kAtomicAddrCleanOpName, input_dtypes, input_shapes, output_dtypes, output_shapes);
+  MS_EXCEPTION_IF_NULL(atomic_clean_graph);
+  return atomic_clean_graph;
+}
+
+void AscendLaunchAtomicClean::ConstructKernelGraphAndSetAttr() {
+  // construct atomic clean kernel graph
+  atomic_clean_graph_ = ObtainAtomicCleanKernelGraph();
+  MS_EXCEPTION_IF_NULL(atomic_clean_graph_);
+  // set atomic clean attr
+  if (!atomic_clean_graph_->execution_order().empty()) {
+    auto clean_node = atomic_clean_graph_->execution_order()[0];
+    // set abstract
+    AbstractBasePtr abstract = std::make_shared<abstract::AbstractNone>();
+    MS_EXCEPTION_IF_NULL(abstract);
+    clean_node->set_abstract(abstract);
+    // set build info
+    auto builder = std::make_shared<kernel::KernelBuildInfo::KernelBuildInfoBuilder>();
+    builder->SetKernelType(KernelType::TBE_KERNEL);
+    AnfAlgo::SetSelectKernelBuildInfo(builder->Build(), clean_node.get());
+    // set attr
+    std::vector<size_t> clean_size = {total_size_};
+    AnfAlgo::SetNodeAttr(kAttrAtomicAddMemSize, MakeValue(clean_size), clean_node);
+  }
+}
+} // namespace mindspore::device::ascend
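The shape handling in ObtainAtomicCleanKernelGraph is easier to see with concrete numbers. The values below (a 4096-byte float16 buffer, so TypeIdSize returns 2) are assumptions for illustration only:

```cpp
#include <cstdint>
#include <vector>

int main() {
  // Assumed values: total_size_ = 4096 bytes of float16 data, hence element size 2.
  const size_t total_size = 4096;
  const size_t dtype_size = 2;
  // The single 1-D input of the AtomicAddrClean graph covers total_size / dtype_size elements ...
  const int64_t shape = static_cast<int64_t>(total_size / dtype_size);  // 2048
  const std::vector<std::vector<int64_t>> input_shapes = {{shape}};
  // ... while kAttrAtomicAddMemSize carries the byte count, so the whole fused buffer is cleared.
  const std::vector<size_t> clean_size = {total_size};                  // {4096}
  return static_cast<int>(input_shapes[0][0] != 2048 || clean_size[0] != 4096);
}
```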
@@ -0,0 +1,57 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_ASCEND_LAUNCH_ATOMIC_CLEAN_H_
+#define MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_ASCEND_LAUNCH_ATOMIC_CLEAN_H_
+
+#include <vector>
+#include <memory>
+#include "runtime/device/ascend/ascend_launch_kernel.h"
+
+namespace mindspore::device::ascend {
+class AscendLaunchAtomicClean : public AscendLaunchKernel {
+ public:
+  AscendLaunchAtomicClean(void *stream, TypeId dtype, size_t total_size)
+      : AscendLaunchKernel(stream),
+        dtype_(dtype),
+        total_size_(total_size),
+        atomic_clean_graph_(nullptr),
+        input_addr_(nullptr) {}
+
+  ~AscendLaunchAtomicClean() override = default;
+
+  void SetInputAddr(uint8_t *input_addr) override { input_addr_ = input_addr; }
+  void FreeDeviceMem(void *addr) override;
+  size_t AlignSizeForLaunchKernel(size_t size) override;
+  uint8_t *AllocDeviceMem(size_t size) override;
+  void KernelSelect(std::shared_ptr<session::KernelGraph> kernel_graph) override;
+  void KernelBuild(std::shared_ptr<session::KernelGraph> kernel_graph) override;
+  void LaunchOpKernel() override;
+  void FreeLaunchDeviceMem() override;
+
+ protected:
+  TypeId dtype_;
+  size_t total_size_;
+  std::shared_ptr<session::KernelGraph> atomic_clean_graph_;
+  uint8_t *input_addr_;
+
+ private:
+  std::shared_ptr<session::KernelGraph> ObtainAtomicCleanKernelGraph();
+  void ConstructKernelGraphAndSetAttr();
+};
+} // namespace mindspore::device::ascend
+#endif // MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_ASCEND_LAUNCH_ATOMIC_CLEAN_H_
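A minimal driving sketch of the new class, mirroring how AscendBucket::CleanAllReduceInputAddr uses it in this change; `compute_stream`, `ar_input_addr` and `total_size` are placeholders, and `kNumberTypeFloat32` is only an example dtype:

```cpp
#include <memory>
#include "runtime/device/ascend/ascend_launch_atomic_clean.h"

using mindspore::device::ascend::AscendLaunchAtomicClean;

void CleanFusedInput(void *compute_stream, uint8_t *ar_input_addr, size_t total_size) {
  auto clean = std::make_shared<AscendLaunchAtomicClean>(compute_stream, mindspore::kNumberTypeFloat32, total_size);
  clean->SetInputAddr(ar_input_addr);  // point the kernel at the fused allreduce input buffer
  clean->LaunchOpKernel();             // builds the single-op graph on first call, then launches
  clean->FreeLaunchDeviceMem();        // resets input_addr_ and frees output/workspace memory
}
```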
@@ -33,6 +33,7 @@ class AscendLaunchKernel : public LaunchKernel {
   void KernelSelect(std::shared_ptr<session::KernelGraph> kernel_graph) override;
   void KernelBuild(std::shared_ptr<session::KernelGraph> kernel_graph) override;
+  void SetInputAddr(uint8_t *input_addr) override = 0;
   void LaunchOpKernel() override = 0;
   void FreeLaunchDeviceMem() override = 0;
 };
@@ -25,10 +25,11 @@
 namespace mindspore::device::ascend {
 class AscendLaunchMul : public AscendLaunchKernel, public LaunchMul {
  public:
-  AscendLaunchMul(void *stream, TypeId dtype, size_t total_size, uint8_t *input1_addr)
-      : AscendLaunchKernel(stream), LaunchMul(dtype, total_size, input1_addr) {}
+  AscendLaunchMul(void *stream, TypeId dtype, size_t total_size)
+      : AscendLaunchKernel(stream), LaunchMul(dtype, total_size) {}
   ~AscendLaunchMul() override = default;
+  void SetInputAddr(uint8_t *input1_addr) override { input1_addr_ = input1_addr; }
   void FreeDeviceMem(void *addr) override;
   size_t AlignSizeForLaunchKernel(size_t size) override;
   uint8_t *AllocDeviceMem(size_t size) override;
@@ -94,12 +94,16 @@ void Bucket::CalculateMean() {
   if (!grad_mean) {
     return;
   }
-  launch_kernel = CreateLaunchKernel();
-  MS_EXCEPTION_IF_NULL(launch_kernel);
+  if (launch_mul_ == nullptr) {
+    launch_mul_ = CreateLaunchMul();
+    MS_EXCEPTION_IF_NULL(launch_mul_);
+  }
+  // set mul input1 addr
+  launch_mul_->SetInputAddr(ar_output_addr_);
   // launch mean
-  launch_kernel->LaunchOpKernel();
+  launch_mul_->LaunchOpKernel();
   // store output tensor addr
-  auto launch_output = launch_kernel->GetKernelOutputAddr();
+  auto launch_output = launch_mul_->GetKernelOutputAddr();
   if (launch_output.size() != 1) {
     MS_LOG(ERROR) << "launch mul outputs should have one output";
   }
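The hunk above caches the mul launcher instead of recreating it on every CalculateMean() call, and re-binds its first input before each launch. A self-contained toy sketch of that pattern follows; FakeLauncher and FakeBucket are invented names, and the stated reason for re-binding (that ar_output_addr_ may point at different device memory each step) is an inference, not something the PR states:

```cpp
#include <cstdint>
#include <memory>

// Toy model of the "create once, re-bind input per launch" pattern.
struct FakeLauncher {
  uint8_t *input = nullptr;
  void SetInputAddr(uint8_t *addr) { input = addr; }
  void LaunchOpKernel() { /* would launch the mul kernel against `input` */ }
};

struct FakeBucket {
  std::shared_ptr<FakeLauncher> launch_mul_;
  void CalculateMean(uint8_t *ar_output_addr) {
    if (launch_mul_ == nullptr) {
      launch_mul_ = std::make_shared<FakeLauncher>();  // built on first use only
    }
    launch_mul_->SetInputAddr(ar_output_addr);  // refreshed on every call
    launch_mul_->LaunchOpKernel();
  }
};
```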
@@ -38,7 +38,8 @@
         compute_stream_(nullptr),
         pre_event_(nullptr),
         post_event_(nullptr),
-        launch_kernel(nullptr),
+        launch_mul_(nullptr),
+        launch_atomic_clean_(nullptr),
         total_size_(0),
         ar_input_addr_(nullptr),
         ar_output_addr_(nullptr) {}
@@ -60,7 +61,8 @@
   std::shared_ptr<DeviceEvent> pre_event_;
   std::shared_ptr<DeviceEvent> post_event_;
-  std::shared_ptr<LaunchKernel> launch_kernel;
+  std::shared_ptr<LaunchKernel> launch_mul_;
+  std::shared_ptr<LaunchKernel> launch_atomic_clean_;
   size_t total_size_;
   uint8_t *ar_input_addr_;
@@ -77,7 +79,7 @@
   virtual void AllocateAllReduceAddr() = 0;
   void UpdateTensorAddr();
   void CalculateMean();
-  virtual std::shared_ptr<LaunchKernel> CreateLaunchKernel() = 0;
+  virtual std::shared_ptr<LaunchKernel> CreateLaunchMul() = 0;
   virtual void LaunchAllReduce() = 0;
   virtual void FreeAllDeviceMem() = 0;
   virtual void FreeDeviceMem(void *dev_ptr) = 0;
@@ -92,8 +92,8 @@ void GPUBucket::FreeAllDeviceMem() {
     ar_output_addr_ = nullptr;
   }
   // clear launch mul device memory
-  if (launch_kernel != nullptr) {
-    launch_kernel->FreeLaunchDeviceMem();
+  if (launch_mul_ != nullptr) {
+    launch_mul_->FreeLaunchDeviceMem();
   }
   MS_LOG(INFO) << "end";
 }
@@ -156,11 +156,11 @@ void GPUBucket::LaunchAllReduce() {
   MS_LOG(INFO) << "end";
 }

-std::shared_ptr<LaunchKernel> GPUBucket::CreateLaunchKernel() {
+std::shared_ptr<LaunchKernel> GPUBucket::CreateLaunchMul() {
   if (tensor_type_list_.empty()) {
     MS_LOG(ERROR) << "tensor_type_list_ is empty";
   }
-  auto launch_mul = std::make_shared<GPULaunchMul>(stream_, tensor_type_list_[0], total_size_, ar_output_addr_);
+  auto launch_mul = std::make_shared<GPULaunchMul>(stream_, tensor_type_list_[0], total_size_);
   MS_EXCEPTION_IF_NULL(launch_mul);
   return launch_mul;
 }
@@ -34,7 +34,7 @@ class GPUBucket : public Bucket {
   void FreeDeviceMem(void *dev_ptr) override;
   void CopyTensorToContiguousMemory() override;
   void LaunchAllReduce() override;
-  std::shared_ptr<LaunchKernel> CreateLaunchKernel() override;
+  std::shared_ptr<LaunchKernel> CreateLaunchMul() override;
   const void *collective_handle_;
 };
 } // namespace mindspore::device::gpu
@@ -33,6 +33,7 @@ class GPULaunchkernel : public LaunchKernel {
   void KernelSelect(std::shared_ptr<session::KernelGraph> kernel_graph) override;
   void KernelBuild(std::shared_ptr<session::KernelGraph> kernel_graph) override;
+  void SetInputAddr(uint8_t *input_addr) override = 0;
   void LaunchOpKernel() override = 0;
   void FreeLaunchDeviceMem() override = 0;
 };
@@ -25,10 +25,10 @@
 namespace mindspore::device::gpu {
 class GPULaunchMul : public GPULaunchkernel, public LaunchMul {
  public:
-  GPULaunchMul(void *stream, TypeId dtype, size_t total_size, uint8_t *input1_addr)
-      : GPULaunchkernel(stream), LaunchMul(dtype, total_size, input1_addr) {}
+  GPULaunchMul(void *stream, TypeId dtype, size_t total_size) : GPULaunchkernel(stream), LaunchMul(dtype, total_size) {}
   ~GPULaunchMul() override = default;
+  void SetInputAddr(uint8_t *input1_addr) override { input1_addr_ = input1_addr; }
   void FreeDeviceMem(void *addr) override;
   size_t AlignSizeForLaunchKernel(size_t size) override;
   uint8_t *AllocDeviceMem(size_t size) override;
@@ -83,7 +83,7 @@ void LaunchKernel::LaunchSingleKernel(const std::vector<uint8_t *> &inputs_addr)
   // launch
   auto ret_status = kernel_mod_->Launch(kernel_inputs, kernel_workspaces, kernel_outputs, stream_);
   if (!ret_status) {
-    MS_LOG(ERROR) << "Launch mul kernel failed.";
+    MS_LOG(ERROR) << "Launch single kernel failed.";
   }
 }
@@ -37,6 +37,7 @@ class LaunchKernel {
   virtual void KernelSelect(std::shared_ptr<session::KernelGraph> kernel_graph) = 0;
   virtual void KernelBuild(std::shared_ptr<session::KernelGraph> kernel_graph) = 0;
+  virtual void SetInputAddr(uint8_t *input_addr) = 0;
   virtual void LaunchOpKernel() = 0;
   virtual void FreeLaunchDeviceMem() = 0;
@@ -46,7 +47,6 @@ class LaunchKernel {
   std::vector<uint8_t *> outputs_addr_;
   std::vector<uint8_t *> workspaces_addr_;

- private:
   std::vector<kernel::AddressPtr> ObtainKernelAddress(const std::vector<size_t> &list, std::vector<uint8_t *> *addr);
   std::vector<kernel::AddressPtr> ObtainKernelInputs(const std::vector<size_t> &inputs_list,
                                                      const std::vector<uint8_t *> &inputs_addr);
@@ -24,10 +24,10 @@
 namespace mindspore::device {
 class LaunchMul {
  public:
-  LaunchMul(TypeId dtype, size_t total_size, uint8_t *input1_addr)
+  LaunchMul(TypeId dtype, size_t total_size)
       : dtype_(dtype),
         total_size_(total_size),
-        input1_addr_(input1_addr),
+        input1_addr_(nullptr),
         input2_addr_(nullptr),
         input2_value_(0),
         mul_graph_(nullptr) {}
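Call-site impact of the constructor change above, sketched for the Ascend variant; the stream, buffer address, and size are placeholders, and `kNumberTypeFloat32` is only an example dtype:

```cpp
#include <memory>
#include "runtime/device/ascend/ascend_launch_mul.h"

using mindspore::device::ascend::AscendLaunchMul;

void LaunchMean(void *stream, uint8_t *ar_output_addr, size_t total_size) {
  // Old signature: AscendLaunchMul(stream, dtype, total_size, ar_output_addr);
  // New signature: the first input is bound late, through SetInputAddr().
  auto mul = std::make_shared<AscendLaunchMul>(stream, mindspore::kNumberTypeFloat32, total_size);
  mul->SetInputAddr(ar_output_addr);  // was previously the 4th constructor argument
  mul->LaunchOpKernel();
}
```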
@@ -106,6 +106,7 @@ file(GLOB_RECURSE MINDSPORE_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
     "../../../mindspore/ccsrc/runtime/device/ascend/kernel_select_ascend.cc"
     "../../../mindspore/ccsrc/runtime/device/ascend/ascend_launch_kernel.cc"
     "../../../mindspore/ccsrc/runtime/device/ascend/ascend_launch_mul.cc"
+    "../../../mindspore/ccsrc/runtime/device/ascend/ascend_launch_atomic_clean.cc"
     "../../../mindspore/ccsrc/runtime/device/ascend/kernel_select_graph_kernel.cc"
     "../../../mindspore/ccsrc/runtime/device/convert_tensor_utils.cc"
     "../../../mindspore/ccsrc/runtime/device/ascend/ascend_bucket.cc"