Merge pull request !5667 from zyli2020/code_refactortags/v1.0.0
| @@ -96,9 +96,15 @@ class ActivationGradGpuKernel : public GpuKernel { | |||
| const int split_dim = 4; | |||
| if (input_shape.size() <= split_dim) { | |||
| ShapeNdTo4d(input_shape, &shape); | |||
| CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensor4dDescriptor(data_descriptor_, CUDNN_TENSOR_NCHW, cudnn_data_type_, | |||
| shape[0], shape[1], shape[2], shape[3]), | |||
| "SetTensor4dDescriptor failed"); | |||
| if (AnfAlgo::GetInputFormat(kernel_node, 0) == kOpFormat_NHWC) { | |||
| CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensor4dDescriptor(data_descriptor_, CUDNN_TENSOR_NHWC, cudnn_data_type_, | |||
| shape[0], shape[3], shape[1], shape[2]), | |||
| "cudnnSetTensor4dDescriptor failed"); | |||
| } else { | |||
| CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensor4dDescriptor(data_descriptor_, CUDNN_TENSOR_NCHW, cudnn_data_type_, | |||
| shape[0], shape[1], shape[2], shape[3]), | |||
| "cudnnSetTensor4dDescriptor failed"); | |||
| } | |||
| } else { | |||
| CudnnSetTensorNdDescriptor(input_shape, data_descriptor_, cudnn_data_type_); | |||
| } | |||
| @@ -2,7 +2,6 @@ file(GLOB_RECURSE _PREACTIVATE_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} | |||
| "common/*.cc" | |||
| "mem_reuse/*.cc" | |||
| "pass/*.cc" | |||
| "gpu/*.cc" | |||
| ) | |||
| if (ENABLE_D) | |||
| @@ -10,5 +9,10 @@ if (ENABLE_D) | |||
| list(APPEND _PREACTIVATE_SRC_LIST ${_D_SRC_LIST}) | |||
| endif () | |||
| if (ENABLE_GPU) | |||
| file(GLOB_RECURSE _GPU_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "gpu/*.cc") | |||
| list(APPEND _PREACTIVATE_SRC_LIST ${_GPU_SRC_LIST}) | |||
| endif () | |||
| set_property(SOURCE ${_PREACTIVATE_SRC_LIST} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_PRE_ACT) | |||
| add_library(_mindspore_backend_optimizer_obj OBJECT ${_PREACTIVATE_SRC_LIST}) | |||
| @@ -23,6 +23,7 @@ | |||
| #include "ir/primitive.h" | |||
| #include "utils/utils.h" | |||
| #include "backend/optimizer/common/helper.h" | |||
| #include "runtime/device/gpu/kernel_info_setter.h" | |||
| namespace mindspore { | |||
| namespace opt { | |||
| @@ -46,7 +47,7 @@ const AnfNodePtr BatchNormAddReluFusion::Process(const FuncGraphPtr &graph, cons | |||
| auto batch_norm_ex = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(tuple_get_item), 0); | |||
| MS_EXCEPTION_IF_NULL(batch_norm_ex); | |||
| if (AnfAlgo::GetOutputInferDataType(batch_norm_ex, 0) != kNumberTypeFloat16) { | |||
| if (AnfAlgo::GetInputFormat(batch_norm_ex, 0) != kOpFormat_NHWC) { | |||
| return nullptr; | |||
| } | |||
| @@ -83,6 +84,7 @@ const AnfNodePtr BatchNormAddReluFusion::Process(const FuncGraphPtr &graph, cons | |||
| auto manager = graph->manager(); | |||
| MS_EXCEPTION_IF_NULL(manager); | |||
| manager->Replace(batch_norm_ex, fused_batch_norm_with_add_relu); | |||
| device::gpu::SetKernelInfo(fused_batch_norm_with_add_relu); | |||
| return tuple_get_item; | |||
| } | |||
| } // namespace opt | |||
| @@ -24,6 +24,7 @@ | |||
| #include "ir/primitive.h" | |||
| #include "utils/utils.h" | |||
| #include "backend/optimizer/common/helper.h" | |||
| #include "runtime/device/gpu/kernel_info_setter.h" | |||
| namespace mindspore { | |||
| namespace opt { | |||
| @@ -123,7 +124,8 @@ const AnfNodePtr BatchNormAddReluGradFusion::Process(const FuncGraphPtr &graph, | |||
| const EquivPtr &) const { | |||
| MS_EXCEPTION_IF_NULL(graph); | |||
| MS_EXCEPTION_IF_NULL(node); | |||
| if (AnfAlgo::GetOutputInferDataType(node, 0) != kNumberTypeFloat16) { | |||
| if (AnfAlgo::GetInputFormat(node, 0) != kOpFormat_NHWC) { | |||
| return nullptr; | |||
| } | |||
| @@ -169,6 +171,7 @@ const AnfNodePtr BatchNormAddReluGradFusion::Process(const FuncGraphPtr &graph, | |||
| AnfAlgo::CopyNodeAttrs(node, fused_batch_norm_add_relu_grad); | |||
| SetShapeAndType(fused_batch_norm_add_relu_grad, node, relu_grad); | |||
| ReplaceOutput(graph, node, relu_grad, fused_batch_norm_add_relu_grad); | |||
| device::gpu::SetKernelInfo(fused_batch_norm_add_relu_grad); | |||
| return nullptr; | |||
| } | |||
| } // namespace opt | |||
| @@ -23,6 +23,7 @@ | |||
| #include "ir/primitive.h" | |||
| #include "utils/utils.h" | |||
| #include "backend/optimizer/common/helper.h" | |||
| #include "runtime/device/gpu/kernel_info_setter.h" | |||
| namespace mindspore { | |||
| namespace opt { | |||
| @@ -43,7 +44,7 @@ const AnfNodePtr BatchNormReluFusion::Process(const FuncGraphPtr &graph, const A | |||
| auto batch_norm_ex = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(tuple_get_item), 0); | |||
| MS_EXCEPTION_IF_NULL(batch_norm_ex); | |||
| if (AnfAlgo::GetOutputInferDataType(batch_norm_ex, 0) != kNumberTypeFloat16) { | |||
| if (AnfAlgo::GetInputFormat(batch_norm_ex, 0) != kOpFormat_NHWC) { | |||
| return nullptr; | |||
| } | |||
| @@ -78,6 +79,7 @@ const AnfNodePtr BatchNormReluFusion::Process(const FuncGraphPtr &graph, const A | |||
| auto manager = graph->manager(); | |||
| MS_EXCEPTION_IF_NULL(manager); | |||
| manager->Replace(batch_norm_ex, fused_batch_norm_with_relu); | |||
| device::gpu::SetKernelInfo(fused_batch_norm_with_relu); | |||
| return tuple_get_item; | |||
| } | |||
| } // namespace opt | |||
| @@ -23,6 +23,7 @@ | |||
| #include "ir/primitive.h" | |||
| #include "utils/utils.h" | |||
| #include "backend/optimizer/common/helper.h" | |||
| #include "runtime/device/gpu/kernel_info_setter.h" | |||
| namespace mindspore { | |||
| namespace opt { | |||
| @@ -38,7 +39,7 @@ const AnfNodePtr BatchNormReluGradFusion::Process(const FuncGraphPtr &graph, con | |||
| MS_EXCEPTION_IF_NULL(graph); | |||
| MS_EXCEPTION_IF_NULL(node); | |||
| if (AnfAlgo::GetOutputInferDataType(node, 0) != kNumberTypeFloat16) { | |||
| if (AnfAlgo::GetInputFormat(node, 0) != kOpFormat_NHWC) { | |||
| return nullptr; | |||
| } | |||
| @@ -84,6 +85,7 @@ const AnfNodePtr BatchNormReluGradFusion::Process(const FuncGraphPtr &graph, con | |||
| } | |||
| AnfAlgo::SetOutputInferTypeAndShape(outputs_type, outputs_shape, fused_batch_norm_grad_with_relu.get()); | |||
| AnfAlgo::CopyNodeAttrs(node, fused_batch_norm_grad_with_relu); | |||
| device::gpu::SetKernelInfo(fused_batch_norm_grad_with_relu); | |||
| return fused_batch_norm_grad_with_relu; | |||
| } | |||
| } // namespace opt | |||
| @@ -53,10 +53,10 @@ using AnfAlgo = mindspore::session::AnfRuntimeAlgorithm; | |||
| void GPUSession::SelectKernel(const std::shared_ptr<KernelGraph> &kernel_graph) const { | |||
| MS_EXCEPTION_IF_NULL(kernel_graph); | |||
| bool graph_format_transform = IsSupportFormatTransform(kernel_graph); | |||
| device::gpu::FormatTransformChecker::GetInstance().CheckSupportFormatTransform(kernel_graph); | |||
| for (const auto &kernel_node : kernel_graph->execution_order()) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| device::gpu::SetKernelInfo(kernel_node, graph_format_transform); | |||
| device::gpu::SetKernelInfo(kernel_node); | |||
| } | |||
| } | |||
| @@ -82,12 +82,6 @@ void GPUSession::Optimize(const std::shared_ptr<KernelGraph> &kernel_graph) { | |||
| pm->AddPass(std::make_shared<opt::ReplaceBNGradCastFusion>()); | |||
| pm->AddPass(std::make_shared<opt::ReplaceMomentumCastFusion>()); | |||
| pm->AddPass(std::make_shared<opt::ReplaceAddNFusion>()); | |||
| if (IsSupportFormatTransform(kernel_graph) && context_ptr->get_param<int>(MS_CTX_EXECUTION_MODE) != kPynativeMode) { | |||
| pm->AddPass(std::make_shared<opt::BatchNormReluFusion>()); | |||
| pm->AddPass(std::make_shared<opt::BatchNormReluGradFusion>()); | |||
| pm->AddPass(std::make_shared<opt::BatchNormAddReluFusion>()); | |||
| // pm->AddPass(std::make_shared<opt::BatchNormAddReluGradFusion>()); | |||
| } | |||
| optimizer->AddPassManager(pm); | |||
| (void)optimizer->Optimize(kernel_graph); | |||
| kernel_graph->SetExecOrderByDefault(); | |||
| @@ -96,6 +90,10 @@ void GPUSession::Optimize(const std::shared_ptr<KernelGraph> &kernel_graph) { | |||
| void GPUSession::HardwareOptimize(const std::shared_ptr<KernelGraph> &kernel_graph) { | |||
| auto optimizer = std::make_shared<opt::GraphOptimizer>(); | |||
| auto pm = std::make_shared<opt::PassManager>(); | |||
| pm->AddPass(std::make_shared<opt::BatchNormReluFusion>()); | |||
| pm->AddPass(std::make_shared<opt::BatchNormReluGradFusion>()); | |||
| pm->AddPass(std::make_shared<opt::BatchNormAddReluFusion>()); | |||
| // pm->AddPass(std::make_shared<opt::BatchNormAddReluGradFusion>()); | |||
| pm->AddPass(std::make_shared<opt::InsertFormatTransformOp>()); | |||
| pm->AddPass(std::make_shared<opt::RemoveFormatTransformPair>()); | |||
| pm->AddPass(std::make_shared<opt::RemoveRedundantFormatTransform>()); | |||
| @@ -201,28 +199,6 @@ void GPUSession::Execute(const std::shared_ptr<KernelGraph> &kernel_graph) const | |||
| } | |||
| } | |||
| bool GPUSession::IsSupportFormatTransform(const std::shared_ptr<KernelGraph> &kernel_graph) const { | |||
| auto kernels = kernel_graph->execution_order(); | |||
| size_t conv_cnt = 0; | |||
| size_t bn_cnt = 0; | |||
| for (const auto &kernel : kernels) { | |||
| auto kernel_name = AnfAlgo::GetCNodeName(kernel); | |||
| if (kernel_name == prim::kPrimLayerNorm->name()) { | |||
| return false; | |||
| } | |||
| if (kernel_name == prim::kPrimConv2D->name()) { | |||
| conv_cnt++; | |||
| } | |||
| if (kernel_name == prim::kPrimFusedBatchNormEx->name()) { | |||
| bn_cnt++; | |||
| } | |||
| } | |||
| if (conv_cnt == kConv2dCount && bn_cnt == kFusedBatchNormCount) { | |||
| return false; | |||
| } | |||
| return true; | |||
| } | |||
| GraphId GPUSession::CompileGraph(const AnfNodePtrList &lst, const AnfNodePtrList &outputs) { | |||
| // Construct graph, if successfully, graph_sum_ + 1 | |||
| auto graph_id = graph_sum_; | |||
| @@ -232,26 +208,27 @@ GraphId GPUSession::CompileGraph(const AnfNodePtrList &lst, const AnfNodePtrList | |||
| auto context_ptr = MsContext::GetInstance(); | |||
| MS_EXCEPTION_IF_NULL(context_ptr); | |||
| bool save_graphs = context_ptr->get_param<bool>(MS_CTX_SAVE_GRAPHS_FLAG); | |||
| // Optimize | |||
| // Dump .pb graph before graph optimization | |||
| if (save_graphs) { | |||
| DumpIRProto(graph, "before_opt_" + std::to_string(graph_id)); | |||
| } | |||
| // Graph optimization irrelevant to device data format | |||
| Optimize(graph); | |||
| // Select kernel build info | |||
| SelectKernel(graph); | |||
| // Graph optimization relevant to device data format | |||
| HardwareOptimize(graph); | |||
| // Dump .pb graph after graph optimization | |||
| if (save_graphs) { | |||
| DumpIRProto(graph, "after_opt_" + std::to_string(graph_id)); | |||
| } | |||
| #if (ENABLE_CPU && (ENABLE_D || ENABLE_GPU)) | |||
| // Assign parameter keys. | |||
| AssignParamKey(graph); | |||
| #endif | |||
| // Start gpu kernel runtime | |||
| StartKernelRT(); | |||
| // Dump .pb graph before hardware optimization | |||
| if (save_graphs) { | |||
| DumpIRProto(graph, "before_hwopt_" + std::to_string(graph_id)); | |||
| } | |||
| // HardwareOptimize | |||
| HardwareOptimize(graph); | |||
| // Dump .pb graph after hardware optimization | |||
| if (save_graphs) { | |||
| DumpIRProto(graph, "after_hwopt_" + std::to_string(graph_id)); | |||
| } | |||
| // Assign CUDA streams | |||
| AssignStream(graph); | |||
| // Hide NopOp from execution graph | |||
| @@ -67,8 +67,6 @@ class GPUSession : public SessionBasic { | |||
| void Execute(const std::shared_ptr<KernelGraph> &kernel_graph) const; | |||
| bool IsSupportFormatTransform(const std::shared_ptr<KernelGraph> &kernel_graph) const; | |||
| #ifdef ENABLE_DEBUGGER | |||
| void Dump(const std::shared_ptr<KernelGraph> &kernel_graph) const; | |||
| @@ -82,9 +80,6 @@ class GPUSession : public SessionBasic { | |||
| void PostLoadTensor(const std::shared_ptr<KernelGraph> &kernel_graph) const; | |||
| #endif | |||
| static constexpr size_t kConv2dCount = 96; | |||
| static constexpr size_t kFusedBatchNormCount = 94; | |||
| }; | |||
| using GPUSessionPtr = std::shared_ptr<GPUSession>; | |||
| MS_REG_SESSION(kGPUDevice, GPUSession); | |||
| @@ -47,7 +47,7 @@ bool CudaEnvChecker::CheckNvccInPath() { | |||
| }; | |||
| auto cuda_paths = GetCudaRealPaths(); | |||
| find_nvcc_ = any_of(cuda_paths.begin(), cuda_paths.end(), checker); | |||
| find_nvcc_ = std::any_of(cuda_paths.begin(), cuda_paths.end(), checker); | |||
| already_check_nvcc_ = true; | |||
| return find_nvcc_; | |||
| } | |||
| @@ -165,6 +165,9 @@ bool IsNeedProcessFormatInfo(const CNodePtr &kernel_node, const std::vector<Type | |||
| if (ms_context->get_param<int>(MS_CTX_EXECUTION_MODE) == kPynativeMode) { | |||
| return false; | |||
| } | |||
| if (!FormatTransformChecker::GetInstance().format_transform()) { | |||
| return false; | |||
| } | |||
| if (!AnfAlgo::IsRealCNodeKernel(kernel_node)) { | |||
| return false; | |||
| } | |||
| @@ -232,7 +235,31 @@ void UpdateKernelFormatInfo(const CNodePtr &kernel_node, const std::vector<TypeI | |||
| } | |||
| } // namespace | |||
| void SetKernelInfo(const CNodePtr &kernel_node, bool graph_format_transform) { | |||
| void FormatTransformChecker::CheckSupportFormatTransform(const std::shared_ptr<session::KernelGraph> &kernel_graph) { | |||
| auto kernels = kernel_graph->execution_order(); | |||
| size_t conv_cnt = 0; | |||
| size_t bn_cnt = 0; | |||
| for (const auto &kernel : kernels) { | |||
| auto kernel_name = AnfAlgo::GetCNodeName(kernel); | |||
| if (kernel_name == prim::kPrimLayerNorm->name()) { | |||
| format_transform_ = false; | |||
| return; | |||
| } | |||
| if (kernel_name == prim::kPrimConv2D->name()) { | |||
| conv_cnt++; | |||
| } | |||
| if (kernel_name == prim::kPrimFusedBatchNormEx->name()) { | |||
| bn_cnt++; | |||
| } | |||
| } | |||
| if (conv_cnt == kConv2dCount && bn_cnt == kFusedBatchNormCount) { | |||
| format_transform_ = false; | |||
| return; | |||
| } | |||
| format_transform_ = true; | |||
| } | |||
| void SetKernelInfo(const CNodePtr &kernel_node) { | |||
| std::vector<std::string> inputs_format; | |||
| std::vector<TypeId> inputs_type; | |||
| for (size_t input_index = 0; input_index < AnfAlgo::GetInputTensorNum(kernel_node); ++input_index) { | |||
| @@ -246,7 +273,7 @@ void SetKernelInfo(const CNodePtr &kernel_node, bool graph_format_transform) { | |||
| outputs_type.push_back(AnfAlgo::GetOutputInferDataType(kernel_node, output_index)); | |||
| } | |||
| std::string origin_data_format = kOpFormat_DEFAULT; | |||
| if (graph_format_transform && IsNeedProcessFormatInfo(kernel_node, inputs_type)) { | |||
| if (IsNeedProcessFormatInfo(kernel_node, inputs_type)) { | |||
| UpdateKernelFormatInfo(kernel_node, inputs_type, &inputs_format, &outputs_format, &origin_data_format); | |||
| } | |||
| std::shared_ptr<KernelBuildInfo::KernelBuildInfoBuilder> builder = | |||
| @@ -20,11 +20,13 @@ | |||
| #include <utility> | |||
| #include <string> | |||
| #include <vector> | |||
| #include <memory> | |||
| #include <map> | |||
| #include "ir/anf.h" | |||
| #include "ir/dtype.h" | |||
| #include "utils/utils.h" | |||
| #include "frontend/operator/ops.h" | |||
| #include "backend/session/kernel_graph.h" | |||
| namespace mindspore { | |||
| namespace device { | |||
| @@ -53,7 +55,28 @@ static std::map<std::string, std::pair<std::vector<size_t>, std::vector<size_t>> | |||
| {prim::kPrimAddN->name(), {{}, {0}}}, | |||
| }; | |||
| void SetKernelInfo(const CNodePtr &kernel_node, bool graph_format_transform = false); | |||
| void SetKernelInfo(const CNodePtr &kernel_node); | |||
| class FormatTransformChecker { | |||
| public: | |||
| void CheckSupportFormatTransform(const std::shared_ptr<session::KernelGraph> &kernel_graph); | |||
| bool format_transform() const { return format_transform_; } | |||
| static FormatTransformChecker &GetInstance() { | |||
| static FormatTransformChecker instance; | |||
| return instance; | |||
| } | |||
| private: | |||
| FormatTransformChecker() = default; | |||
| ~FormatTransformChecker() = default; | |||
| FormatTransformChecker(const FormatTransformChecker &); | |||
| FormatTransformChecker &operator=(const FormatTransformChecker &); | |||
| bool format_transform_{true}; | |||
| static constexpr size_t kConv2dCount = 96; | |||
| static constexpr size_t kFusedBatchNormCount = 94; | |||
| }; | |||
| class KernelAttr { | |||
| public: | |||
| @@ -133,6 +133,10 @@ list(REMOVE_ITEM MINDSPORE_SRC_LIST "../../../mindspore/ccsrc/frontend/parallel/ | |||
| list(REMOVE_ITEM MINDSPORE_SRC_LIST "../../../mindspore/ccsrc/frontend/parallel/ps/scheduler.cc") | |||
| list(REMOVE_ITEM MINDSPORE_SRC_LIST "../../../mindspore/ccsrc/frontend/parallel/ps/optimizer_info.cc") | |||
| list(REMOVE_ITEM MINDSPORE_SRC_LIST "../../../mindspore/ccsrc/frontend/parallel/ps/optimizer_info_builder.cc") | |||
| list(REMOVE_ITEM MINDSPORE_SRC_LIST "../../../mindspore/ccsrc/backend/optimizer/gpu/batch_norm_add_relu_fusion.cc") | |||
| list(REMOVE_ITEM MINDSPORE_SRC_LIST "../../../mindspore/ccsrc/backend/optimizer/gpu/batch_norm_add_relu_grad_fusion.cc") | |||
| list(REMOVE_ITEM MINDSPORE_SRC_LIST "../../../mindspore/ccsrc/backend/optimizer/gpu/batch_norm_relu_fusion.cc") | |||
| list(REMOVE_ITEM MINDSPORE_SRC_LIST "../../../mindspore/ccsrc/backend/optimizer/gpu/batch_norm_relu_grad_fusion.cc") | |||
| add_library(_ut_mindspore_obj OBJECT ${MINDSPORE_SRC_LIST}) | |||
| add_library(_ut_ut_obj OBJECT ${UT_SRCS}) | |||