!30298 Package the cuda operators as a dynamic link library

Merge pull request !30298 from jinjiali-kali/cuda_ops
4 years ago · ac1463a192
--- a/cmake/package.cmake
+++ b/cmake/package.cmake
@@ -212,6 +212,11 @@ if(ENABLE_GPU)
        DESTINATION ${INSTALL_LIB_DIR}
        COMPONENT mindspore
    )
    install(
            TARGETS cuda_ops
            DESTINATION ${INSTALL_LIB_DIR}
            COMPONENT mindspore
    )
 endif()

 if(ENABLE_D)
--- a/mindspore/ccsrc/CMakeLists.txt
+++ b/mindspore/ccsrc/CMakeLists.txt
@@ -113,6 +113,8 @@ if(ENABLE_GPU)
            "plugin/device/gpu/kernel/*.cu"
            )

    list(REMOVE_ITEM GPU_SRC_LIST "plugin/device/gpu/kernel/cuda_impl/cuda_ops/*.cu")

    list(APPEND CUDA_NVCC_FLAGS -arch=sm_53 --expt-relaxed-constexpr)
    list(REMOVE_ITEM GPU_SRC_LIST "plugin/device/gpu/hal/device/blocking_queue.cc"
            "plugin/device/gpu/hal/device/gpu_buffer_mgr.cc")
@@ -144,6 +146,8 @@ if(ENABLE_GPU)
    cuda_add_library(gpu_cuda_lib STATIC ${GPU_SRC_LIST})
    set(CMAKE_CXX_FLAGS ${NVCC_TMP_CMAKE_CXX_FLAGS})
    add_compile_definitions(ENABLE_GPU)

    add_subdirectory(plugin/device/gpu/kernel/cuda_impl/cuda_ops)
 endif()


@@ -429,7 +433,7 @@ endif()

 if(ENABLE_GPU)
    message("add gpu lib to c_expression")
    target_link_libraries(_c_expression PRIVATE gpu_cuda_lib gpu_queue cublas
    target_link_libraries(_c_expression PRIVATE gpu_cuda_lib gpu_queue cublas cuda_ops
                          ${CUDA_PATH}/lib64/libcurand.so
                          ${CUDNN_LIBRARY_PATH}
                          ${CUDA_PATH}/lib64/libcudart.so
--- a/mindspore/ccsrc/cxx_api/CMakeLists.txt
+++ b/mindspore/ccsrc/cxx_api/CMakeLists.txt
@@ -142,7 +142,7 @@ if(ENABLE_D)
 endif()

 if(ENABLE_GPU)
    target_link_libraries(mindspore_shared_lib PRIVATE  gpu_cuda_lib gpu_queue cublas
    target_link_libraries(mindspore_shared_lib PRIVATE  gpu_cuda_lib gpu_queue cublas cuda_ops
                          ${CUDA_PATH}/lib64/libcurand.so
                          ${CUDNN_LIBRARY_PATH}
                          ${CUDA_PATH}/lib64/libcudart.so
--- a/mindspore/ccsrc/plugin/device/gpu/hal/device/kernel_info_setter.cc
+++ b/mindspore/ccsrc/plugin/device/gpu/hal/device/kernel_info_setter.cc
@@ -26,7 +26,7 @@
 #include "kernel/oplib/oplib.h"
 #include "backend/common/session/anf_runtime_algorithm.h"
 #include "plugin/device/gpu/kernel/custom/custom_aot_gpu_kernel.h"
 #include "plugin/device/gpu/hal/device/cuda_common.h"
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 #include "utils/ms_context.h"
 #include "utils/ms_utils.h"
 #include "utils/utils.h"
--- a/mindspore/ccsrc/plugin/device/gpu/hal/hardware/gpu_device_context.cc
+++ b/mindspore/ccsrc/plugin/device/gpu/hal/hardware/gpu_device_context.cc
@@ -28,7 +28,7 @@
 #include "plugin/device/gpu/hal/device/gpu_buffer_mgr.h"
 #include "kernel/common_utils.h"
 #include "plugin/device/gpu/hal/device/gpu_common.h"
 #include "plugin/device/gpu/hal/device/cuda_common.h"
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 #include "plugin/device/gpu/hal/hardware/optimizer.h"
 #include "utils/ms_device_shape_transfer.h"
 #include "utils/context/graph_kernel_flags.h"
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/argmax_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/argmax_gpu_kernel.h
@@ -21,7 +21,7 @@
 #include <string>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/cuda_impl/argmax_impl.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/argmax_impl.cuh"
 namespace mindspore {
 namespace kernel {
 template <typename T, typename S>
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/argmaxandminwithvalue_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/argmaxandminwithvalue_gpu_kernel.h
@@ -22,7 +22,7 @@
 #include <map>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/cuda_impl/general_reduction_impl.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/general_reduction_impl.cuh"
 namespace mindspore {
 namespace kernel {
 template <typename T, typename S>
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/batchtospace_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/batchtospace_gpu_kernel.h
@@ -21,7 +21,7 @@
 #include <string>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/cuda_impl/batchtospace_impl.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/batchtospace_impl.cuh"

 namespace mindspore {
 namespace kernel {
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/broadcast_to_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/broadcast_to_gpu_kernel.h
@@ -21,7 +21,7 @@
 #include <string>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/cuda_impl/broadcast_impl.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/broadcast_impl.cuh"

 namespace mindspore {
 namespace kernel {
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/cast_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/cast_gpu_kernel.h
@@ -21,7 +21,7 @@
 #include <string>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/cuda_impl/cast_impl.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cast_impl.cuh"

 namespace mindspore {
 namespace kernel {
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/concatv2_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/concatv2_gpu_kernel.h
@@ -22,7 +22,7 @@
 #include <memory>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/cuda_impl/concatv2_impl.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/concatv2_impl.cuh"

 namespace mindspore {
 namespace kernel {
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/crop_and_resize_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/crop_and_resize_gpu_kernel.h
@@ -21,7 +21,7 @@
 #include <string>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/cuda_impl/crop_and_resize_impl.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/crop_and_resize_impl.cuh"

 namespace mindspore {
 namespace kernel {
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/depthtospace_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/depthtospace_gpu_kernel.h
@@ -20,7 +20,7 @@
 #include <vector>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/cuda_impl/depthtospace_impl.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/depthtospace_impl.cuh"

 namespace mindspore {
 namespace kernel {
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/dynamic_range_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/dynamic_range_gpu_kernel.h
@@ -21,7 +21,7 @@

 #include <vector>

 #include "plugin/device/gpu/kernel/cuda_impl/dynamic_range_impl.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/dynamic_range_impl.cuh"
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"

--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/embedding_lookup_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/embedding_lookup_gpu_kernel.h
@@ -21,7 +21,7 @@
 #include <algorithm>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/cuda_impl/embedding_lookup_impl.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/embedding_lookup_impl.cuh"

 namespace mindspore {
 namespace kernel {
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/extract_image_patches_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/extract_image_patches_gpu_kernel.h
@@ -23,8 +23,8 @@
 #include <vector>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/cuda_impl/transpose_impl_opt.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/extract_image_patches_impl.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/transpose_impl_opt.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/extract_image_patches_impl.cuh"

 namespace mindspore {
 namespace kernel {
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/gather_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/gather_gpu_kernel.h
@@ -20,7 +20,7 @@
 #include <vector>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/cuda_impl/gather.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/gather.cuh"

 namespace mindspore {
 namespace kernel {
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/gather_grad_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/gather_grad_gpu_kernel.h
@@ -20,7 +20,7 @@
 #include <vector>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/cuda_impl/gather_grad.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/gather_grad.cuh"

 namespace mindspore {
 namespace kernel {
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/gathernd_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/gathernd_gpu_kernel.h
@@ -20,7 +20,7 @@
 #include <vector>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/cuda_impl/gathernd.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/gathernd.cuh"
 #include "backend/common/session/anf_runtime_algorithm.h"

 namespace mindspore {
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/gatherv2_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/gatherv2_gpu_kernel.h
@@ -21,7 +21,7 @@
 #include <algorithm>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/cuda_impl/gatherv2.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/gatherv2.cuh"
 #include "backend/common/session/anf_runtime_algorithm.h"

 namespace mindspore {
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/in_top_k_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/in_top_k_gpu_kernel.h
@@ -22,9 +22,9 @@

 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/cuda_impl/cast_impl.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/in_top_k_impl.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/topk_impl.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cast_impl.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/in_top_k_impl.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/topk_impl.cuh"

 namespace mindspore {
 namespace kernel {
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/matrix_band_part_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/matrix_band_part_gpu_kernel.h
@@ -23,9 +23,9 @@
 #include <cuda_runtime.h>
 #include <vector>
 #include <algorithm>
 #include "utils/complex.h"
 #include "plugin/device/gpu/kernel/cuda_impl/matrix_band_part_impl.cuh"
 #include "plugin/device/gpu/hal/device/cuda_common.h"
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/complex.h"
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/matrix_band_part_impl.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/kernel_constants.h"
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/matrix_diag_part_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/matrix_diag_part_gpu_kernel.h
@@ -25,9 +25,9 @@
 #include <string>
 #include <utility>
 #include <algorithm>
 #include "utils/complex.h"
 #include "plugin/device/gpu/kernel/cuda_impl/matrix_diag_part_impl.cuh"
 #include "plugin/device/gpu/hal/device/cuda_common.h"
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/complex.h"
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/matrix_diag_part_impl.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 #include "kernel/common_utils.h"
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/matrix_set_diag_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/matrix_set_diag_gpu_kernel.h
@@ -26,7 +26,7 @@
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "kernel/common_utils.h"
 #include "plugin/device/gpu/kernel/kernel_constants.h"
 #include "plugin/device/gpu/kernel/cuda_impl/matrix_set_diag_impl.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/matrix_set_diag_impl.cuh"
 namespace mindspore {
 namespace kernel {
 template <typename T>
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/meshgrid_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/meshgrid_gpu_kernel.h
@@ -22,8 +22,8 @@
 #include <utility>
 #include <vector>

 #include "plugin/device/gpu/kernel/cuda_impl/broadcast_impl.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/oneslike_impl.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/broadcast_impl.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/oneslike_impl.cuh"
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/math/broadcast_gpu_kernel.h"
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/one_hot_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/one_hot_gpu_kernel.h
@@ -20,7 +20,7 @@
 #include <vector>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/cuda_impl/one_hot_impl.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/one_hot_impl.cuh"

 namespace mindspore {
 namespace kernel {
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/oneslike_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/oneslike_gpu_kernel.h
@@ -19,7 +19,7 @@
 #include <vector>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/cuda_impl/oneslike_impl.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops//oneslike_impl.cuh"
 namespace mindspore {
 namespace kernel {
 template <typename T>
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/pack_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/pack_gpu_kernel.h
@@ -22,7 +22,7 @@
 #include <memory>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/cuda_impl/pack.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/pack.cuh"

 namespace mindspore {
 namespace kernel {
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/range_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/range_gpu_kernel.h
@@ -19,7 +19,7 @@
 #include <vector>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/cuda_impl/range_impl.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/range_impl.cuh"
 namespace mindspore {
 namespace kernel {
 constexpr float kStartDefault = 0.;
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/resize_nearest_neighbor_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/resize_nearest_neighbor_gpu_kernel.h
@@ -20,7 +20,7 @@
 #include <vector>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/cuda_impl/resize_nearest_neighbor_impl.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/resize_nearest_neighbor_impl.cuh"

 namespace mindspore {
 namespace kernel {
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/resize_nearest_neighbor_grad_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/resize_nearest_neighbor_grad_gpu_kernel.h
@@ -20,7 +20,7 @@
 #include <vector>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/cuda_impl/resize_nearest_neighbor_grad_impl.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/resize_nearest_neighbor_grad_impl.cuh"

 namespace mindspore {
 namespace kernel {
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/reverse_sequence_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/reverse_sequence_gpu_kernel.h
@@ -22,8 +22,8 @@
 #include <iostream>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/hal/device/cuda_common.h"
 #include "plugin/device/gpu/kernel/cuda_impl/reverse_sequence_impl.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/reverse_sequence_impl.cuh"
 #include "plugin/device/gpu/kernel/kernel_constants.h"

 namespace mindspore {
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/reverse_v2_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/reverse_v2_gpu_kernel.h
@@ -22,7 +22,7 @@

 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/cuda_impl/reverse_v2_impl.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/reverse_v2_impl.cuh"

 namespace mindspore {
 namespace kernel {
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/scatter_functor_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/scatter_functor_gpu_kernel.h
@@ -22,7 +22,7 @@
 #include <map>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/cuda_impl/scatter_functor_impl.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/scatter_functor_impl.cuh"

 namespace mindspore {
 namespace kernel {
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/scatter_nd_functor_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/scatter_nd_functor_gpu_kernel.h
@@ -22,7 +22,7 @@
 #include <map>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/cuda_impl/scatter_nd_functor_impl.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/scatter_nd_functor_impl.cuh"

 namespace mindspore {
 namespace kernel {
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/scatter_nd_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/scatter_nd_gpu_kernel.h
@@ -19,7 +19,7 @@

 #include <vector>
 #include <algorithm>
 #include "plugin/device/gpu/kernel/cuda_impl/scatter_nd.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/scatter_nd.cuh"
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"

--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/select_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/select_gpu_kernel.h
@@ -21,7 +21,7 @@
 #include <string>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/cuda_impl/select_impl.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/select_impl.cuh"

 namespace mindspore {
 namespace kernel {
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/slice_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/slice_gpu_kernel.h
@@ -23,7 +23,7 @@
 #include <algorithm>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/cuda_impl/slice_impl.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/slice_impl.cuh"

 namespace mindspore {
 namespace kernel {
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/slice_grad_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/slice_grad_gpu_kernel.h
@@ -23,7 +23,7 @@
 #include <utility>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/cuda_impl/slice_impl.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/slice_impl.cuh"

 namespace mindspore {
 namespace kernel {
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/sort_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/sort_gpu_kernel.h
@@ -24,9 +24,9 @@

 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/cuda_impl/topk_impl.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/transpose_impl.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/unary_op_impl.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/topk_impl.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/transpose_impl.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/unary_op_impl.cuh"

 namespace mindspore {
 namespace kernel {
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/spacetobatch_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/spacetobatch_gpu_kernel.h
@@ -21,7 +21,7 @@
 #include <string>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/cuda_impl/spacetobatch_impl.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/spacetobatch_impl.cuh"

 namespace mindspore {
 namespace kernel {
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/spacetodepth_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/spacetodepth_gpu_kernel.h
@@ -21,7 +21,7 @@
 #include <vector>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/cuda_impl/spacetodepth_impl.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/spacetodepth_impl.cuh"

 namespace mindspore {
 namespace kernel {
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/split_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/split_gpu_kernel.h
@@ -22,7 +22,7 @@
 #include <memory>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/cuda_impl/split_impl.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/split_impl.cuh"

 namespace mindspore {
 namespace kernel {
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/strided_slice_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/strided_slice_gpu_kernel.h
@@ -23,7 +23,7 @@
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/arrays/strided_slice_gpu_common.h"
 #include "plugin/device/gpu/kernel/cuda_impl/slice_impl.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/slice_impl.cuh"

 namespace mindspore {
 namespace kernel {
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/strided_slice_grad_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/strided_slice_grad_gpu_kernel.h
@@ -23,7 +23,7 @@
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/arrays/strided_slice_gpu_common.h"
 #include "plugin/device/gpu/kernel/cuda_impl/slice_impl.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/slice_impl.cuh"

 namespace mindspore {
 namespace kernel {
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/tensor_copy_slices_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/tensor_copy_slices_gpu_kernel.h
@@ -25,7 +25,7 @@
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "kernel/common_utils.h"
 #include "plugin/device/gpu/kernel/cuda_impl/slice_copy_impl.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/slice_copy_impl.cuh"

 namespace mindspore {
 namespace kernel {
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/tensor_scatter_add_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/tensor_scatter_add_gpu_kernel.h
@@ -20,7 +20,7 @@
 #include <vector>
 #include <string>
 #include <algorithm>
 #include "plugin/device/gpu/kernel/cuda_impl/tensor_scatter_add.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_add.cuh"
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"

--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/tensor_scatter_max_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/tensor_scatter_max_gpu_kernel.h
@@ -20,7 +20,7 @@
 #include <vector>
 #include <algorithm>

 #include "plugin/device/gpu/kernel/cuda_impl/tensor_scatter_max.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_max.cuh"
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"

--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/tensor_scatter_min_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/tensor_scatter_min_gpu_kernel.h
@@ -19,7 +19,7 @@

 #include <vector>
 #include <algorithm>
 #include "plugin/device/gpu/kernel/cuda_impl/tensor_scatter_min.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_min.cuh"
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"

--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/tensor_scatter_sub_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/tensor_scatter_sub_gpu_kernel.h
@@ -20,7 +20,7 @@
 #include <vector>
 #include <algorithm>

 #include "plugin/device/gpu/kernel/cuda_impl/tensor_scatter_sub.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_sub.cuh"
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"

--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/tensor_scatter_update_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/tensor_scatter_update_gpu_kernel.h
@@ -19,7 +19,7 @@

 #include <vector>
 #include <algorithm>
 #include "plugin/device/gpu/kernel/cuda_impl/tensor_scatter_update.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/tensor_scatter_update.cuh"
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"

--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/tile_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/tile_gpu_kernel.h
@@ -20,7 +20,7 @@
 #include <vector>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/cuda_impl/tile_impl.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/tile_impl.cuh"

 namespace mindspore {
 namespace kernel {
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/topk_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/topk_gpu_kernel.h
@@ -21,8 +21,8 @@
 #include <vector>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/cuda_impl/cast_impl.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/topk_impl.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cast_impl.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/topk_impl.cuh"

 namespace mindspore {
 namespace kernel {
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/transpose_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/transpose_gpu_kernel.h
@@ -21,8 +21,8 @@
 #include <algorithm>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/cuda_impl/transpose_impl.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/transpose_impl_opt.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/transpose_impl.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/transpose_impl_opt.cuh"
 namespace mindspore {
 namespace kernel {
 constexpr size_t kDimSize4 = 4;
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/unique_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/unique_gpu_kernel.h
@@ -20,7 +20,7 @@
 #include <vector>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/cuda_impl/unique_impl.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/unique_impl.cuh"
 namespace mindspore {
 namespace kernel {
 template <typename T, typename S>
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/unpack_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/unpack_gpu_kernel.h
@@ -22,7 +22,7 @@
 #include <memory>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/cuda_impl/unpack.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/unpack.cuh"

 namespace mindspore {
 namespace kernel {
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/unsorted_segment_max_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/unsorted_segment_max_gpu_kernel.h
@@ -21,7 +21,7 @@
 #include <limits>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/cuda_impl/unsorted_segment_max.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/unsorted_segment_max.cuh"

 namespace mindspore {
 namespace kernel {
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/unsorted_segment_min_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/unsorted_segment_min_gpu_kernel.h
@@ -21,7 +21,7 @@
 #include <limits>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/cuda_impl/unsorted_segment_min.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/unsorted_segment_min.cuh"

 namespace mindspore {
 namespace kernel {
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/unsorted_segment_sum_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/unsorted_segment_sum_gpu_kernel.h
@@ -20,7 +20,7 @@
 #include <vector>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/cuda_impl/unsorted_segment_sum.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/unsorted_segment_sum.cuh"

 namespace mindspore {
 namespace kernel {
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/adagrad_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/adagrad_impl.cuh
@@ -1,30 +0,0 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ADAGRAD_IMPL_H_
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ADAGRAD_IMPL_H_

 #include "plugin/device/gpu/hal/device/cuda_common.h"
 template <typename T, typename S, typename G>
 void ApplyAdagrad(const size_t size,
                  const bool update_slots,
                  const S *learning_rate,
                  const G *gradient,
                  T *variable,
                  T *accumulation,
                  cudaStream_t stream);

 #endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ADAGRAD_IMPL_H_
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/adam_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/adam_impl.cuh
@@ -1,29 +0,0 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ADAM_IMPL_H_
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ADAM_IMPL_H_

 #include "plugin/device/gpu/hal/device/cuda_common.h"
 template <typename T>
 void ApplyAdam(const size_t size, const T *gradient, const T *beta1_power, const T *beta2_power, const T *learning_rate,
               const T *beta1, const T *beta2, const T *epsilon, T *variable, T *m, T *v, cudaStream_t cuda_stream);
 template <typename T>
 void AdamWeightDecayOp(const size_t size, const T *gradient, const float *learning_rate, const float *beta1,
                       const float *beta2, const float *epsilon, const float *decay, T *variable, T *m, T *v,
                       cudaStream_t cuda_stream);

 #endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ADAM_IMPL_H_
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/adaptive_avg_pool2d_grad_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/adaptive_avg_pool2d_grad_impl.cuh
@@ -1,26 +0,0 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ADAPTIVE_AVGPOOL2D_GRAD_IMPL_H_
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ADAPTIVE_AVGPOOL2D_GRAD_IMPL_H_

 #include "plugin/device/gpu/hal/device/cuda_common.h"
 template <typename T>
 void ApplyAdaptiveAvgPool2DGrad(const uint size, const uint input_height, const uint input_width,
                                const uint output_height, const uint output_width, T *input_data,
                                T *output_data, cudaStream_t cuda_stream);

 #endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ADAPTIVE_AVGPOOL2D_GRAD_IMPL_H_
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/adaptive_avg_pool2d_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/adaptive_avg_pool2d_impl.cuh
@@ -1,25 +0,0 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ADAPTIVEAVGPOOL2D_IMPL_H_
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ADAPTIVEAVGPOOL2D_IMPL_H_

 #include "plugin/device/gpu/hal/device/cuda_common.h"
 template <typename T>
 void ApplyAdaptiveAvgPool2D(const uint size, const uint input_height, const uint input_width, const uint output_height,
                            const uint output_width, T *input_data, T *output_data, cudaStream_t cuda_stream);

 #endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ADAPTIVEAVGPOOL2D_IMPL_H_
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/add_relu_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/add_relu_impl.cuh
@@ -1,27 +0,0 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ADD_RELU_V2_IMPL_H_
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ADD_RELU_V2_IMPL_H_

 #include "plugin/device/gpu/hal/device/cuda_common.h"
 template <typename T>
 void AddReluV2(const size_t num, const T *x1, const T *x2, T *y, uint32_t *mask, cudaStream_t cuda_stream);

 template <typename T>
 void AddReluGradV2(const size_t size, const T *x1, const T *x2, const uint32_t *mask, T *dx, cudaStream_t cuda_stream);

 #endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ADD_RELU_IMPL_H_
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/add_relu_v2_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/add_relu_v2_impl.cuh
@@ -1,27 +0,0 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ADD_RELU_V2_IMPL_H_
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ADD_RELU_V2_IMPL_H_

 #include "plugin/device/gpu/hal/device/cuda_common.h"
 template <typename T>
 void AddReluV2(const size_t num, const T *x1, const T *x2, T *y, uint32_t *mask, cudaStream_t cuda_stream);

 template <typename T>
 void AddReluGradV2(const size_t size, const T *x1, const T *x2, const uint32_t *mask, T *dx, cudaStream_t cuda_stream);

 #endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ADD_RELU_IMPL_H_
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/argmax_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/argmax_impl.cuh
@@ -1,23 +0,0 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_ARGMAX_IMPL_CUH_
 #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_ARGMAX_IMPL_CUH_
 template <typename T, typename S>
 void CalArgmax(const T *input, const S bound, const size_t outer_size, const size_t inner_size, S *output,
               cudaStream_t cuda_stream);

 #endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_ARGMAX_IMPL_CUH_
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/batchnorm_fold2_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/batchnorm_fold2_impl.cuh
@@ -1,40 +0,0 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BATCHNORMFOLD2_H_
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BATCHNORMFOLD2_H_

 #include "plugin/device/gpu/hal/device/cuda_common.h"
 template <typename T>
 void BatchNormFold2Forward(const T *x, const T *beta, const T *gamma, const T *batch_std, const T *batch_mean,
                           const T *running_std, const T *running_mean, const int *global_step, T *y, int freeze_bn,
                           size_t N, size_t C, size_t H, size_t W, cudaStream_t cuda_stream);
 template <typename T>
 void CalBatchNormFold2GradNotFreeze(const T *d_beta, const T *reduce_x, const T *batch_mean, const T *batch_std,
                                    const T *running_mean, const T *running_std, const T *gamma, T *d_gamma,
                                    T *d_batch_mean, T *d_batch_std, size_t C, cudaStream_t cuda_stream);
 template <typename T>
 void CalBatchNormFold2GradFreeze(const T *d_beta, const T *reduce_x, const T *batch_mean, const T *batch_std,
                                 const T *running_mean, const T *running_std, const T *gamma, T *d_gamma,
                                 T *d_batch_mean, T *d_batch_std, size_t C, cudaStream_t cuda_stream);
 template <typename T>
 void BatchNormFold2GradReduce(const T *dout, const T *x, T *d_beta, T *tmp, T *reduce_x, T *tmp2, T *tmp_x, size_t N,
                              size_t C, size_t H, size_t W, cudaStream_t cuda_stream);

 template <typename T>
 void CalBatchNormFold2GradNotFreezeDxMul(const T *batch_std, const T *running_std, T *d_x, size_t N, size_t C, size_t H,
                                         size_t W, cudaStream_t cuda_stream);
 #endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BATCHNORMFOLD2_H_
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/batchnorm_fold_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/batchnorm_fold_impl.cuh
@@ -1,32 +0,0 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BATCHNORM_FOLD_H_
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BATCHNORM_FOLD_H_

 template <typename T>
 void CalUpdateRunningStd(int channel_size, double epsilon, T* running_std, cudaStream_t cuda_stream);

 template <typename T>
 void CalUpdateBatchStd(int channel_size, T* batch_std, cudaStream_t cuda_stream);

 template <typename T>
 void CalBatchNormFoldGrad(const T* d_batch_mean, const T* d_batch_std, const T* x, const T* batch_mean,
                          const T* batch_std, int batch_size, int channel_size, int height, int width, T* dx,
                          cudaStream_t cuda_stream);
 template <typename T>
 void ThrustFillWith(T* array, int size, T tofill, cudaStream_t cuda_stream);
 #endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_BATCHNORM_FOLD_H_
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/batchtospace_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/batchtospace_impl.cu
@@ -1,133 +0,0 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include <cuda_runtime.h>
 #include "batchtospace_impl.cuh"
 #include "plugin/device/gpu/hal/device/cuda_common.h"

 template <typename T>
 __global__ void BatchToSpace(const size_t size, const T *input, const size_t in,
                             const size_t ih, const size_t iw, const size_t ic,
                             const size_t on, const size_t oh, const size_t ow,
                             const size_t oc, const size_t crop_up, const size_t crop_dn,
                             const size_t crop_lft, const size_t crop_rht, const size_t block_num,
                             T *output) {
  size_t temp_stride = 0;
  size_t temp_pos = 0;
  size_t idx_on = 0;
  size_t idx_oc = 0;
  size_t idx_oh = 0;
  size_t idx_ow = 0;
  size_t idx_in = 0;
  size_t input_pos = 0;
  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size;
       pos += blockDim.x * gridDim.x) {
    temp_stride = oc * oh * ow;
    idx_on = pos / temp_stride;
    temp_pos = pos % temp_stride;

    temp_stride /= oc;
    idx_oc = temp_pos / temp_stride;
    temp_pos = pos % temp_stride;

    temp_stride /= oh;
    idx_oh = temp_pos / temp_stride;
    temp_pos = pos % temp_stride;

    temp_stride /= ow;
    idx_ow = temp_pos / temp_stride;

    idx_in = (((idx_oh + crop_up) % block_num) * block_num + ((idx_ow + crop_lft) % block_num)) * on + idx_on;
    input_pos = idx_in * ic;
    input_pos = (input_pos + idx_oc) * ih;
    input_pos = (input_pos + ((idx_oh + crop_up) - (idx_in / (on * block_num))) / block_num) * iw;
    input_pos = (input_pos + ((idx_ow + crop_lft) - ((idx_in / on) % block_num)) / block_num);
    output[pos] = input[input_pos];
  }
  return;
 }

 template <typename T>
 void CalBatchToSpace(const size_t size, const T *input, const size_t in,
  const size_t ih, const size_t iw, const size_t ic,
  const size_t on, const size_t oh, const size_t ow,
  const size_t oc, const size_t crop_up, const size_t crop_dn,
  const size_t crop_lft, const size_t crop_rht, const size_t block_num,
  T *output, cudaStream_t cuda_stream) {
  BatchToSpace<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(
    size, input, in, ih, iw, ic, on, oh, ow, oc, crop_up, crop_dn, crop_lft, crop_rht, block_num, output);
  return;
 }

 template void CalBatchToSpace<float>(const size_t size, const float *input, const size_t in,
                                     const size_t ih, const size_t iw, const size_t ic,
                                     const size_t on, const size_t oh, const size_t ow,
                                     const size_t oc, const size_t crop_up, const size_t crop_dn,
                                     const size_t crop_lft, const size_t crop_rht, const size_t block_num,
                                     float *output, cudaStream_t cuda_stream);
 template void CalBatchToSpace<half>(const size_t size, const half *input, const size_t in,
                                    const size_t ih, const size_t iw, const size_t ic,
                                    const size_t on, const size_t oh, const size_t ow,
                                    const size_t oc, const size_t crop_up, const size_t crop_dn,
                                    const size_t crop_lft, const size_t crop_rht, const size_t block_num,
                                    half *output, cudaStream_t cuda_stream);
 template void CalBatchToSpace<int>(const size_t size, const int *input, const size_t in,
                                   const size_t ih, const size_t iw, const size_t ic,
                                   const size_t on, const size_t oh, const size_t ow,
                                   const size_t oc, const size_t crop_up, const size_t crop_dn,
                                   const size_t crop_lft, const size_t crop_rht, const size_t block_num,
                                   int *output, cudaStream_t cuda_stream);
 template void CalBatchToSpace<int64_t>(const size_t size, const int64_t *input, const size_t in,
                                       const size_t ih, const size_t iw, const size_t ic,
                                       const size_t on, const size_t oh, const size_t ow,
                                       const size_t oc, const size_t crop_up, const size_t crop_dn,
                                       const size_t crop_lft, const size_t crop_rht, const size_t block_num,
                                       int64_t *output, cudaStream_t cuda_stream);
 template void CalBatchToSpace<int16_t>(const size_t size, const int16_t *input, const size_t in,
                                       const size_t ih, const size_t iw, const size_t ic,
                                       const size_t on, const size_t oh, const size_t ow,
                                       const size_t oc, const size_t crop_up, const size_t crop_dn,
                                       const size_t crop_lft, const size_t crop_rht, const size_t block_num,
                                       int16_t *output, cudaStream_t cuda_stream);
 template void CalBatchToSpace<int8_t>(const size_t size, const int8_t *input, const size_t in,
                                      const size_t ih, const size_t iw, const size_t ic,
                                      const size_t on, const size_t oh, const size_t ow,
                                      const size_t oc, const size_t crop_up, const size_t crop_dn,
                                      const size_t crop_lft, const size_t crop_rht, const size_t block_num,
                                      int8_t *output, cudaStream_t cuda_stream);
 template void CalBatchToSpace<uint8_t>(const size_t size, const uint8_t *input, const size_t in,
                                       const size_t ih, const size_t iw, const size_t ic,
                                       const size_t on, const size_t oh, const size_t ow,
                                       const size_t oc, const size_t crop_up, const size_t crop_dn,
                                       const size_t crop_lft, const size_t crop_rht, const size_t block_num,
                                       uint8_t *output, cudaStream_t cuda_stream);
 template void CalBatchToSpace<uint16_t>(const size_t size, const uint16_t *input, const size_t in,
                                        const size_t ih, const size_t iw, const size_t ic,
                                        const size_t on, const size_t oh, const size_t ow,
                                        const size_t oc, const size_t crop_up, const size_t crop_dn,
                                        const size_t crop_lft, const size_t crop_rht, const size_t block_num,
                                        uint16_t *output, cudaStream_t cuda_stream);
 template void CalBatchToSpace<uint32_t>(const size_t size, const uint32_t *input, const size_t in,
                                        const size_t ih, const size_t iw, const size_t ic,
                                        const size_t on, const size_t oh, const size_t ow,
                                        const size_t oc, const size_t crop_up, const size_t crop_dn,
                                        const size_t crop_lft, const size_t crop_rht, const size_t block_num,
                                        uint32_t *output, cudaStream_t cuda_stream);
 template void CalBatchToSpace<uint64_t>(const size_t size, const uint64_t *input, const size_t in,
                                        const size_t ih, const size_t iw, const size_t ic,
                                        const size_t on, const size_t oh, const size_t ow,
                                        const size_t oc, const size_t crop_up, const size_t crop_dn,
                                        const size_t crop_lft, const size_t crop_rht, const size_t block_num,
                                        uint64_t *output, cudaStream_t cuda_stream);
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/batchtospace_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/batchtospace_impl.cuh
@@ -1,27 +0,0 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BATCHTOSPACE_H_
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BATCHTOSPACE_H_
 template <typename T>
 void CalBatchToSpace(const size_t size, const T *input, const size_t in,
                     const size_t ih, const size_t iw, const size_t ic,
                     const size_t on, const size_t oh, const size_t ow,
                     const size_t oc, const size_t crop_up, const size_t crop_dn,
                     const size_t crop_lft, const size_t crop_rht, const size_t block_num,
                     T *output, cudaStream_t cuda_stream);

 #endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BATCHTOSPACE_H_
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/bce_with_logits_loss_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/bce_with_logits_loss_impl.cuh
@@ -1,30 +0,0 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_BCE_WITH_LOGITS_LOSS_IMPL_CUH_
 #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_BCE_WITH_LOGITS_LOSS_IMPL_CUH_

 #define MAX_LOGITS_DIMENSION 8
 #include "plugin/device/gpu/hal/device/cuda_common.h"

 template <typename T>
 void CalBCEWithLogitsLoss(const size_t input_size, const T *predict, const T *target, const size_t *input_shape,
                          const size_t shape_size, const T *weight, const size_t *weight_shape,
                          const bool weight_need_broadcast, const T *pos_weight, const size_t *pos_weight_shape,
                          const bool pos_weight_need_broadcast, T *shape_broadcasted, T *output,
                          cudaStream_t cuda_stream);

 #endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_BCE_WITH_LOGITS_LOSS_IMPL_CUH_
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/bias_add_grad_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/bias_add_grad_impl.cuh
@@ -1,27 +0,0 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BIASADDGRAD_H_
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BIASADDGRAD_H_

 #include "plugin/device/gpu/hal/device/cuda_common.h"
 template <typename T>
 void CalBiasAddGradNHWC(const size_t size, const size_t bias_size,
                        const T*  dy, T*  db,  cudaStream_t cuda_stream);
 template <typename T>
 void CalBiasAddGradNCHW(const size_t size, const size_t bias_size, const int height, const int width,
                        const T*  dy, T*  db,  cudaStream_t cuda_stream);
 #endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BIASADDGRAD_H_
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/boundingbox_decode_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/boundingbox_decode_impl.cuh
@@ -1,27 +0,0 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_BOUNDINGBOX_DECODE_IMPL_H_
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_BOUNDINGBOX_DECODE_IMPL_H_

 #include "plugin/device/gpu/hal/device/cuda_common.h"
 template <typename T>
 void BoundingBoxDecode(const size_t size, const T *rois, const T *deltas, T *bboxes, const float &m1, const float &m2,
                       const float &m3, const float &m4, const float &s1, const float &s2, const float &s3,
                       const float &s4, const int &max_height, const int &max_width, const float &ratio_clip,
                       cudaStream_t cuda_stream);

 #endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_BOUNDINGBOX_DECODE_IMPL_H_
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/boundingbox_encode_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/boundingbox_encode_impl.cuh
@@ -1,26 +0,0 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_BOUNDINGBOX_ENCODE_IMPL_H_
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_BOUNDINGBOX_ENCODE_IMPL_H_

 #include "plugin/device/gpu/hal/device/cuda_common.h"
 template <typename T>
 void BoundingBoxEncode(const size_t size, const T *anchor_box, const T *groundtruth_box, T *deltas, const float &m1,
                       const float &m2, const float &m3, const float &m4, const float &s1, const float &s2,
                       const float &s3, const float &s4, cudaStream_t cuda_stream);

 #endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_BOUNDINGBOX_ENCODE_IMPL_H_
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/broadcast_grad_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/broadcast_grad_impl.cuh
@@ -1,38 +0,0 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BROADCAST_GRAD_H_
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BROADCAST_GRAD_H_

 #include "plugin/device/gpu/hal/device/cuda_common.h"

 enum BroadcastGradOpType {
  BROADCAST_GRAD_TYPE_MAXIMUM = 0,
  BROADCAST_GRAD_TYPE_MINIMUM = 1,
  BROADCAST_GRAD_TYPE_INVALID = 0xffffffff,
 };

 template <typename T>
 void BroadcastGrad(const int &l0, const int &l1, const int &l2, const int &l3, const int &r0, const int &r1,
                   const int &r2, const int &r3, const int &d0, const int &d1, const int &d2, const int &d3,
                   const bool &grad_x1, const bool &grad_x2, enum BroadcastGradOpType op, const T *x1, const T *x2,
                   const T *dy, T *dx1, T *dx2, cudaStream_t stream);

 template <typename T>
 void NoBroadcastGrad(const int &nums, const bool &grad_x1, const bool &grad_x2, enum BroadcastGradOpType op,
                     const T *x1, const T *x2, const T *dy, T *dx1, T *dx2, cudaStream_t stream);

 #endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BROADCAST_GRAD_H_
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/broadcast_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/broadcast_impl.cuh
@@ -1,89 +0,0 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BROADCAST_H_
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BROADCAST_H_

 #include <vector>
 #include "plugin/device/gpu/hal/device/cuda_common.h"
 #include "utils/complex.h"

 const float kFloatEplison = 1e-37;

 enum BroadcastOpType {
  BROADCAST_TYPE_GREATER = 0,
  BROADCAST_TYPE_LESS = 1,
  BROADCAST_TYPE_MAXIMUM = 2,
  BROADCAST_TYPE_MINIMUM = 3,
  BROADCAST_TYPE_POWER = 4,
  BROADCAST_TYPE_REALDIV = 5,
  BROADCAST_TYPE_MUL = 6,
  BROADCAST_TYPE_SUB = 7,
  BROADCAST_TYPE_ADD = 8,
  BROADCAST_TYPE_FLOORDIV = 9,
  BROADCAST_TYPE_ABSGRAD = 10,
  BROADCAST_TYPE_DIV = 11,
  BROADCAST_TYPE_DIVNONAN = 12,
  BROADCAST_TYPE_EQUAL = 13,
  BROADCAST_TYPE_SQUARED_DIFFERENCE = 14,
  BROADCAST_TYPE_MOD = 15,
  BROADCAST_TYPE_FLOORMOD = 16,
  BROADCAST_TYPE_ATAN2 = 17,
  BROADCAST_TYPE_GREATER_EQUAL = 18,
  BROADCAST_TYPE_LESS_EQUAL = 19,
  BROADCAST_TYPE_NOT_EQUAL = 20,
  BROADCAST_TYPE_LOGICAL_AND = 21,
  BROADCAST_TYPE_LOGICAL_OR = 22,
  BROADCAST_TYPE_TRUNCATEDIV = 23,
  BROADCAST_TYPE_TRUNCATEMOD = 24,
  BROADCAST_TYPE_COMPLEX = 25,
  BROADCAST_TYPE_INVALID = 0xffffffff,
 };

 template <typename T>
 void ElewiseCmp(const int &nums, enum BroadcastOpType op, const T *x0, const T *x1, bool *y, cudaStream_t stream);

 template <typename T>
 void ElewiseArith(const int &nums, enum BroadcastOpType op, const T *x0, const T *x1, T *y, cudaStream_t stream);

 template <typename T1, typename T2, typename T3>
 void ElewiseComplexArith(const int &nums, enum BroadcastOpType op, const T1 *x0, const T2 *x1,
                         Complex<T3> *y, cudaStream_t stream);

 template <typename T>
 void BroadcastCmp(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
                  const std::vector<size_t> &y_dims, enum BroadcastOpType op, const T *x0, const T *x1, bool *y,
                  cudaStream_t stream);

 template <typename T>
 void BroadcastArith(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
                    const std::vector<size_t> &y_dims, enum BroadcastOpType op, const T *x0, const T *x1, T *y,
                    cudaStream_t stream);

 template <typename T1, typename T2, typename T3>
 void BroadcastComplexArith(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
                           const std::vector<size_t> &y_dims, enum BroadcastOpType op, const T1 *x0, const T2 *x1,
                           Complex<T3> *y, cudaStream_t stream);
 template <typename T>
 void BroadcastComplexArith(const std::vector<size_t> &x0_dims, const std::vector<size_t> &x1_dims,
                           const std::vector<size_t> &y_dims, enum BroadcastOpType op, const T *x0, const T *x1,
                           Complex<T> *y, cudaStream_t stream);

 template <typename T>
 void BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, const size_t &i3, const size_t &o0,
                 const size_t &o1, const size_t &o2, const size_t &o3, const T *input_addr, T *output_addr,
                 cudaStream_t stream);
 #endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BROADCAST_H_
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cast_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cast_impl.cu
@@ -1,318 +0,0 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #include <vector>
 #include <iostream>

 #include "plugin/device/gpu/kernel/cuda_impl/cast_impl.cuh"
 #include "plugin/device/gpu/hal/device/cuda_common.h"

 // Generic cast
 template <typename S, typename T>
 __device__ __forceinline__ void CastBase(const S *input_addr, T *output_addr) {
  *output_addr = static_cast<T>((*input_addr));
 }

 // half --> integer
 __device__ __forceinline__ void CastBase(const half *input_addr, uint64_t *output_addr) {
  *output_addr = __half2ull_rz((*input_addr));
 }

 __device__ __forceinline__ void CastBase(const half *input_addr, int64_t *output_addr) {
  *output_addr = __half2ll_rz((*input_addr));
 }

 __device__ __forceinline__ void CastBase(const half *input_addr, uint32_t *output_addr) {
  *output_addr = __half2uint_rz((*input_addr));
 }

 __device__ __forceinline__ void CastBase(const half *input_addr, int32_t *output_addr) {
  *output_addr = __half2int_rz((*input_addr));
 }

 __device__ __forceinline__ void CastBase(const half *input_addr, uint16_t *output_addr) {
  *output_addr = __half2ushort_rz((*input_addr));
 }

 __device__ __forceinline__ void CastBase(const half *input_addr, int16_t *output_addr) {
  *output_addr = __half2short_rz((*input_addr));
 }

 __device__ __forceinline__ void CastBase(const half *input_addr, uint8_t *output_addr) {
  *output_addr = static_cast<uint8_t>(__half2ushort_rz((*input_addr)));
 }

 __device__ __forceinline__ void CastBase(const half *input_addr, int8_t *output_addr) {
  *output_addr = static_cast<int8_t>(__half2short_rz((*input_addr)));
 }

 // integer --> half
 __device__ __forceinline__ void CastBase(const uint64_t *input_addr, half *output_addr) {
  *output_addr = __ull2half_rn((*input_addr));
 }

 __device__ __forceinline__ void CastBase(const int64_t *input_addr, half *output_addr) {
  *output_addr = __ll2half_rn((*input_addr));
 }

 __device__ __forceinline__ void CastBase(const uint32_t *input_addr, half *output_addr) {
  *output_addr = __uint2half_rn((*input_addr));
 }

 __device__ __forceinline__ void CastBase(const int32_t *input_addr, half *output_addr) {
  *output_addr = __int2half_rn((*input_addr));
 }

 __device__ __forceinline__ void CastBase(const uint16_t *input_addr, half *output_addr) {
  *output_addr = __ushort2half_rn((*input_addr));
 }

 __device__ __forceinline__ void CastBase(const int16_t *input_addr, half *output_addr) {
  *output_addr = __short2half_rn((*input_addr));
 }

 __device__ __forceinline__ void CastBase(const uint8_t *input_addr, half *output_addr) {
  *output_addr = __ushort2half_rn(static_cast<uint16_t>(*input_addr));
 }

 __device__ __forceinline__ void CastBase(const int8_t *input_addr, half *output_addr) {
  *output_addr = __short2half_rn(static_cast<int16_t>(*input_addr));
 }

 // Cast
 template <typename S, typename T>
 __global__ void CastKernel(const int input_size, const S *input_addr, T *output_addr) {
  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < input_size; pos += blockDim.x * gridDim.x) {
    CastBase(input_addr + pos, output_addr + pos);
  }
 }

 template <typename S, typename T>
 void Cast(const int input_size, const S *input_addr, T *output_addr, cudaStream_t stream) {
  CastKernel<<<GET_BLOCKS(input_size), GET_THREADS, 0, stream>>>(input_size, input_addr, output_addr);
 }

 template void Cast(const int input_size, const int8_t *input_addr, int8_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const int8_t *input_addr, int16_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const int8_t *input_addr, int32_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const int8_t *input_addr, int64_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const int8_t *input_addr, uint8_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const int8_t *input_addr, uint16_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const int8_t *input_addr, uint32_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const int8_t *input_addr, uint64_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const int8_t *input_addr, float *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const int8_t *input_addr, double *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const int8_t *input_addr, half *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const int8_t *input_addr, bool *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const int8_t *input_addr, Complex<float> *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const int8_t *input_addr, Complex<double> *output_addr, cudaStream_t stream);

 template void Cast(const int input_size, const int16_t *input_addr, int8_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const int16_t *input_addr, int16_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const int16_t *input_addr, int32_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const int16_t *input_addr, int64_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const int16_t *input_addr, uint8_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const int16_t *input_addr, uint16_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const int16_t *input_addr, uint32_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const int16_t *input_addr, uint64_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const int16_t *input_addr, float *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const int16_t *input_addr, double *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const int16_t *input_addr, half *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const int16_t *input_addr, bool *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const int16_t *input_addr, Complex<float> *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const int16_t *input_addr, Complex<double> *output_addr, cudaStream_t stream);


 template void Cast(const int input_size, const int32_t *input_addr, int8_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const int32_t *input_addr, int16_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const int32_t *input_addr, int32_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const int32_t *input_addr, int64_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const int32_t *input_addr, uint8_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const int32_t *input_addr, uint16_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const int32_t *input_addr, uint32_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const int32_t *input_addr, uint64_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const int32_t *input_addr, float *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const int32_t *input_addr, double *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const int32_t *input_addr, half *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const int32_t *input_addr, bool *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const int32_t *input_addr, Complex<float> *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const int32_t *input_addr, Complex<double> *output_addr, cudaStream_t stream);


 template void Cast(const int input_size, const int64_t *input_addr, int8_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const int64_t *input_addr, int16_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const int64_t *input_addr, int32_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const int64_t *input_addr, int64_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const int64_t *input_addr, uint8_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const int64_t *input_addr, uint16_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const int64_t *input_addr, uint32_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const int64_t *input_addr, uint64_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const int64_t *input_addr, float *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const int64_t *input_addr, double *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const int64_t *input_addr, half *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const int64_t *input_addr, bool *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const int64_t *input_addr, Complex<float> *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const int64_t *input_addr, Complex<double> *output_addr, cudaStream_t stream);

 template void Cast(const int input_size, const uint8_t *input_addr, int8_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const uint8_t *input_addr, int16_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const uint8_t *input_addr, int32_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const uint8_t *input_addr, int64_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const uint8_t *input_addr, uint8_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const uint8_t *input_addr, uint16_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const uint8_t *input_addr, uint32_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const uint8_t *input_addr, uint64_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const uint8_t *input_addr, float *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const uint8_t *input_addr, double *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const uint8_t *input_addr, half *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const uint8_t *input_addr, bool *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const uint8_t *input_addr, Complex<float> *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const uint8_t *input_addr, Complex<double> *output_addr, cudaStream_t stream);

 template void Cast(const int input_size, const uint16_t *input_addr, int8_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const uint16_t *input_addr, int16_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const uint16_t *input_addr, int32_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const uint16_t *input_addr, int64_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const uint16_t *input_addr, uint8_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const uint16_t *input_addr, uint16_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const uint16_t *input_addr, uint32_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const uint16_t *input_addr, uint64_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const uint16_t *input_addr, float *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const uint16_t *input_addr, double *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const uint16_t *input_addr, half *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const uint16_t *input_addr, bool *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const uint16_t *input_addr, Complex<float> *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const uint16_t *input_addr, Complex<double> *output_addr, cudaStream_t stream);

 template void Cast(const int input_size, const uint32_t *input_addr, int8_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const uint32_t *input_addr, int16_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const uint32_t *input_addr, int32_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const uint32_t *input_addr, int64_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const uint32_t *input_addr, uint8_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const uint32_t *input_addr, uint16_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const uint32_t *input_addr, uint32_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const uint32_t *input_addr, uint64_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const uint32_t *input_addr, float *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const uint32_t *input_addr, double *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const uint32_t *input_addr, half *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const uint32_t *input_addr, bool *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const uint32_t *input_addr, Complex<float> *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const uint32_t *input_addr, Complex<double> *output_addr, cudaStream_t stream);

 template void Cast(const int input_size, const uint64_t *input_addr, int8_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const uint64_t *input_addr, int16_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const uint64_t *input_addr, int32_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const uint64_t *input_addr, int64_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const uint64_t *input_addr, uint8_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const uint64_t *input_addr, uint16_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const uint64_t *input_addr, uint32_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const uint64_t *input_addr, uint64_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const uint64_t *input_addr, float *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const uint64_t *input_addr, double *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const uint64_t *input_addr, half *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const uint64_t *input_addr, bool *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const uint64_t *input_addr, Complex<float> *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const uint64_t *input_addr, Complex<double> *output_addr, cudaStream_t stream);

 template void Cast(const int input_size, const half *input_addr, int8_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const half *input_addr, int16_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const half *input_addr, int32_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const half *input_addr, int64_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const half *input_addr, uint8_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const half *input_addr, uint16_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const half *input_addr, uint32_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const half *input_addr, uint64_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const half *input_addr, float *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const half *input_addr, double *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const half *input_addr, half *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const half *input_addr, bool *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const half *input_addr, Complex<float> *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const half *input_addr, Complex<double> *output_addr, cudaStream_t stream);

 template void Cast(const int input_size, const float *input_addr, int8_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const float *input_addr, int16_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const float *input_addr, int32_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const float *input_addr, int64_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const float *input_addr, uint8_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const float *input_addr, uint16_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const float *input_addr, uint32_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const float *input_addr, uint64_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const float *input_addr, float *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const float *input_addr, double *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const float *input_addr, half *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const float *input_addr, bool *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const float *input_addr, Complex<float> *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const float *input_addr, Complex<double> *output_addr, cudaStream_t stream);

 template void Cast(const int input_size, const double *input_addr, int8_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const double *input_addr, int16_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const double *input_addr, int32_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const double *input_addr, int64_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const double *input_addr, uint8_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const double *input_addr, uint16_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const double *input_addr, uint32_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const double *input_addr, uint64_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const double *input_addr, float *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const double *input_addr, double *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const double *input_addr, half *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const double *input_addr, bool *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const double *input_addr, Complex<float> *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const double *input_addr, Complex<double> *output_addr, cudaStream_t stream);

 template void Cast(const int input_size, const bool *input_addr, int8_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const bool *input_addr, int16_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const bool *input_addr, int32_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const bool *input_addr, int64_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const bool *input_addr, uint8_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const bool *input_addr, uint16_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const bool *input_addr, uint32_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const bool *input_addr, uint64_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const bool *input_addr, float *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const bool *input_addr, double *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const bool *input_addr, half *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const bool *input_addr, bool *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const bool *input_addr, Complex<float> *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const bool *input_addr, Complex<double> *output_addr, cudaStream_t stream);

 template void Cast(const int input_size, const Complex<float> *input_addr, int8_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const Complex<float> *input_addr, int16_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const Complex<float> *input_addr, int32_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const Complex<float> *input_addr, int64_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const Complex<float> *input_addr, uint8_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const Complex<float> *input_addr, uint16_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const Complex<float> *input_addr, uint32_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const Complex<float> *input_addr, uint64_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const Complex<float> *input_addr, float *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const Complex<float> *input_addr, double *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const Complex<float> *input_addr, half *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const Complex<float> *input_addr, bool *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const Complex<float> *input_addr, Complex<double> *output_addr,
                   cudaStream_t stream);

 template void Cast(const int input_size, const Complex<double> *input_addr, int8_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const Complex<double> *input_addr, int16_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const Complex<double> *input_addr, int32_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const Complex<double> *input_addr, int64_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const Complex<double> *input_addr, uint8_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const Complex<double> *input_addr, uint16_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const Complex<double> *input_addr, uint32_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const Complex<double> *input_addr, uint64_t *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const Complex<double> *input_addr, float *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const Complex<double> *input_addr, double *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const Complex<double> *input_addr, half *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const Complex<double> *input_addr, bool *output_addr, cudaStream_t stream);
 template void Cast(const int input_size, const Complex<double> *input_addr, Complex<float> *output_addr,
                   cudaStream_t stream);
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/clip_grad_norm_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/clip_grad_norm_impl.cuh
@@ -1,29 +0,0 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_CLIP_GRAD_NORM_IMPL_H_
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_CLIP_GRAD_NORM_IMPL_H_

 #include "plugin/device/gpu/hal/device/cuda_common.h"
 template <typename T>
 void ScalingGradOp(const size_t size, const T *x, const float *scaling_factor, float *scaling_out_addr,
                   cudaStream_t cuda_stream);

 template <typename T>
 void ClipGradNormOp(const size_t size, const float *x, const T *clip_norm, const float *reduce_sum_value,
                    float *output_addr, cudaStream_t cuda_stream);

 #endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_CLIP_GRAD_NORM_IMPL_H_
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/concatv2_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/concatv2_impl.cu
@@ -1,92 +0,0 @@
 /**
 * Copyright 2019-2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #include <stdio.h>
 #include <stdint.h>
 #include <cuda_runtime.h>
 #include "plugin/device/gpu/kernel/cuda_impl/concatv2_impl.cuh"
 template <typename T>
 __global__ void Concat(const size_t size, const int input_num, const int all_size_before_axis, const int all_size_axis,
                       int *len_axis, T **inputs, T *output) {
  for (int pos = blockIdx.x * blockDim.x + threadIdx.x; pos < (size); pos += blockDim.x * gridDim.x) {
    int num = pos % all_size_before_axis / all_size_axis;
    int block = -1;
    int axis_inc = 0;
    int block_len = 0;
    for (int i = 0; i < input_num; i++) {
      if (axis_inc <= num) {
        block++;
        axis_inc += len_axis[i];
      } else {
        break;
      }
    }
    block_len = len_axis[block];
    axis_inc -= len_axis[block];
    int block_pos =
      pos / all_size_before_axis * block_len * all_size_axis + (num - axis_inc) * all_size_axis + pos % all_size_axis;
    output[pos] = inputs[block][block_pos];
  }
  return;
 }

 template <typename T>
 void ConcatKernel(const size_t size, const int input_num, const int all_size_before_axis, const int all_size_axis,
                  int *len_axis, T **inputs, T *output, cudaStream_t cuda_stream) {
  Concat<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, input_num, all_size_before_axis, all_size_axis,
                                                            len_axis, inputs, output);
  return;
 }

 template void ConcatKernel(const size_t size, const int input_num, const int all_size_before_axis,
                           const int all_size_axis, int *len_axis, double **inputs, double *output,
                           cudaStream_t cuda_stream);
 template void ConcatKernel(const size_t size, const int input_num, const int all_size_before_axis,
                           const int all_size_axis, int *len_axis, float **inputs, float *output,
                           cudaStream_t cuda_stream);
 template void ConcatKernel(const size_t size, const int input_num, const int all_size_before_axis,
                           const int all_size_axis, int *len_axis, half **inputs, half *output,
                           cudaStream_t cuda_stream);

 template void ConcatKernel(const size_t size, const int input_num, const int all_size_before_axis,
                           const int all_size_axis, int *len_axis, int64_t **inputs, int64_t *output,
                           cudaStream_t cuda_stream);
 template void ConcatKernel(const size_t size, const int input_num, const int all_size_before_axis,
                           const int all_size_axis, int *len_axis, int **inputs, int *output,
                           cudaStream_t cuda_stream);
 template void ConcatKernel(const size_t size, const int input_num, const int all_size_before_axis,
                           const int all_size_axis, int *len_axis, short **inputs, short *output,  // NOLINT
                           cudaStream_t cuda_stream);
 template void ConcatKernel(const size_t size, const int input_num, const int all_size_before_axis,
                           const int all_size_axis, int *len_axis, char **inputs, char *output,
                           cudaStream_t cuda_stream);

 template void ConcatKernel(const size_t size, const int input_num, const int all_size_before_axis,
                           const int all_size_axis, int *len_axis, uint64_t **inputs, uint64_t *output,
                           cudaStream_t cuda_stream);
 template void ConcatKernel(const size_t size, const int input_num, const int all_size_before_axis,
                           const int all_size_axis, int *len_axis, uint32_t **inputs, uint32_t *output,
                           cudaStream_t cuda_stream);
 template void ConcatKernel(const size_t size, const int input_num, const int all_size_before_axis,
                           const int all_size_axis, int *len_axis, uint16_t **inputs, uint16_t *output,
                           cudaStream_t cuda_stream);
 template void ConcatKernel(const size_t size, const int input_num, const int all_size_before_axis,
                           const int all_size_axis, int *len_axis, unsigned char **inputs, unsigned char *output,
                           cudaStream_t cuda_stream);

 template void ConcatKernel(const size_t size, const int input_num, const int all_size_before_axis,
                           const int all_size_axis, int *len_axis, bool **inputs, bool *output,
                           cudaStream_t cuda_stream);
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/convert_gradient_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/convert_gradient_impl.cuh
@@ -1,34 +0,0 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_CONVERTGRADIENT_H_
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_CONVERTGRADIENT_H_

 #include "plugin/device/gpu/hal/device/cuda_common.h"
 template <typename T>
 void ConvertGradient(const size_t size, const size_t height_h, const size_t height_w, const size_t batchwidth,
                     const size_t width, T *input_addr, T *outt_addr, cudaStream_t cuda_stream);

 template <typename T>
 void ConvertGradientBack(const size_t size, const size_t height_h, const size_t height_w, const size_t batchwidth,
                         const size_t width, T *input_addr, T *output_addr, cudaStream_t cuda_stream);

 template <typename T>
 void ConvertGradientBack(const size_t size, const size_t height_h, const size_t height_w, const size_t ori_h,
                         const size_t ori_w, const size_t batchwidth, const size_t width, T *input_addr, T *output_addr,
                         cudaStream_t cuda_stream);

 #endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_CONVERTGRADIENT_H_
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/correction_mul_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/correction_mul_impl.cuh
@@ -1,27 +0,0 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_CORRECTIONMUL_H_
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_CORRECTIONMUL_H_

 template <typename T>
 void CalCorrectionMul(const T* weight, const T* gamma, const T* running_std, int batch_size, int channel_size,
                      int height, int width, T* output, cudaStream_t cuda_stream);

 template <typename T>
 void CalCorrectionMulGrad(const T* d_out, const T* weight, const T* running_std, int batch_size, int channel_size,
                          int height, int width, T* d_gamma, T* tmp, cudaStream_t cuda_stream);
 #endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_CORRECTIONMUL_H_
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/crop_and_resize_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/crop_and_resize_impl.cuh
@@ -1,25 +0,0 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_CROP_AND_RESIZE_IMPL_H_
 #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_CROP_AND_RESIZE_IMPL_H_
 #include <cuda_runtime.h>
 #include "plugin/device/gpu/hal/device/cuda_common.h"
 template <typename T>
 void CalCropAndResize(const size_t size, const T *input_image, float *input_boxes, int *input_box_index, int batch,
                      int input_height, int input_width, int final_height, int final_width, int channel,
                      int method, float extrapol_val, float *output, cudaStream_t cuda_stream);
 #endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_CROP_AND_RESIZE_IMPL_H_
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cross_entropy_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cross_entropy_impl.cuh
@@ -1,36 +0,0 @@
 /**
 * Copyright 2019 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_CROSSENTROPY_H_
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_CROSSENTROPY_H_

 #include "plugin/device/gpu/hal/device/cuda_common.h"

 // The batch size limit to judge whether to use multiple threads.
 constexpr int kLargeBatchLowLimit = 32768;

 template <typename T, typename S>
 void CrossEntropyWithSparse(const T *logits, const S *labels, const size_t batch_size, const size_t class_num, T *loss,
                            cudaStream_t cuda_stream);

 template <typename T, typename S>
 void CrossEntropyGradWithSparse(const T *logits, const S *labels, const size_t batch_size, const size_t class_num,
                                T *grad, cudaStream_t cuda_stream);

 template <typename T, typename S>
 void CrossEntropy(const T *logits, const S *labels, const size_t batch_size, const size_t class_num, T *losses,
                  T *dlogits, cudaStream_t cuda_stream);
 #endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_CROSSENTROPY_H_
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/ctcloss_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/ctcloss_impl.cuh
@@ -1,51 +0,0 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_IMPL_CUH
 #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_IMPL_CUH

 template <typename T>
 void CalculateFwdVar(T *log_alpha_b, int *label_value_with_blank, T *softmax_probs, const int *sequence_length,
                     bool ctc_merge_repeated, int batch, int SOffSet, int maxtime, int blank, int *label_squence_length,
                     int *cum_labels_length, bool ignore_longer_outputs_than_inputs, cudaStream_t stream);

 template <typename T>
 void CalculateBwdVar(T *log_beta_b, int *label_value_with_blank, T *softmax_probs, const int *sequence_length,
                     bool ctc_merge_repeated, int batch, int SOffSet, int maxtime, int blank, int *label_squence_length,
                     int *cum_labels_length, bool ignore_longer_outputs_than_inputs, cudaStream_t stream);

 template <typename T>
 void InnerSoftMax(const T *probs, T *softmax_cost, const int *sequence_length, int max_time, int batch, int numclass,
                  cudaStream_t stream);

 void GenLabelValuePCR(int *label_value_sp, int *label_value_pcr, int *label_squence_length, int *cum_labels_length,
                      int *max_labels_length, int batch, cudaStream_t stream);

 void GenLabelWithBlank(int *label_value, int *label_value_with_blank, int *label_squence_length,
                       int *precum_labels_length, int *cum_labels_length, int batch, int blank, cudaStream_t stream);

 void GenLabelValue(int *label_value_sp, const int64_t *label_indices, const int *label_values,
                   int *label_squence_length, int *cum_labels_length, int *max_labels_length, int size, int blank,
                   int batch, cudaStream_t stream);

 void CalculatePreLength(int *label_squence_length, int *precum_labels_length, int *cum_labels_length,
                        int *max_labels_length, const int64_t *label_indices, int batch, int size, cudaStream_t stream);
 void CalculateMaxSequence(const int *sequence_length, int *max_labels_length, int batch, cudaStream_t stream);
 template <typename T>
 void CTCLoss(T *log_alpha_b, T *log_beta_b, T *softmax_probs, int *label_value_with_blank, int batch, int SOffSet,
             int maxtime, int numclass, const int *sequence_length, int *label_squence_length, int *cum_labels_length,
             T *cost, T *grads, T *prob_num, bool ignore_longer_outputs_than_inputs, cudaStream_t stream);
 #endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_IMPL_CUH
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/CMakeLists.txt
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/CMakeLists.txt
@@ -0,0 +1,27 @@
 file(GLOB_RECURSE CUDA_OPS_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cu")

 if(CMAKE_SYSTEM_NAME MATCHES "Darwin")
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-delete-non-abstract-non-virtual-dtor -Wno-overloaded-virtual")
 endif()

 if(${CUDA_VERSION} VERSION_LESS 11.0)
    string(REPLACE "-std=c++17" "-std=c++11" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
 else()
    string(REPLACE "-std=c++17" "-std=c++14" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
 endif()

 set_property(SOURCE ${CUDA_OPS_SRC_LIST} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_KERNEL)
 if(ENABLE_GPU)
    add_library(cuda_common_obj OBJECT cuda_common.cc)
    target_compile_options(cuda_common_obj PRIVATE "-std=c++17")
    cuda_add_library(cuda_ops SHARED ${CUDA_OPS_SRC_LIST} $<TARGET_OBJECTS:cuda_common_obj>)
    message("add gpu lib to cuda_ops")
    target_link_libraries(cuda_ops mindspore_core
            ${CUDA_PATH}/lib64/libcurand.so
            ${CUDNN_LIBRARY_PATH}
            ${CUDA_PATH}/lib64/libcudart.so
            ${CUDA_PATH}/lib64/stubs/libcuda.so
            ${CUDA_PATH}/lib64/libcusolver.so
            ${CUDA_PATH}/lib64/libcufft.so
            ${CUDA_PATH}/lib64/libcublas.so)
 endif()
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/adagrad_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/adagrad_impl.cu
@@ -14,7 +14,8 @@
 * limitations under the License.
 */

 #include "plugin/device/gpu/kernel/cuda_impl/adagrad_impl.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/adagrad_impl.cuh"
 #include "include/cuda_fp16.h"

 template <typename T>
 __device__ __forceinline__ T SqrtFunc(T input) {
@@ -113,50 +114,50 @@ void ApplyAdagrad(const size_t size,
          size, update_slots, learning_rate, gradient, variable, accumulation);
 }

 template void ApplyAdagrad<float, float, float>(const size_t size,
                                  const bool update_slots,
                                  const float *learning_rate,
                                  const float *gradient,
                                  float *variable,
                                  float *accumulation,
                                  cudaStream_t cuda_stream);
 template CUDA_LIB_EXPORT void ApplyAdagrad<float, float, float>(const size_t size,
                                                                const bool update_slots,
                                                                const float *learning_rate,
                                                                const float *gradient,
                                                                float *variable,
                                                                float *accumulation,
                                                                cudaStream_t cuda_stream);

 template void ApplyAdagrad<half, half, half>(const size_t size,
                                 const bool update_slots,
                                 const half *learning_rate,
                                 const half *gradient,
                                 half *variable,
                                 half *accumulation,
                                 cudaStream_t cuda_stream);
 template CUDA_LIB_EXPORT void ApplyAdagrad<half, half, half>(const size_t size,
                                                             const bool update_slots,
                                                             const half *learning_rate,
                                                             const half *gradient,
                                                             half *variable,
                                                             half *accumulation,
                                                             cudaStream_t cuda_stream);

 template void ApplyAdagrad<half, float, half>(const size_t size,
                                 const bool update_slots,
                                 const float *learning_rate,
                                 const half *gradient,
                                 half *variable,
                                 half *accumulation,
                                 cudaStream_t cuda_stream);
 template CUDA_LIB_EXPORT void ApplyAdagrad<half, float, half>(const size_t size,
                                                              const bool update_slots,
                                                              const float *learning_rate,
                                                              const half *gradient,
                                                              half *variable,
                                                              half *accumulation,
                                                              cudaStream_t cuda_stream);

 template void ApplyAdagrad<float, float, half>(const size_t size,
                                 const bool update_slots,
                                 const float *learning_rate,
                                 const half *gradient,
                                 float *variable,
                                 float *accumulation,
                                 cudaStream_t cuda_stream);
 template CUDA_LIB_EXPORT void ApplyAdagrad<float, float, half>(const size_t size,
                                                               const bool update_slots,
                                                               const float *learning_rate,
                                                               const half *gradient,
                                                               float *variable,
                                                               float *accumulation,
                                                               cudaStream_t cuda_stream);

 template void ApplyAdagrad<float, half, float>(const size_t size,
                                 const bool update_slots,
                                 const half *learning_rate,
                                 const float *gradient,
                                 float *variable,
                                 float *accumulation,
                                 cudaStream_t cuda_stream);
 template CUDA_LIB_EXPORT void ApplyAdagrad<float, half, float>(const size_t size,
                                                               const bool update_slots,
                                                               const half *learning_rate,
                                                               const float *gradient,
                                                               float *variable,
                                                               float *accumulation,
                                                               cudaStream_t cuda_stream);

 template void ApplyAdagrad<half, float, float>(const size_t size,
                                 const bool update_slots,
                                 const float *learning_rate,
                                 const float *gradient,
                                 half *variable,
                                 half *accumulation,
                                 cudaStream_t cuda_stream);
 template CUDA_LIB_EXPORT void ApplyAdagrad<half, float, float>(const size_t size,
                                                               const bool update_slots,
                                                               const float *learning_rate,
                                                               const float *gradient,
                                                               half *variable,
                                                               half *accumulation,
                                                               cudaStream_t cuda_stream);
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/adagrad_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/adagrad_impl.cuh
@@ -0,0 +1,29 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ADAGRAD_IMPL_CUH_
 #define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ADAGRAD_IMPL_CUH_
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 template <typename T, typename S, typename G>
 CUDA_LIB_EXPORT void ApplyAdagrad(const size_t size,
                                  const bool update_slots,
                                  const S *learning_rate,
                                  const G *gradient,
                                  T *variable,
                                  T *accumulation,
                                  cudaStream_t stream);

 #endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ADAGRAD_IMPL_CUH_
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/adam_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/adam_impl.cu
@@ -14,7 +14,8 @@
 * limitations under the License.
 */

 #include "plugin/device/gpu/kernel/cuda_impl/adam_impl.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/adam_impl.cuh"
 #include "include/cuda_fp16.h"

 template <typename T>
 __device__ __forceinline__ T SqrtFunc(T input) {
@@ -82,16 +83,19 @@ void AdamWeightDecayOp(const size_t size, const T *gradient, const float *learni
                                                                           epsilon, decay, variable, m, v);
 }

 template void ApplyAdam<float>(const size_t size, const float *gradient, const float *beta1_power,
                               const float *beta2_power, const float *learning_rate, const float *beta1,
                               const float *beta2, const float *epsilon, float *variable, float *m, float *v,
                               cudaStream_t cuda_stream);
 template void ApplyAdam<half>(const size_t size, const half *gradient, const half *beta1_power, const half *beta2_power,
                              const half *learning_rate, const half *beta1, const half *beta2, const half *epsilon,
                              half *variable, half *m, half *v, cudaStream_t cuda_stream);
 template void AdamWeightDecayOp<float>(const size_t size, const float *gradient, const float *learning_rate,
                                       const float *beta1, const float *beta2, const float *epsilon, const float *decay,
                                       float *variable, float *m, float *v, cudaStream_t cuda_stream);
 template void AdamWeightDecayOp<half>(const size_t size, const half *gradient, const float *learning_rate,
                                      const float *beta1, const float *beta2, const float *epsilon, const float *decay,
                                      half *variable, half *m, half *v, cudaStream_t cuda_stream);
 template CUDA_LIB_EXPORT void ApplyAdam<float>(const size_t size, const float *gradient, const float *beta1_power,
                                               const float *beta2_power, const float *learning_rate, const float *beta1,
                                               const float *beta2, const float *epsilon, float *variable, float *m,
                                               float *v, cudaStream_t cuda_stream);
 template CUDA_LIB_EXPORT void ApplyAdam<half>(const size_t size, const half *gradient, const half *beta1_power,
                                              const half *beta2_power, const half *learning_rate, const half *beta1,
                                              const half *beta2, const half *epsilon, half *variable, half *m, half *v,
                                              cudaStream_t cuda_stream);
 template CUDA_LIB_EXPORT void AdamWeightDecayOp<float>(const size_t size, const float *gradient,
                                                       const float *learning_rate, const float *beta1,
                                                       const float *beta2, const float *epsilon, const float *decay,
                                                       float *variable, float *m, float *v, cudaStream_t cuda_stream);
 template CUDA_LIB_EXPORT void AdamWeightDecayOp<half>(const size_t size, const half *gradient,
                                                      const float *learning_rate, const float *beta1,
                                                      const float *beta2, const float *epsilon, const float *decay,
                                                      half *variable, half *m, half *v, cudaStream_t cuda_stream);
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/adam_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/adam_impl.cuh
@@ -0,0 +1,29 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ADAM_IMPL_CUH_
 #define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ADAM_IMPL_CUH_
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 template <typename T>
 CUDA_LIB_EXPORT void ApplyAdam(const size_t size, const T *gradient, const T *beta1_power, const T *beta2_power,
                               const T *learning_rate, const T *beta1, const T *beta2, const T *epsilon, T *variable,
                               T *m, T *v, cudaStream_t cuda_stream);
 template <typename T>
 CUDA_LIB_EXPORT void AdamWeightDecayOp(const size_t size, const T *gradient, const float *learning_rate,
                                       const float *beta1, const float *beta2, const float *epsilon, const float *decay,
                                       T *variable, T *m, T *v, cudaStream_t cuda_stream);

 #endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ADAM_IMPL_CUH_
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/adam_weight_decay_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/adam_weight_decay_impl.cu
@@ -15,7 +15,6 @@
 */

 #include "adam_weight_decay_impl.cuh"
 #include "plugin/device/gpu/hal/device/cuda_common.h"

 template <typename T>
 __global__ void AdamWeightDecayKernel(const int element_num_, const bool need_decay, const float *beta1,
@@ -44,7 +43,8 @@ void AdamWeightDecay(const int &element_num_, const bool &need_decay, const floa
    gradient);
 }

 template void AdamWeightDecay(const int &element_num_, const bool &need_decay, const float *beta1,
                              const float *one_sub_beta1, const float *beta2, const float *one_sub_beta2,
                              const float *epsilon, const float *lr, const float *weight_decay, float *m, float *v,
                              float *param, float *gradient, cudaStream_t stream);
 template CUDA_LIB_EXPORT void AdamWeightDecay(const int &element_num_, const bool &need_decay, const float *beta1,
                                              const float *one_sub_beta1, const float *beta2,
                                              const float *one_sub_beta2, const float *epsilon, const float *lr,
                                              const float *weight_decay, float *m, float *v, float *param,
                                              float *gradient, cudaStream_t stream);
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/adam_weight_decay_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/adam_weight_decay_impl.cuh
@@ -0,0 +1,26 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ADAM_WEIGHT_DECAY_IMPL_CUH_
 #define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ADAM_WEIGHT_DECAY_IMPL_CUH_
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 template <typename T>
 CUDA_LIB_EXPORT void AdamWeightDecay(const int &element_num_, const bool &need_decay, const float *beta1,
                                     const float *one_sub_beta1, const float *beta2, const float *one_sub_beta2,
                                     const float *epsilon, const float *lr, const float *weight_decay, T *m, T *v,
                                     T *param, T *gradient, cudaStream_t stream);

 #endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ADAM_WEIGHT_DECAY_IMPL_CUH_
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/adaptive_avg_pool2d_grad_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/adaptive_avg_pool2d_grad_impl.cu
@@ -14,7 +14,8 @@
 * limitations under the License.
 */

 #include "plugin/device/gpu/kernel/cuda_impl/adaptive_avg_pool2d_grad_impl.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/adaptive_avg_pool2d_grad_impl.cuh"
 #include "include/cuda_fp16.h"

 __device__ inline uint start_index(uint a, uint b, uint c) {
  return floorf(__uint2float_rn(a * c) / __uint2float_rn(b));
@@ -168,14 +169,17 @@ void ApplyAdaptiveAvgPool2DGrad(const uint size, const uint input_height, const
    size, input_height, input_width, output_height, output_width, input_data, output_data);
 }

 template void ApplyAdaptiveAvgPool2DGrad<float>(const uint size, const uint input_height, const uint input_width,
                                                const uint output_height, const uint output_width, float *input_data,
                                                float *output_data, cudaStream_t cuda_stream);
 template CUDA_LIB_EXPORT void ApplyAdaptiveAvgPool2DGrad<float>(const uint size, const uint input_height,
                                                                const uint input_width, const uint output_height,
                                                                const uint output_width, float *input_data,
                                                                float *output_data, cudaStream_t cuda_stream);

 template void ApplyAdaptiveAvgPool2DGrad<half>(const uint size, const uint input_height, const uint input_width,
                                               const uint output_height, const uint output_width, half *input_data,
                                               half *output_data, cudaStream_t cuda_stream);
 template CUDA_LIB_EXPORT void ApplyAdaptiveAvgPool2DGrad<half>(const uint size, const uint input_height,
                                                               const uint input_width, const uint output_height,
                                                               const uint output_width, half *input_data,
                                                               half *output_data, cudaStream_t cuda_stream);

 template void ApplyAdaptiveAvgPool2DGrad<double>(const uint size, const uint input_height, const uint input_width,
                                                 const uint output_height, const uint output_width, double *input_data,
                                                 double *output_data, cudaStream_t cuda_stream);
 template CUDA_LIB_EXPORT void ApplyAdaptiveAvgPool2DGrad<double>(const uint size, const uint input_height,
                                                                 const uint input_width, const uint output_height,
                                                                 const uint output_width, double *input_data,
                                                                 double *output_data, cudaStream_t cuda_stream);
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/adaptive_avg_pool2d_grad_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/adaptive_avg_pool2d_grad_impl.cuh
@@ -0,0 +1,25 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ADAPTIVE_AVGPOOL2D_GRAD_IMPL_CUH_
 #define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ADAPTIVE_AVGPOOL2D_GRAD_IMPL_CUH_
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 template <typename T>
 CUDA_LIB_EXPORT void ApplyAdaptiveAvgPool2DGrad(const uint size, const uint input_height, const uint input_width,
                                                const uint output_height, const uint output_width, T *input_data,
                                                T *output_data, cudaStream_t cuda_stream);

 #endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ADAPTIVE_AVGPOOL2D_GRAD_IMPL_CUH_
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/adaptive_avg_pool2d_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/adaptive_avg_pool2d_impl.cu
@@ -14,7 +14,8 @@
 * limitations under the License.
 */

 #include "plugin/device/gpu/kernel/cuda_impl/adaptive_avg_pool2d_impl.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/adaptive_avg_pool2d_impl.cuh"
 #include "include/cuda_fp16.h"

 __device__ inline uint start_index(uint a, uint b, uint c) {
  return floorf(__uint2float_rn(a * c) / __uint2float_rn(b));
@@ -155,14 +156,17 @@ void ApplyAdaptiveAvgPool2D(const uint size, const uint input_height, const uint
    size, input_height, input_width, output_height, output_width, input_data, output_data);
 }

 template void ApplyAdaptiveAvgPool2D<float>(const uint size, const uint input_height, const uint input_width,
                                            const uint output_height, const uint output_width, float *input_data,
                                            float *output_data, cudaStream_t cuda_stream);
 template CUDA_LIB_EXPORT void ApplyAdaptiveAvgPool2D<float>(const uint size, const uint input_height,
                                                            const uint input_width, const uint output_height,
                                                            const uint output_width, float *input_data,
                                                            float *output_data, cudaStream_t cuda_stream);

 template void ApplyAdaptiveAvgPool2D<half>(const uint size, const uint input_height, const uint input_width,
                                           const uint output_height, const uint output_width, half *input_data,
                                           half *output_data, cudaStream_t cuda_stream);
 template CUDA_LIB_EXPORT void ApplyAdaptiveAvgPool2D<half>(const uint size, const uint input_height,
                                                           const uint input_width, const uint output_height,
                                                           const uint output_width, half *input_data,
                                                           half *output_data, cudaStream_t cuda_stream);

 template void ApplyAdaptiveAvgPool2D<double>(const uint size, const uint input_height, const uint input_width,
                                             const uint output_height, const uint output_width, double *input_data,
                                             double *output_data, cudaStream_t cuda_stream);
 template CUDA_LIB_EXPORT void ApplyAdaptiveAvgPool2D<double>(const uint size, const uint input_height,
                                                             const uint input_width, const uint output_height,
                                                             const uint output_width, double *input_data,
                                                             double *output_data, cudaStream_t cuda_stream);
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/adaptive_avg_pool2d_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/adaptive_avg_pool2d_impl.cuh
@@ -0,0 +1,25 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ADAPTIVE_AVGPOOL2D_IMPL_CUH_
 #define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ADAPTIVE_AVGPOOL2D_IMPL_CUH_
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 template <typename T>
 CUDA_LIB_EXPORT void ApplyAdaptiveAvgPool2D(const uint size, const uint input_height, const uint input_width,
                                            const uint output_height, const uint output_width, T *input_data,
                                            T *output_data, cudaStream_t cuda_stream);

 #endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ADAPTIVE_AVGPOOL2D_IMPL_CUH_
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sigmoid_cross_entropy_with_logits_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/sigmoid_cross_entropy_with_logits_impl.cuh
@@ -14,12 +14,15 @@
 * limitations under the License.
 */

 #ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_SIGMOID_CROSS_ENTROPY_WITH_LOGITS_IMPL_H_
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_SIGMOID_CROSS_ENTROPY_WITH_LOGITS_IMPL_H_
 #ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ADD_RELU_IMPL_CUH_
 #define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ADD_RELU_IMPL_CUH_
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 template <typename T>
 CUDA_LIB_EXPORT void AddReluV2(const size_t num, const T *x1, const T *x2, T *y, uint32_t *mask,
                               cudaStream_t cuda_stream);

 #include "plugin/device/gpu/hal/device/cuda_common.h"
 template <typename T, typename S>
 void SigmoidCrossEntropyWithLogits(const size_t size, const T *logits, const S *labels, T *outputs,
 template <typename T>
 CUDA_LIB_EXPORT void AddReluGradV2(const size_t size, const T *x1, const T *x2, const uint32_t *mask, T *dx,
                                   cudaStream_t cuda_stream);

 #endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_SIGMOID_CROSS_ENTROPY_WITH_LOGITS_IMPL_H_
 #endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ADD_RELU_IMPL_CUH_
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/add_relu_v2_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/add_relu_v2_impl.cu
@@ -14,8 +14,8 @@
 * limitations under the License.
 */

 #include "plugin/device/gpu/kernel/cuda_impl/add_relu_v2_impl.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/util.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/add_relu_v2_impl.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/util.cuh"

 template <typename T>
 __global__ void AddReluV2Kernel(const size_t num, const T *x1, const T *x2, T *y, uint32_t *mask) {
@@ -49,20 +49,20 @@ void AddReluGradV2(const size_t num, const T *x1, const T *x2, const uint32_t *m
  AddReluGradV2Kernel<<<kBlocksPerGrid(num), kThreadsPerBlock, 0, cuda_stream>>>(num, x1, x2, mask, dx);
 }

 template void AddReluV2(const size_t num, const float *x1, const float *x2, float *y, uint32_t *mask,
                        cudaStream_t cuda_stream);
 template void AddReluV2(const size_t num, const half *x1, const half *x2, half *y, uint32_t *mask,
                        cudaStream_t cuda_stream);
 template void AddReluV2(const size_t num, const int32_t *x1, const int32_t *x2, int32_t *y, uint32_t *mask,
                        cudaStream_t cuda_stream);
 template void AddReluV2(const size_t num, const int64_t *x1, const int64_t *x2, int64_t *y, uint32_t *mask,
                        cudaStream_t cuda_stream);
 template CUDA_LIB_EXPORT void AddReluV2(const size_t num, const float *x1, const float *x2, float *y, uint32_t *mask,
                                        cudaStream_t cuda_stream);
 template CUDA_LIB_EXPORT void AddReluV2(const size_t num, const half *x1, const half *x2, half *y, uint32_t *mask,
                                        cudaStream_t cuda_stream);
 template CUDA_LIB_EXPORT void AddReluV2(const size_t num, const int32_t *x1, const int32_t *x2, int32_t *y,
                                        uint32_t *mask, cudaStream_t cuda_stream);
 template CUDA_LIB_EXPORT void AddReluV2(const size_t num, const int64_t *x1, const int64_t *x2, int64_t *y,
                                        uint32_t *mask, cudaStream_t cuda_stream);

 template void AddReluGradV2(const size_t num, const float *x1, const float *x2, const uint32_t *mask, float *dx,
                            cudaStream_t cuda_stream);
 template void AddReluGradV2(const size_t num, const half *x1, const half *x2, const uint32_t *mask, half *dx,
                            cudaStream_t cuda_stream);
 template void AddReluGradV2(const size_t num, const int32_t *x1, const int32_t *x2, const uint32_t *mask, int32_t *dx,
                            cudaStream_t cuda_stream);
 template void AddReluGradV2(const size_t num, const int64_t *x1, const int64_t *x2, const uint32_t *mask, int64_t *dx,
                            cudaStream_t cuda_stream);
 template CUDA_LIB_EXPORT void AddReluGradV2(const size_t num, const float *x1, const float *x2, const uint32_t *mask,
                                            float *dx, cudaStream_t cuda_stream);
 template CUDA_LIB_EXPORT void AddReluGradV2(const size_t num, const half *x1, const half *x2, const uint32_t *mask,
                                            half *dx, cudaStream_t cuda_stream);
 template CUDA_LIB_EXPORT void AddReluGradV2(const size_t num, const int32_t *x1, const int32_t *x2,
                                            const uint32_t *mask, int32_t *dx, cudaStream_t cuda_stream);
 template CUDA_LIB_EXPORT void AddReluGradV2(const size_t num, const int64_t *x1, const int64_t *x2,
                                            const uint32_t *mask, int64_t *dx, cudaStream_t cuda_stream);
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/add_relu_v2_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/add_relu_v2_impl.cuh
@@ -0,0 +1,28 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ADD_RELU_V2_IMPL_CUH_
 #define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ADD_RELU_V2_IMPL_CUH_
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
 template <typename T>
 CUDA_LIB_EXPORT void AddReluV2(const size_t num, const T *x1, const T *x2, T *y, uint32_t *mask,
                               cudaStream_t cuda_stream);

 template <typename T>
 CUDA_LIB_EXPORT void AddReluGradV2(const size_t size, const T *x1, const T *x2, const uint32_t *mask, T *dx,
                                   cudaStream_t cuda_stream);

 #endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_ADD_RELU_V2_IMPL_CUH_
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/apply_gradient_descent_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/apply_gradient_descent_impl.cu
@@ -14,7 +14,8 @@
 * limitations under the License.
 */

 #include "plugin/device/gpu/kernel/cuda_impl/apply_gradient_descent_impl.cuh"
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/apply_gradient_descent_impl.cuh"
 #include "include/cuda_fp16.h"

 template <typename T>
 __global__ void ApplyGradientDescent(const size_t size, T *var, const T *alpha, const T *delta, T *output) {
@@ -31,7 +32,8 @@ void CalApplyGradientDescent(const size_t &size, T *var, const T *alpha, const T
  ApplyGradientDescent<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, var, alpha, delta, output);
 }

 template void CalApplyGradientDescent<float>(const size_t &size, float *var, const float *alpha, const float *delta,
                                             float *output, cudaStream_t cuda_stream);
 template void CalApplyGradientDescent<half>(const size_t &size, half *var, const half *alpha, const half *delta,
                                            half *output, cudaStream_t cuda_stream);
 template CUDA_LIB_EXPORT void CalApplyGradientDescent<float>(const size_t &size, float *var, const float *alpha,
                                                             const float *delta, float *output,
                                                             cudaStream_t cuda_stream);
 template CUDA_LIB_EXPORT void CalApplyGradientDescent<half>(const size_t &size, half *var, const half *alpha,
                                                            const half *delta, half *output, cudaStream_t cuda_stream);
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/apply_gradient_descent_impl.cuh
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/apply_gradient_descent_impl.cuh
@@ -14,16 +14,13 @@
 * limitations under the License.
 */

 #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_HSIGMOID_IMPL_CUH_
 #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_HSIGMOID_IMPL_CUH_

 #ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_APPLY_GRADIENT_DESCENT_IMPL_CUH_
 #define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_APPLY_GRADIENT_DESCENT_IMPL_CUH_
 #include <cuda_runtime.h>
 #include "plugin/device/gpu/hal/device/cuda_common.h"

 template <typename T>
 void CalHSigmoid(const size_t &size, const T *input, T *output, cudaStream_t cuda_stream);
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"

 template <typename T>
 void CalHSigmoidGrad(const size_t &size, const T *dout, const T *x, T *output, cudaStream_t cuda_stream);
 CUDA_LIB_EXPORT void CalApplyGradientDescent(const size_t &size, T *var, const T *alpha, const T *delta, T *output,
                                             cudaStream_t cuda_stream);

 #endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_HSIGMOID_IMPL_CUH_
 #endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_APPLY_GRADIENT_DESCENT_IMPL_CUH_