| @@ -5,9 +5,6 @@ | |||
| [submodule "third_party/googletest"] | |||
| path = third_party/googletest | |||
| url = https://github.com/google/googletest.git | |||
| [submodule "third_party/incubator-tvm"] | |||
| path = third_party/incubator-tvm | |||
| url = https://github.com/apache/incubator-tvm.git | |||
| [submodule "third_party/protobuf"] | |||
| path = third_party/protobuf | |||
| url = https://github.com/protocolbuffers/protobuf.git | |||
| @@ -17,7 +14,7 @@ | |||
| url = https://gitee.com/mindspore/akg.git | |||
| [submodule "graphengine"] | |||
| path = graphengine | |||
| url = https://gitee.com/ms-incubator/graphengine.git | |||
| url = https://gitee.com/mindspore/graphengine.git | |||
| [submodule "third_party/OpenCL-CLHPP"] | |||
| path = third_party/OpenCL-CLHPP | |||
| url = https://github.com/KhronosGroup/OpenCL-CLHPP.git | |||
| @@ -98,6 +98,7 @@ endif() | |||
| set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=hidden") | |||
| add_subdirectory(mindspore/ccsrc) | |||
| add_subdirectory(mindspore/core) | |||
| if (ENABLE_TESTCASES) | |||
| add_subdirectory(tests) | |||
| endif() | |||
| @@ -1 +1 @@ | |||
| Subproject commit 5fe7e5c8377dccfd35c9f661e10ed3dc136208c5 | |||
| Subproject commit 8f9af74f59837579034610a741f5b8f33db12515 | |||
| @@ -109,7 +109,7 @@ checkopts() | |||
| ENABLE_GPU="off" | |||
| # Process the options | |||
| while getopts 'drvj:c:t:hsb:a:g:p:ie:m:l:I:LRP:Q:D:zM:V:K:swB:En' opt | |||
| while getopts 'drvj:c:t:hsb:a:g:p:ie:m:l:I:LRP:Q:D:zM:V:K:swB:EnT:' opt | |||
| do | |||
| OPTARG=$(echo ${OPTARG} | tr '[A-Z]' '[a-z]') | |||
| case "${opt}" in | |||
| @@ -282,6 +282,11 @@ checkopts() | |||
| ENABLE_IBVERBS="on" | |||
| echo "enable IBVERBS for parameter server" | |||
| ;; | |||
| T) | |||
| check_on_off $OPTARG T | |||
| SUPPORT_TRAIN=$OPTARG | |||
| echo "support train on device " | |||
| ;; | |||
| *) | |||
| echo "Unknown option ${opt}!" | |||
| usage | |||
| @@ -397,7 +402,7 @@ checkndk() { | |||
| if [ "${ANDROID_NDK}" ]; then | |||
| echo -e "\e[31mANDROID_NDK_PATH=$ANDROID_NDK \e[0m" | |||
| else | |||
| echo -e "\e[31mplease set ANDROID_NDK_PATH in environment variable for example: export ANDROID_NDK=/root/usr/android-ndk-r20b/ \e[0m" | |||
| echo -e "\e[31mplease set ANDROID_NDK in environment variable for example: export ANDROID_NDK=/root/usr/android-ndk-r20b/ \e[0m" | |||
| exit 1 | |||
| fi | |||
| } | |||
| @@ -569,6 +574,39 @@ build_minddata_lite_deps() | |||
| build_jpeg_turbo | |||
| } | |||
| prepare_md_lite() { | |||
| if [ "${COMPILE_MINDDATA_LITE}" == "on" ]; then | |||
| echo "packaging minddata" | |||
| cp ${BASEPATH}/mindspore/ccsrc/minddata/dataset/include/*h ${OUTPUT_DIR}/include/ | |||
| cp ${BASEPATH}/mindspore/lite/build/minddata/libminddata-lite.so ${OUTPUT_DIR}/lib/ | |||
| if [[ "$LITE_PLATFORM" == "x86_64" ]]; then | |||
| mkdir -p ${OUTPUT_DIR}/third_party/libjpeg-turbo/lib | |||
| cp -r ${BASEPATH}/third_party/libjpeg-turbo/lib/libjpeg.so ${OUTPUT_DIR}/third_party/libjpeg-turbo/lib/ | |||
| cp -r ${BASEPATH}/third_party/libjpeg-turbo/lib/libturbojpeg.so ${OUTPUT_DIR}/third_party/libjpeg-turbo/lib/ | |||
| mkdir -p ${OUTPUT_DIR}/third_party/opencv/lib/ | |||
| cp -r ${BASEPATH}/third_party/opencv/build/lib/libopencv_core.so ${OUTPUT_DIR}/third_party/opencv/lib/ | |||
| cp -r ${BASEPATH}/third_party/opencv/build/lib/libopencv_imgcodecs.so ${OUTPUT_DIR}/third_party/opencv/lib/ | |||
| cp -r ${BASEPATH}/third_party/opencv/build/lib/libopencv_imgproc.so ${OUTPUT_DIR}/third_party/opencv/lib/ | |||
| elif [[ "$LITE_PLATFORM" == "arm64" ]]; then | |||
| mkdir -p ${OUTPUT_DIR}/third_party/libjpeg-turbo/lib | |||
| cp -r ${BASEPATH}/third_party/libjpeg-turbo/lib/libjpeg.so ${OUTPUT_DIR}/third_party/libjpeg-turbo/lib/ | |||
| cp -r ${BASEPATH}/third_party/libjpeg-turbo/lib/libturbojpeg.so ${OUTPUT_DIR}/third_party/libjpeg-turbo/lib/ | |||
| mkdir -p ${OUTPUT_DIR}/third_party/opencv/lib/arm64-v8a/ | |||
| cp -r ${BASEPATH}/third_party/opencv/build/lib/arm64-v8a/libopencv_core.so ${OUTPUT_DIR}/third_party/opencv/lib/arm64-v8a/ | |||
| cp -r ${BASEPATH}/third_party/opencv/build/lib/arm64-v8a/libopencv_imgcodecs.so ${OUTPUT_DIR}/third_party/opencv/lib/arm64-v8a/ | |||
| cp -r ${BASEPATH}/third_party/opencv/build/lib/arm64-v8a/libopencv_imgproc.so ${OUTPUT_DIR}/third_party/opencv/lib/arm64-v8a/ | |||
| elif [[ "$LITE_PLATFORM" == "arm32" ]]; then | |||
| mkdir -p ${OUTPUT_DIR}/third_party/libjpeg-turbo/lib | |||
| cp -r ${BASEPATH}/third_party/libjpeg-turbo/lib/libjpeg.so ${OUTPUT_DIR}/third_party/libjpeg-turbo/lib/ | |||
| cp -r ${BASEPATH}/third_party/libjpeg-turbo/lib/libturbojpeg.so ${OUTPUT_DIR}/third_party/libjpeg-turbo/lib/ | |||
| mkdir -p ${OUTPUT_DIR}/third_party/opencv/lib/armeabi-v7a/ | |||
| cp -r ${BASEPATH}/third_party/opencv/build/lib/armeabi-v7a/libopencv_core.so ${OUTPUT_DIR}/third_party/opencv/lib/armeabi-v7a/ | |||
| cp -r ${BASEPATH}/third_party/opencv/build/lib/armeabi-v7a/libopencv_imgcodecs.so ${OUTPUT_DIR}/third_party/opencv/lib/armeabi-v7a/ | |||
| cp -r ${BASEPATH}/third_party/opencv/build/lib/armeabi-v7a/libopencv_imgproc.so ${OUTPUT_DIR}/third_party/opencv/lib/armeabi-v7a/ | |||
| fi | |||
| fi | |||
| } | |||
| build_lite() | |||
| { | |||
| echo "start build mindspore lite project" | |||
| @@ -632,6 +670,7 @@ build_lite() | |||
| mkdir -p ${OUTPUT_DIR}/converter && mkdir -p ${OUTPUT_DIR}/time_profile | |||
| mkdir -p ${OUTPUT_DIR}/benchmark && mkdir -p ${OUTPUT_DIR}/include && mkdir -p ${OUTPUT_DIR}/lib | |||
| mkdir -p ${OUTPUT_DIR}/third_party | |||
| prepare_md_lite | |||
| cp ${BASEPATH}/mindspore/lite/build/tools/converter/converter_lite ${OUTPUT_DIR}/converter/ | |||
| cp ${BASEPATH}/mindspore/lite/build/tools/benchmark/benchmark ${OUTPUT_DIR}/benchmark/ | |||
| cp ${BASEPATH}/mindspore/lite/build/tools/time_profile/timeprofile ${OUTPUT_DIR}/time_profile/ | |||
| @@ -643,8 +682,7 @@ build_lite() | |||
| cp ${BASEPATH}/mindspore/lite/build/src/libmindspore-lite.so ${OUTPUT_DIR}/lib/ | |||
| mkdir -p ${OUTPUT_DIR}/third_party/protobuf/lib | |||
| cp -r ${BASEPATH}/third_party/protobuf/build/include/ ${OUTPUT_DIR}/third_party/protobuf/ | |||
| cp -r ${BASEPATH}/third_party/protobuf/build/lib/libprotobuf.so.19 ${OUTPUT_DIR}/third_party/protobuf/lib/ | |||
| cp -r ${BASEPATH}/third_party/protobuf/build/lib/libprotobuf.so.19.0.0 ${OUTPUT_DIR}/third_party/protobuf/lib/ | |||
| cp -r ${BASEPATH}/third_party/protobuf/build/lib/libprotobuf.so.19.0.0 ${OUTPUT_DIR}/third_party/protobuf/lib/libprotobuf.so.19 | |||
| mkdir -p ${OUTPUT_DIR}/third_party/flatbuffers | |||
| cp -r ${BASEPATH}/third_party/flatbuffers/include/ ${OUTPUT_DIR}/third_party/flatbuffers/ | |||
| cd .. | |||
| @@ -657,6 +695,7 @@ build_lite() | |||
| mkdir -p ${OUTPUT_DIR}/time_profile && mkdir -p ${OUTPUT_DIR}/benchmark | |||
| mkdir -p ${OUTPUT_DIR}/include && mkdir -p ${OUTPUT_DIR}/lib | |||
| mkdir -p ${OUTPUT_DIR}/third_party | |||
| prepare_md_lite | |||
| cp ${BASEPATH}/mindspore/lite/build/tools/benchmark/benchmark ${OUTPUT_DIR}/benchmark/ | |||
| cp ${BASEPATH}/mindspore/lite/build/tools/time_profile/timeprofile ${OUTPUT_DIR}/time_profile/ | |||
| cp ${BASEPATH}/mindspore/lite/include/*.h ${OUTPUT_DIR}/include/ | |||
| @@ -677,6 +716,7 @@ build_lite() | |||
| mkdir -p ${OUTPUT_DIR}/time_profile && mkdir -p ${OUTPUT_DIR}/benchmark | |||
| mkdir -p ${OUTPUT_DIR}/include && mkdir -p ${OUTPUT_DIR}/lib | |||
| mkdir -p ${OUTPUT_DIR}/third_party | |||
| prepare_md_lite | |||
| cp ${BASEPATH}/mindspore/lite/build/tools/benchmark/benchmark ${OUTPUT_DIR}/benchmark/ | |||
| cp ${BASEPATH}/mindspore/lite/build/tools/time_profile/timeprofile ${OUTPUT_DIR}/time_profile/ | |||
| cp ${BASEPATH}/mindspore/lite/include/*.h ${OUTPUT_DIR}/include/ | |||
| @@ -8,11 +8,12 @@ endif() | |||
| set(jpeg_turbo_LDFLAGS "-Wl,-z,relro,-z,now,-z,noexecstack") | |||
| mindspore_add_pkg(jpeg_turbo | |||
| VER 2.0.4 | |||
| LIBS jpeg | |||
| LIBS jpeg turbojpeg | |||
| URL https://github.com/libjpeg-turbo/libjpeg-turbo/archive/2.0.4.tar.gz | |||
| MD5 44c43e4a9fb352f47090804529317c88 | |||
| CMAKE_OPTION -DCMAKE_BUILD_TYPE=Release -DCMAKE_SKIP_RPATH=TRUE | |||
| CMAKE_OPTION -DCMAKE_BUILD_TYPE=Release -DCMAKE_SKIP_RPATH=TRUE -DWITH_SIMD=ON | |||
| PATCHES ${CMAKE_SOURCE_DIR}/third_party/patch/jpeg_turbo/jpeg_turbo.patch001 | |||
| ) | |||
| include_directories(${jpeg_turbo_INC}) | |||
| add_library(mindspore::jpeg_turbo ALIAS jpeg_turbo::jpeg) | |||
| add_library(mindspore::turbojpeg ALIAS jpeg_turbo::turbojpeg) | |||
| @@ -52,12 +52,6 @@ install( | |||
| COMPONENT mindspore | |||
| ) | |||
| install( | |||
| TARGETS mindspore_gvar | |||
| DESTINATION ${INSTALL_LIB_DIR} | |||
| COMPONENT mindspore | |||
| ) | |||
| if (USE_GLOG) | |||
| file(GLOB_RECURSE GLOG_LIB_LIST ${glog_LIBPATH}/libglog*) | |||
| install( | |||
| @@ -146,15 +140,6 @@ if (ENABLE_MPI) | |||
| COMPONENT mindspore | |||
| ) | |||
| endif () | |||
| file(GLOB_RECURSE MPI_LIB_LIST | |||
| ${ompi_LIBPATH}/libmpi${CMAKE_SHARED_LIBRARY_SUFFIX}* | |||
| ${ompi_LIBPATH}/libopen*${CMAKE_SHARED_LIBRARY_SUFFIX}* | |||
| ) | |||
| install( | |||
| FILES ${MPI_LIB_LIST} | |||
| DESTINATION ${INSTALL_LIB_DIR} | |||
| COMPONENT mindspore | |||
| ) | |||
| endif () | |||
| if (ENABLE_GPU) | |||
| @@ -1 +1 @@ | |||
| Subproject commit 377b2165184fbfbb32829266822438e439861f14 | |||
| Subproject commit 622af6c1c50034bea5a08bd409c5a410782bfe53 | |||
| @@ -22,7 +22,7 @@ from .parser import (Parser, create_obj_instance, generate_scope, | |||
| get_dataclass_attributes, get_dataclass_methods, get_obj_id, | |||
| get_module_namespace, get_obj_type, get_object_key, | |||
| get_parse_method_of_class, get_scope_name, | |||
| is_class_member, parse_cb, resolve_symbol) | |||
| is_class_member, parse_cb, resolve_symbol, convert_to_ms_tensor) | |||
| from .serialize import * | |||
| __all__ = ['parse_cb', 'get_parse_method_of_class', 'get_bprop_method_of_class', 'resolve_symbol', | |||
| @@ -30,4 +30,4 @@ __all__ = ['parse_cb', 'get_parse_method_of_class', 'get_bprop_method_of_class', | |||
| 'get_obj_type', 'get_obj_id', 'create_obj_instance', 'get_module_namespace', | |||
| 'get_class_member_namespace_symbol', 'get_obj_id', 'Parser', 'get_dataclass_attributes', | |||
| 'get_dataclass_methods', 'dump_obj', 'load_obj', 'get_dataclass_methods', 'get_scope_name', | |||
| 'create_slice_obj'] | |||
| 'create_slice_obj', 'convert_to_ms_tensor'] | |||
| @@ -25,6 +25,7 @@ from dataclasses import is_dataclass | |||
| import asttokens | |||
| import mindspore.nn as nn | |||
| from mindspore import log as logger | |||
| from mindspore import Tensor as MsTensor | |||
| from mindspore import ops | |||
| from mindspore.common.dtype import pytype_to_dtype | |||
| from mindspore.common.api import _MindSporeFunction | |||
| @@ -316,6 +317,11 @@ def get_dataclass_methods(cls): | |||
| return methods | |||
| def convert_to_ms_tensor(data): | |||
| """Convert C++ tensor to mindspore tensor.""" | |||
| return MsTensor(data) | |||
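The new helper is also exported from mindspore._extends.parse (see the `__all__` change above), so the pipeline can rewrap backend tensors on the Python side. A minimal usage sketch; the NumPy input is only a stand-in for whatever the C++ layer hands back:

```python
# Sketch only: convert_to_ms_tensor simply rewraps its argument as a mindspore.Tensor.
import numpy as np
from mindspore import Tensor as MsTensor
from mindspore._extends.parse import convert_to_ms_tensor

out = convert_to_ms_tensor(np.ones((2, 3), dtype=np.float32))
assert isinstance(out, MsTensor)
```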
| class Parser: | |||
| """ | |||
| Parser python code to ast tree. | |||
| @@ -130,7 +130,7 @@ set(SUB_COMP | |||
| frontend/operator | |||
| pipeline/jit | |||
| pipeline/pynative | |||
| common debug gvar pybind_api utils vm | |||
| common debug pybind_api utils vm | |||
| ) | |||
| foreach (_comp ${SUB_COMP}) | |||
| @@ -141,32 +141,21 @@ foreach (_comp ${SUB_COMP}) | |||
| add_dependencies(_mindspore_${sub}_obj proto_input ) | |||
| endif () | |||
| endforeach () | |||
| add_subdirectory(${CMAKE_SOURCE_DIR}/mindspore/core/base base) | |||
| list(APPEND SUB_OBJECTS_SRC $<TARGET_OBJECTS:_mindspore_base_obj>) | |||
| add_subdirectory(${CMAKE_SOURCE_DIR}/mindspore/core/abstract abstract) | |||
| list(APPEND SUB_OBJECTS_SRC $<TARGET_OBJECTS:_mindspore_abstract_obj>) | |||
| add_subdirectory(${CMAKE_SOURCE_DIR}/mindspore/core/utils util) | |||
| list(APPEND SUB_OBJECTS_SRC $<TARGET_OBJECTS:_mindspore_core_utils_obj>) | |||
| add_subdirectory(${CMAKE_SOURCE_DIR}/mindspore/core/ir ir) | |||
| list(APPEND SUB_OBJECTS_SRC $<TARGET_OBJECTS:_mindspore_ir_obj>) | |||
| add_dependencies(_mindspore_core_utils_obj _mindspore_base_obj _mindspore_ir_obj _mindspore_abstract_obj proto_input ) | |||
| set_property(SOURCE ${SUB_OBJECTS_SRC} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_ME) | |||
| add_library(mindspore STATIC ${SUB_OBJECTS_SRC}) | |||
| target_link_libraries(proto_input mindspore::protobuf) | |||
| target_link_libraries(mindspore mindspore_core) | |||
| if (ENABLE_DEBUGGER) | |||
| # debugger: link grpc | |||
| target_link_libraries(proto_input mindspore::grpc++) | |||
| endif() | |||
| target_link_libraries(mindspore proto_input) | |||
| if (ENABLE_MPI AND ENABLE_CPU) | |||
| target_link_libraries(mindspore securec mindspore::flatbuffers mpi_adapter) | |||
| else () | |||
| target_link_libraries(mindspore securec mindspore::flatbuffers) | |||
| endif () | |||
| target_link_libraries(mindspore securec mindspore::flatbuffers) | |||
| if (NOT WIN32) | |||
| target_link_libraries(mindspore dl) | |||
| @@ -242,7 +231,6 @@ set_target_properties(_c_expression PROPERTIES INSTALL_RPATH ${ORIGIN_PATH}) | |||
| if (CMAKE_SYSTEM_NAME MATCHES "Windows") | |||
| target_link_libraries(mindspore mindspore::pybind11_module) | |||
| target_link_libraries(mindspore mindspore_gvar) | |||
| target_link_libraries(_c_expression PRIVATE -Wl,--whole-archive mindspore -Wl,--no-whole-archive) | |||
| else () | |||
| if (ENABLE_CPU AND (ENABLE_D OR ENABLE_GPU)) | |||
| @@ -253,7 +241,6 @@ else () | |||
| endif() | |||
| target_link_libraries(_c_expression PRIVATE -Wl,--whole-archive mindspore -Wl,--no-whole-archive) | |||
| target_link_libraries(_c_expression PRIVATE mindspore::pybind11_module) | |||
| target_link_libraries(_c_expression PRIVATE mindspore_gvar) | |||
| endif () | |||
| if (USE_GLOG) | |||
| @@ -297,7 +284,7 @@ add_library(inference SHARED | |||
| ${LOAD_ONNX_SRC} | |||
| ) | |||
| target_link_libraries(inference PRIVATE ${PYTHON_LIBRARIES} ${SECUREC_LIBRARY} | |||
| -Wl,--whole-archive mindspore -Wl,--no-whole-archive mindspore_gvar mindspore::protobuf) | |||
| -Wl,--whole-archive mindspore -Wl,--no-whole-archive mindspore::protobuf) | |||
| if (ENABLE_CPU) | |||
| target_link_libraries(inference PRIVATE mindspore::dnnl mindspore::mkldnn) | |||
| @@ -25,6 +25,7 @@ | |||
| #include "backend/kernel_compiler/kernel.h" | |||
| #include "ir/dtype.h" | |||
| #include "ir/primitive.h" | |||
| #include "pybind11/pybind11.h" | |||
| #include <nlohmann/json.hpp> | |||
| #include "backend/kernel_compiler/common_utils.h" | |||
| #include "backend/kernel_compiler/oplib/oplib.h" | |||
| @@ -15,7 +15,7 @@ | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/allgather_cpu_kernel.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| #include "runtime/device/cpu/mpi/mpi_adapter.h" | |||
| #include "runtime/device/cpu/mpi/mpi_interface.h" | |||
| #include "utils/log_adapter.h" | |||
| namespace mindspore { | |||
| @@ -45,9 +45,7 @@ bool AllGatherCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, | |||
| auto input_addr = reinterpret_cast<float *>(inputs[0]->addr); | |||
| auto output_addr = reinterpret_cast<float *>(outputs[0]->addr); | |||
| auto input_data_num = inputs[0]->size / sizeof(float); | |||
| auto mpi_instance = device::cpu::MPIAdapter::Instance(); | |||
| MS_EXCEPTION_IF_NULL(mpi_instance); | |||
| return mpi_instance->AllGather(input_addr, output_addr, ranks_group_, input_data_num); | |||
| return MPIAllGather(input_addr, output_addr, ranks_group_, input_data_num); | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -16,7 +16,7 @@ | |||
| #include <thread> | |||
| #include "backend/kernel_compiler/cpu/embedding_look_up_comm_grad_cpu_kernel.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| #include "runtime/device/cpu/mpi/mpi_adapter.h" | |||
| #include "runtime/device/cpu/mpi/mpi_interface.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| @@ -49,11 +49,8 @@ bool EmbeddingLookUpCommGradCPUKernel::Launch(const std::vector<kernel::AddressP | |||
| const std::vector<int> &rank_group = {0, 1, 2, 3, 4, 5, 6, 7}; | |||
| size_t input_split_lens = input_size / split_num_ / sizeof(float_t); | |||
| size_t output_split_lens = output_size / split_num_ / sizeof(float_t); | |||
| auto mpi_instance = device::cpu::MPIAdapter::Instance(); | |||
| MS_EXCEPTION_IF_NULL(mpi_instance); | |||
| for (int i = 0; i < split_num_; i++) { | |||
| mpi_instance->AllGather(input_addr + i * input_split_lens, output_addr + i * output_split_lens, rank_group, | |||
| input_split_lens); | |||
| MPIAllGather(input_addr + i * input_split_lens, output_addr + i * output_split_lens, rank_group, input_split_lens); | |||
| } | |||
| #if defined(_WIN32) || defined(_WIN64) | |||
| auto end_time = std::chrono::steady_clock::now(); | |||
| @@ -15,7 +15,7 @@ | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/reduce_scatter_cpu_kernel.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| #include "runtime/device/cpu/mpi/mpi_adapter.h" | |||
| #include "runtime/device/cpu/mpi/mpi_interface.h" | |||
| #include "ir/primitive.h" | |||
| namespace mindspore { | |||
| @@ -24,7 +24,7 @@ namespace { | |||
| constexpr auto kRanksGroup = "group"; | |||
| } // namespace | |||
| ReduceScatterCPUKernel::ReduceScatterCPUKernel() : op_type_(device::cpu::kOpTypeSum) {} | |||
| ReduceScatterCPUKernel::ReduceScatterCPUKernel() : op_type_(kMPIOpTypeSum) {} | |||
| void ReduceScatterCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| auto op = AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("op"); | |||
| @@ -46,9 +46,7 @@ bool ReduceScatterCPUKernel::Launch(const std::vector<kernel::AddressPtr> &input | |||
| auto input_addr = reinterpret_cast<float *>(inputs[0]->addr); | |||
| auto output_addr = reinterpret_cast<float *>(outputs[0]->addr); | |||
| auto output_data_num = outputs[0]->size / sizeof(float); | |||
| auto mpi_instance = device::cpu::MPIAdapter::Instance(); | |||
| MS_EXCEPTION_IF_NULL(mpi_instance); | |||
| return mpi_instance->ReduceScatter(input_addr, output_addr, ranks_group_, output_data_num, op_type_); | |||
| return MPIReduceScatter(input_addr, output_addr, ranks_group_, output_data_num, op_type_); | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -13,8 +13,9 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include <thread> | |||
| #include "backend/kernel_compiler/cpu/sub_cpu_kernel.h" | |||
| #include <sys/time.h> | |||
| #include <thread> | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| namespace mindspore { | |||
| @@ -182,30 +182,59 @@ class ArrayReduceGpuKernel : public GpuKernel { | |||
| void InferInAndOutDesc(const std::vector<size_t> &input_shape, const std::vector<size_t> &output_shape) { | |||
| std::vector<int> inputA; | |||
| std::vector<size_t> outputC_shape = output_shape; | |||
| ShapeNdTo4d(input_shape, &inputA); | |||
| CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensor4dDescriptor(inputA_descriptor_, CUDNN_TENSOR_NCHW, data_type_, inputA[0], | |||
| inputA[1], inputA[2], inputA[3]), | |||
| "cudnnSetTensor4dDescriptor failed"); | |||
| const int split_dim = 4; | |||
| if (input_shape.size() <= split_dim) { | |||
| ShapeNdTo4d(input_shape, &inputA); | |||
| CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensor4dDescriptor(inputA_descriptor_, CUDNN_TENSOR_NCHW, data_type_, | |||
| inputA[0], inputA[1], inputA[2], inputA[3]), | |||
| "cudnnSetTensor4dDescriptor failed"); | |||
| } else { | |||
| CudnnSetTensorNdDescriptor(input_shape, inputA_descriptor_, data_type_); | |||
| for (auto dim : input_shape) { | |||
| inputA.emplace_back(SizeToInt(dim)); | |||
| } | |||
| } | |||
| if (axis_[0] == -1) { | |||
| CHECK_CUDNN_RET_WITH_EXCEPT( | |||
| cudnnSetTensor4dDescriptor(outputC_descriptor_, CUDNN_TENSOR_NCHW, data_type_, 1, 1, 1, 1), | |||
| "cudnnSetTensor4dDescriptor failed"); | |||
| if (inputA[0] == 1 && inputA[1] == 1 && inputA[2] == 1 && inputA[3] == 1) { | |||
| all_match_ = true; | |||
| outputC_shape.resize(input_shape.size(), 1); | |||
| if (outputC_shape.size() <= split_dim) { | |||
| CHECK_CUDNN_RET_WITH_EXCEPT( | |||
| cudnnSetTensor4dDescriptor(outputC_descriptor_, CUDNN_TENSOR_NCHW, data_type_, 1, 1, 1, 1), | |||
| "cudnnSetTensor4dDescriptor failed"); | |||
| } else { | |||
| CudnnSetTensorNdDescriptor(outputC_shape, outputC_descriptor_, data_type_); | |||
| } | |||
| for (auto dim : inputA) { | |||
| if (dim != 1) { | |||
| return; | |||
| } | |||
| } | |||
| all_match_ = true; | |||
| return; | |||
| } | |||
| std::vector<int> outputC; | |||
| if (!keep_dims_) { | |||
| for (auto i : axis_) { | |||
| (void)(outputC_shape.insert(outputC_shape.begin() + i, 1)); | |||
| } | |||
| } | |||
| std::vector<int> outputC; | |||
| ShapeNdTo4d(outputC_shape, &outputC); | |||
| CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensor4dDescriptor(outputC_descriptor_, CUDNN_TENSOR_NCHW, data_type_, | |||
| outputC[0], outputC[1], outputC[2], outputC[3]), | |||
| "cudnnSetTensor4dDescriptor failed"); | |||
| if (outputC_shape.size() <= split_dim) { | |||
| ShapeNdTo4d(outputC_shape, &outputC); | |||
| CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensor4dDescriptor(outputC_descriptor_, CUDNN_TENSOR_NCHW, data_type_, | |||
| outputC[0], outputC[1], outputC[2], outputC[3]), | |||
| "cudnnSetTensor4dDescriptor failed"); | |||
| } else { | |||
| CudnnSetTensorNdDescriptor(outputC_shape, outputC_descriptor_, data_type_); | |||
| for (auto dim : outputC_shape) { | |||
| outputC.emplace_back(SizeToInt(dim)); | |||
| } | |||
| } | |||
| if (inputA == outputC) { | |||
| all_match_ = true; | |||
| } | |||
| @@ -69,6 +69,10 @@ class ScatterNdGpuFwdKernel : public GpuKernel { | |||
| memcpy_flag_ = true; | |||
| } | |||
| CHECK_CUDA_RET_WITH_EXCEPT( | |||
| cudaMemsetAsync(output, static_cast<T>(0.0), output_size_, reinterpret_cast<cudaStream_t>(stream_ptr)), | |||
| "cudaMemSet failed in ScatterNdGpuFwdKernel::Launch."); | |||
| const size_t input_size = input_size_ / sizeof(T); | |||
| const size_t output_size = output_size_ / sizeof(T); | |||
| @@ -14,6 +14,7 @@ | |||
| * limitations under the License. | |||
| */ | |||
| #include <vector> | |||
| #include "backend/kernel_compiler/gpu/cuda_impl/broadcast_impl.cuh" | |||
| #include "runtime/device/gpu/cuda_common.h" | |||
| @@ -54,6 +55,11 @@ struct RealDivFunc { | |||
| __device__ __forceinline__ S operator()(const T &lhs, const T &rhs) { return (lhs / rhs); } | |||
| }; | |||
| template <typename T, typename S> | |||
| struct DivFunc { | |||
| __device__ __forceinline__ S operator()(const T &lhs, const T &rhs) { return (lhs / rhs); } | |||
| }; | |||
| template <typename T, typename S> | |||
| struct MulFunc { | |||
| __device__ __forceinline__ S operator()(const T &lhs, const T &rhs) { return (lhs * rhs); } | |||
| @@ -77,7 +83,7 @@ struct FloorDivFunc { | |||
| template <> | |||
| struct FloorDivFunc<half, half> { | |||
| __device__ __forceinline__ half operator()(const half &lhs, const half &rhs) { | |||
| return __float2half(floor(__half2float(lhs)/ __half2float(rhs))); | |||
| return __float2half(floor(__half2float(lhs) / __half2float(rhs))); | |||
| } | |||
| }; | |||
| @@ -95,7 +101,6 @@ struct AbsGradFunc { | |||
| } | |||
| }; | |||
| template <> | |||
| struct PowerFunc<half, bool> { | |||
| // invalid branch | |||
| @@ -104,72 +109,100 @@ struct PowerFunc<half, bool> { | |||
| __device__ __forceinline__ int Index(const int &index, const int &dim) { return dim == 1 ? 0 : index; } | |||
| template <typename T, typename S, typename Func> | |||
| __device__ __forceinline__ void BroadcastOperator(const int &l0, const int &l1, const int &l2, const int &l3, | |||
| const int &r0, const int &r1, const int &r2, const int &r3, | |||
| const int &d0, const int &d1, const int &d2, const int &d3, | |||
| const T *input0, const T *input1, S *output) { | |||
| for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < d0 * d1 * d2 * d3; pos += blockDim.x * gridDim.x) { | |||
| int i = pos / (d1 * d2 * d3) % d0; | |||
| int j = pos / (d2 * d3) % d1; | |||
| int k = pos / d3 % d2; | |||
| int l = pos % d3; | |||
| const int &l4, const int &l5, const int &l6, const int &r0, | |||
| const int &r1, const int &r2, const int &r3, const int &r4, | |||
| const int &r5, const int &r6, const int &d0, const int &d1, | |||
| const int &d2, const int &d3, const int &d4, const int &d5, | |||
| const int &d6, const T *input0, const T *input1, S *output) { | |||
| for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < d0 * d1 * d2 * d3 * d4 * d5 * d6; | |||
| pos += blockDim.x * gridDim.x) { | |||
| int i = pos / (d1 * d2 * d3 * d4 * d5 * d6) % d0; | |||
| int j = pos / (d2 * d3 * d4 * d5 * d6) % d1; | |||
| int k = pos / (d3 * d4 * d5 * d6) % d2; | |||
| int l = pos / (d4 * d5 * d6) % d3; | |||
| int m = pos / (d5 * d6) % d4; | |||
| int n = pos / d6 % d5; | |||
| int o = pos % d6; | |||
| int l_index = Index(i, l0) * l1 * l2 * l3 + Index(j, l1) * l2 * l3 + Index(k, l2) * l3 + Index(l, l3); | |||
| int r_index = Index(i, r0) * r1 * r2 * r3 + Index(j, r1) * r2 * r3 + Index(k, r2) * r3 + Index(l, r3); | |||
| int l_index = Index(i, l0) * l1 * l2 * l3 * l4 * l5 * l6; | |||
| l_index += Index(j, l1) * l2 * l3 * l4 * l5 * l6; | |||
| l_index += Index(k, l2) * l3 * l4 * l5 * l6; | |||
| l_index += Index(l, l3) * l4 * l5 * l6; | |||
| l_index += Index(m, l4) * l5 * l6; | |||
| l_index += Index(n, l5) * l6; | |||
| l_index += Index(o, l6); | |||
| int r_index = Index(i, r0) * r1 * r2 * r3 * r4 * r5 * r6; | |||
| r_index += Index(j, r1) * r2 * r3 * r4 * r5 * r6; | |||
| r_index += Index(k, r2) * r3 * r4 * r5 * r6; | |||
| r_index += Index(l, r3) * r4 * r5 * r6; | |||
| r_index += Index(m, r4) * r5 * r6; | |||
| r_index += Index(n, r5) * r6; | |||
| r_index += Index(o, r6); | |||
| output[pos] = Func()(input0[l_index], input1[r_index]); | |||
| } | |||
| } | |||
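The operator above linearizes up to seven dimensions, and Index(i, dim) collapses a coordinate to 0 whenever the corresponding input dimension is 1 (the broadcast case). A pure-Python sketch of the same flat-index computation, assuming both input shapes have already been padded to seven entries as the kernel expects:

```python
def broadcast_index(coord, shape):
    """Flat input index for an output coordinate; size-1 dims broadcast (mirror of Index())."""
    idx = 0
    for c, d in zip(coord, shape):
        idx = idx * d + (0 if d == 1 else c)
    return idx

# lhs padded to 7 dims with shape (1, ..., 1, 3): the row coordinate collapses to 0,
# so output coordinate (..., 1, 2) reads lhs element 2.
lhs_shape = [1, 1, 1, 1, 1, 1, 3]
out_coord = [0, 0, 0, 0, 0, 1, 2]
print(broadcast_index(out_coord, lhs_shape))  # 2
```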
| template <typename T, typename S> | |||
| __global__ void BroadcastKernel(const int l0, const int l1, const int l2, const int l3, const int r0, const int r1, | |||
| const int r2, const int r3, const int d0, const int d1, const int d2, const int d3, | |||
| enum BroadcastOpType op, const T *input0, const T *input1, S *output) { | |||
| __global__ void BroadcastKernel(const int l0, const int l1, const int l2, const int l3, const int l4, const int l5, | |||
| const int l6, const int r0, const int r1, const int r2, const int r3, const int r4, | |||
| const int r5, const int r6, const int d0, const int d1, const int d2, const int d3, | |||
| const int d4, const int d5, const int d6, enum BroadcastOpType op, const T *input0, | |||
| const T *input1, S *output) { | |||
| switch (op) { | |||
| case BROADCAST_TYPE_GREATER: | |||
| return BroadcastOperator<T, S, GreaterFunc<T, S>>(l0, l1, l2, l3, r0, r1, r2, r3, d0, d1, d2, d3, input0, input1, | |||
| output); | |||
| return BroadcastOperator<T, S, GreaterFunc<T, S>>(l0, l1, l2, l3, l4, l5, l6, r0, r1, r2, r3, r4, r5, r6, d0, d1, | |||
| d2, d3, d4, d5, d6, input0, input1, output); | |||
| case BROADCAST_TYPE_LESS: | |||
| return BroadcastOperator<T, S, LessFunc<T, S>>(l0, l1, l2, l3, r0, r1, r2, r3, d0, d1, d2, d3, input0, input1, | |||
| output); | |||
| return BroadcastOperator<T, S, LessFunc<T, S>>(l0, l1, l2, l3, l4, l5, l6, r0, r1, r2, r3, r4, r5, r6, d0, d1, d2, | |||
| d3, d4, d5, d6, input0, input1, output); | |||
| case BROADCAST_TYPE_MINIMUM: | |||
| return BroadcastOperator<T, S, MinimumFunc<T, S>>(l0, l1, l2, l3, r0, r1, r2, r3, d0, d1, d2, d3, input0, input1, | |||
| output); | |||
| return BroadcastOperator<T, S, MinimumFunc<T, S>>(l0, l1, l2, l3, l4, l5, l6, r0, r1, r2, r3, r4, r5, r6, d0, d1, | |||
| d2, d3, d4, d5, d6, input0, input1, output); | |||
| case BROADCAST_TYPE_MAXIMUM: | |||
| return BroadcastOperator<T, S, MaximumFunc<T, S>>(l0, l1, l2, l3, r0, r1, r2, r3, d0, d1, d2, d3, input0, input1, | |||
| output); | |||
| return BroadcastOperator<T, S, MaximumFunc<T, S>>(l0, l1, l2, l3, l4, l5, l6, r0, r1, r2, r3, r4, r5, r6, d0, d1, | |||
| d2, d3, d4, d5, d6, input0, input1, output); | |||
| case BROADCAST_TYPE_POWER: | |||
| return BroadcastOperator<T, S, PowerFunc<T, S>>(l0, l1, l2, l3, r0, r1, r2, r3, d0, d1, d2, d3, input0, input1, | |||
| output); | |||
| return BroadcastOperator<T, S, PowerFunc<T, S>>(l0, l1, l2, l3, l4, l5, l6, r0, r1, r2, r3, r4, r5, r6, d0, d1, | |||
| d2, d3, d4, d5, d6, input0, input1, output); | |||
| case BROADCAST_TYPE_REALDIV: | |||
| return BroadcastOperator<T, S, RealDivFunc<T, S>>(l0, l1, l2, l3, r0, r1, r2, r3, d0, d1, d2, d3, input0, input1, | |||
| output); | |||
| return BroadcastOperator<T, S, RealDivFunc<T, S>>(l0, l1, l2, l3, l4, l5, l6, r0, r1, r2, r3, r4, r5, r6, d0, d1, | |||
| d2, d3, d4, d5, d6, input0, input1, output); | |||
| case BROADCAST_TYPE_MUL: | |||
| return BroadcastOperator<T, S, MulFunc<T, S>>(l0, l1, l2, l3, r0, r1, r2, r3, d0, d1, d2, d3, input0, input1, | |||
| output); | |||
| return BroadcastOperator<T, S, MulFunc<T, S>>(l0, l1, l2, l3, l4, l5, l6, r0, r1, r2, r3, r4, r5, r6, d0, d1, d2, | |||
| d3, d4, d5, d6, input0, input1, output); | |||
| case BROADCAST_TYPE_SUB: | |||
| return BroadcastOperator<T, S, SubFunc<T, S>>(l0, l1, l2, l3, r0, r1, r2, r3, d0, d1, d2, d3, input0, input1, | |||
| output); | |||
| return BroadcastOperator<T, S, SubFunc<T, S>>(l0, l1, l2, l3, l4, l5, l6, r0, r1, r2, r3, r4, r5, r6, d0, d1, d2, | |||
| d3, d4, d5, d6, input0, input1, output); | |||
| case BROADCAST_TYPE_ADD: | |||
| return BroadcastOperator<T, S, AddFunc<T, S>>(l0, l1, l2, l3, r0, r1, r2, r3, d0, d1, d2, d3, input0, input1, | |||
| output); | |||
| return BroadcastOperator<T, S, AddFunc<T, S>>(l0, l1, l2, l3, l4, l5, l6, r0, r1, r2, r3, r4, r5, r6, d0, d1, d2, | |||
| d3, d4, d5, d6, input0, input1, output); | |||
| case BROADCAST_TYPE_FLOORDIV: | |||
| return BroadcastOperator<T, S, FloorDivFunc<T, S>>(l0, l1, l2, l3, r0, r1, r2, r3, d0, d1, d2, d3, input0, input1, | |||
| output); | |||
| return BroadcastOperator<T, S, FloorDivFunc<T, S>>(l0, l1, l2, l3, l4, l5, l6, r0, r1, r2, r3, r4, r5, r6, d0, d1, | |||
| d2, d3, d4, d5, d6, input0, input1, output); | |||
| case BROADCAST_TYPE_ABSGRAD: | |||
| return BroadcastOperator<T, S, AbsGradFunc<T, S>>(l0, l1, l2, l3, r0, r1, r2, r3, d0, d1, d2, d3, input0, input1, | |||
| output); | |||
| return BroadcastOperator<T, S, AbsGradFunc<T, S>>(l0, l1, l2, l3, l4, l5, l6, r0, r1, r2, r3, r4, r5, r6, d0, d1, | |||
| d2, d3, d4, d5, d6, input0, input1, output); | |||
| case BROADCAST_TYPE_DIV: | |||
| return BroadcastOperator<T, S, DivFunc<T, S>>(l0, l1, l2, l3, l4, l5, l6, r0, r1, r2, r3, r4, r5, r6, d0, d1, d2, | |||
| d3, d4, d5, d6, input0, input1, output); | |||
| } | |||
| } | |||
| template <typename T, typename S> | |||
| void Broadcast(const int &l0, const int &l1, const int &l2, const int &l3, const int &r0, const int &r1, const int &r2, | |||
| const int &r3, const int &d0, const int &d1, const int &d2, const int &d3, enum BroadcastOpType op, | |||
| const T *input0, const T *input1, S *output, cudaStream_t stream) { | |||
| int size = d0 * d1 * d2 * d3; | |||
| BroadcastKernel<<<GET_BLOCKS(size), GET_THREADS, 0, stream>>>(l0, l1, l2, l3, r0, r1, r2, r3, d0, d1, d2, d3, op, | |||
| input0, input1, output); | |||
| void Broadcast(const std::vector<int> &lhs_shape, const std::vector<int> &rhs_shape, | |||
| const std::vector<int> &output_shape, enum BroadcastOpType op, const T *input0, const T *input1, | |||
| S *output, cudaStream_t stream) { | |||
| int size = 1; | |||
| for (auto d : output_shape) { | |||
| size *= d; | |||
| } | |||
| BroadcastKernel<<<GET_BLOCKS(size), GET_THREADS, 0, stream>>>( | |||
| lhs_shape[0], lhs_shape[1], lhs_shape[2], lhs_shape[3], lhs_shape[4], lhs_shape[5], lhs_shape[6], rhs_shape[0], | |||
| rhs_shape[1], rhs_shape[2], rhs_shape[3], rhs_shape[4], rhs_shape[5], rhs_shape[6], output_shape[0], | |||
| output_shape[1], output_shape[2], output_shape[3], output_shape[4], output_shape[5], output_shape[6], op, input0, | |||
| input1, output); | |||
| } | |||
| template <typename T, typename S, typename Func> | |||
| @@ -205,6 +238,8 @@ __global__ void NoBroadcastKernel(const int nums, enum BroadcastOpType op, const | |||
| return NoBroadcastOperator<T, S, FloorDivFunc<T, S>>(nums, input0, input1, output); | |||
| case BROADCAST_TYPE_ABSGRAD: | |||
| return NoBroadcastOperator<T, S, AbsGradFunc<T, S>>(nums, input0, input1, output); | |||
| case BROADCAST_TYPE_DIV: | |||
| return NoBroadcastOperator<T, S, DivFunc<T, S>>(nums, input0, input1, output); | |||
| } | |||
| } | |||
| @@ -215,8 +250,8 @@ void NoBroadcast(const int &nums, enum BroadcastOpType op, const T *input0, cons | |||
| } | |||
| template <typename T> | |||
| __global__ void BroadcastToKernel(const int i0, const int i1, const int i2, const int i3, const int o0, | |||
| const int o1, const int o2, const int o3, const T *input_addr, T *output_addr) { | |||
| __global__ void BroadcastToKernel(const int i0, const int i1, const int i2, const int i3, const int o0, const int o1, | |||
| const int o2, const int o3, const T *input_addr, T *output_addr) { | |||
| for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < o0 * o1 * o2 * o3; pos += blockDim.x * gridDim.x) { | |||
| int i = pos / (o1 * o2 * o3) % o0; | |||
| int j = pos / (o2 * o3) % o1; | |||
| @@ -233,33 +268,27 @@ void BroadcastTo(const int &i0, const int &i1, const int &i2, const int &i3, con | |||
| const int &o2, const int &o3, const T *input_addr, T *output_addr, cudaStream_t stream) { | |||
| int nums = o0 * o1 * o2 * o3; | |||
| BroadcastToKernel<<<GET_BLOCKS(nums), GET_THREADS, 0, stream>>>(i0, i1, i2, i3, o0, o1, o2, o3, input_addr, | |||
| output_addr); | |||
| output_addr); | |||
| } | |||
| template void Broadcast(const int &l0, const int &l1, const int &l2, const int &l3, const int &r0, const int &r1, | |||
| const int &r2, const int &r3, const int &d0, const int &d1, const int &d2, const int &d3, | |||
| enum BroadcastOpType op, const float *input0, const float *input1, bool *output, | |||
| cudaStream_t stream); | |||
| template void Broadcast(const int &l0, const int &l1, const int &l2, const int &l3, const int &r0, const int &r1, | |||
| const int &r2, const int &r3, const int &d0, const int &d1, const int &d2, const int &d3, | |||
| enum BroadcastOpType op, const float *input0, const float *input1, float *output, | |||
| cudaStream_t stream); | |||
| template void Broadcast(const int &l0, const int &l1, const int &l2, const int &l3, const int &r0, const int &r1, | |||
| const int &r2, const int &r3, const int &d0, const int &d1, const int &d2, const int &d3, | |||
| enum BroadcastOpType op, const half *input0, const half *input1, bool *output, | |||
| cudaStream_t stream); | |||
| template void Broadcast(const int &l0, const int &l1, const int &l2, const int &l3, const int &r0, const int &r1, | |||
| const int &r2, const int &r3, const int &d0, const int &d1, const int &d2, const int &d3, | |||
| enum BroadcastOpType op, const half *input0, const half *input1, half *output, | |||
| cudaStream_t stream); | |||
| template void Broadcast(const int &l0, const int &l1, const int &l2, const int &l3, const int &r0, const int &r1, | |||
| const int &r2, const int &r3, const int &d0, const int &d1, const int &d2, const int &d3, | |||
| enum BroadcastOpType op, const int *input0, const int *input1, int *output, | |||
| cudaStream_t stream); | |||
| template void Broadcast(const int &l0, const int &l1, const int &l2, const int &l3, const int &r0, const int &r1, | |||
| const int &r2, const int &r3, const int &d0, const int &d1, const int &d2, const int &d3, | |||
| enum BroadcastOpType op, const int *input0, const int *input1, bool *output, | |||
| cudaStream_t stream); | |||
| template void Broadcast(const std::vector<int> &lhs_shape, const std::vector<int> &rhs_shape, | |||
| const std::vector<int> &output_shape, enum BroadcastOpType op, const float *input0, | |||
| const float *input1, bool *output, cudaStream_t stream); | |||
| template void Broadcast(const std::vector<int> &lhs_shape, const std::vector<int> &rhs_shape, | |||
| const std::vector<int> &output_shape, enum BroadcastOpType op, const float *input0, | |||
| const float *input1, float *output, cudaStream_t stream); | |||
| template void Broadcast(const std::vector<int> &lhs_shape, const std::vector<int> &rhs_shape, | |||
| const std::vector<int> &output_shape, enum BroadcastOpType op, const half *input0, | |||
| const half *input1, bool *output, cudaStream_t stream); | |||
| template void Broadcast(const std::vector<int> &lhs_shape, const std::vector<int> &rhs_shape, | |||
| const std::vector<int> &output_shape, enum BroadcastOpType op, const half *input0, | |||
| const half *input1, half *output, cudaStream_t stream); | |||
| template void Broadcast(const std::vector<int> &lhs_shape, const std::vector<int> &rhs_shape, | |||
| const std::vector<int> &output_shape, enum BroadcastOpType op, const int *input0, | |||
| const int *input1, int *output, cudaStream_t stream); | |||
| template void Broadcast(const std::vector<int> &lhs_shape, const std::vector<int> &rhs_shape, | |||
| const std::vector<int> &output_shape, enum BroadcastOpType op, const int *input0, | |||
| const int *input1, bool *output, cudaStream_t stream); | |||
| template void NoBroadcast(const int &nums, enum BroadcastOpType op, const float *input0, const float *input1, | |||
| bool *output, cudaStream_t stream); | |||
| template void NoBroadcast(const int &nums, enum BroadcastOpType op, const float *input0, const float *input1, | |||
| @@ -268,10 +297,10 @@ template void NoBroadcast(const int &nums, enum BroadcastOpType op, const half * | |||
| bool *output, cudaStream_t stream); | |||
| template void NoBroadcast(const int &nums, enum BroadcastOpType op, const half *input0, const half *input1, | |||
| half *output, cudaStream_t stream); | |||
| template void NoBroadcast(const int &nums, enum BroadcastOpType op, const int *input0, const int *input1, | |||
| int *output, cudaStream_t stream); | |||
| template void NoBroadcast(const int &nums, enum BroadcastOpType op, const int *input0, const int *input1, | |||
| bool *output, cudaStream_t stream); | |||
| template void NoBroadcast(const int &nums, enum BroadcastOpType op, const int *input0, const int *input1, int *output, | |||
| cudaStream_t stream); | |||
| template void NoBroadcast(const int &nums, enum BroadcastOpType op, const int *input0, const int *input1, bool *output, | |||
| cudaStream_t stream); | |||
| template void BroadcastTo(const int &i0, const int &i1, const int &i2, const int &i3, const int &o0, const int &o1, | |||
| const int &o2, const int &o3, const float *input_addr, float *output_addr, | |||
| cudaStream_t stream); | |||
| @@ -17,6 +17,7 @@ | |||
| #ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BROADCAST_H_ | |||
| #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BROADCAST_H_ | |||
| #include <vector> | |||
| #include "runtime/device/gpu/cuda_common.h" | |||
| enum BroadcastOpType { | |||
| @@ -31,13 +32,14 @@ enum BroadcastOpType { | |||
| BROADCAST_TYPE_ADD = 8, | |||
| BROADCAST_TYPE_FLOORDIV = 9, | |||
| BROADCAST_TYPE_ABSGRAD = 10, | |||
| BROADCAST_TYPE_DIV = 11, | |||
| BROADCAST_TYPE_INVALID = 0xffffffff, | |||
| }; | |||
| template <typename T, typename S> | |||
| void Broadcast(const int &l0, const int &l1, const int &l2, const int &l3, const int &r0, const int &r1, const int &r2, | |||
| const int &r3, const int &d0, const int &d1, const int &d2, const int &d3, enum BroadcastOpType op, | |||
| const T *input0, const T *input1, S *output, cudaStream_t stream); | |||
| void Broadcast(const std::vector<int> &lhs_shape, const std::vector<int> &rhs_shape, | |||
| const std::vector<int> &output_shape, enum BroadcastOpType op, const T *input0, const T *input1, | |||
| S *output, cudaStream_t stream); | |||
| template <typename T, typename S> | |||
| void NoBroadcast(const int &size, enum BroadcastOpType op, const T *input0, const T *input1, S *output, | |||
| @@ -25,10 +25,10 @@ __global__ void CheckValidKernel(const size_t size, const T *box, const T *img_m | |||
| const size_t right_y = i * 4 + 3; | |||
| S valid_flag = false; | |||
| valid_flag |= !(box[left_x] >= 0.f); | |||
| valid_flag |= !(box[left_y] >= 0.f); | |||
| valid_flag |= !(img_metas[0] * img_metas[2] - 1.f >= box[right_x]); | |||
| valid_flag |= !(img_metas[1] * img_metas[2] - 1.f >= box[right_y]); | |||
| valid_flag |= !(box[left_x] >= static_cast<T>(0.0)); | |||
| valid_flag |= !(box[left_y] >= static_cast<T>(0.0)); | |||
| valid_flag |= !(img_metas[1] * img_metas[2] - static_cast<T>(1.0) >= box[right_x]); | |||
| valid_flag |= !(img_metas[0] * img_metas[2] - static_cast<T>(1.0) >= box[right_y]); | |||
| valid[i] = !valid_flag; | |||
| } | |||
| @@ -43,3 +43,5 @@ void CheckValid(const size_t &size, const T *box, const T *img_metas, S *valid, | |||
| template void CheckValid(const size_t &size, const float *box, const float *img_metas, bool *valid, | |||
| cudaStream_t cuda_stream); | |||
| template void CheckValid(const size_t &size, const half *box, const half *img_metas, bool *valid, | |||
| cudaStream_t cuda_stream); | |||
| @@ -18,12 +18,85 @@ | |||
| #include "runtime/device/gpu/cuda_common.h" | |||
| template <typename T> | |||
| __global__ void CumSumKernel(T *input, T *output, size_t dim0, size_t dim1, size_t dim2, size_t stride, | |||
| __global__ void Copy(T *input, T *output, size_t size) { | |||
| size_t step = blockDim.x * gridDim.x; | |||
| for (size_t write_index = blockIdx.x * blockDim.x + threadIdx.x; write_index < size; write_index += step) { | |||
| input[write_index] = output[write_index]; | |||
| } | |||
| } | |||
| template <typename T> | |||
| __global__ void LeftMove(const T *input, T *output, size_t dim0, size_t dim1, size_t dim2, size_t stride, | |||
| size_t stride2) { | |||
| size_t num = dim0 * dim2; | |||
| size_t i, k, offset; | |||
| size_t step = blockDim.x * gridDim.x; | |||
| for (size_t write_index = blockIdx.x * blockDim.x + threadIdx.x; write_index < num; write_index += step) { | |||
| i = write_index / dim2 % dim0; | |||
| k = write_index % dim2; | |||
| offset = i * stride + k; | |||
| for (size_t j = 0; j < dim1; ++j) { | |||
| size_t read_index = j * stride2 + offset; | |||
| if (j == 0) { | |||
| output[read_index] = 0; | |||
| } else { | |||
| size_t read_index2 = (j - 1) * stride2 + offset; | |||
| output[read_index] = input[read_index2]; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| template <typename T> | |||
| __global__ void RightMove(const T *input, T *output, size_t dim0, size_t dim1, size_t dim2, size_t stride, | |||
| size_t stride2) { | |||
| size_t num = dim0 * dim2; | |||
| size_t i, k, offset; | |||
| size_t step = blockDim.x * gridDim.x; | |||
| for (size_t write_index = blockIdx.x * blockDim.x + threadIdx.x; write_index < num; write_index += step) { | |||
| i = write_index / dim2 % dim0; | |||
| k = write_index % dim2; | |||
| offset = i * stride + k; | |||
| for (int j = dim1 - 1; j >= 0; --j) { | |||
| size_t read_index = j * stride2 + offset; | |||
| if (j == dim1 - 1) { | |||
| output[read_index] = 0; | |||
| } else { | |||
| size_t read_index2 = (j + 1) * stride2 + offset; | |||
| output[read_index] = input[read_index2]; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| template <typename T> | |||
| __global__ void CumSumKernelReverse(const T *input, T *output, size_t dim0, size_t dim1, size_t dim2, size_t stride, | |||
| size_t stride2) { | |||
| size_t num = dim0 * dim2; | |||
| size_t i, k, offset; | |||
| size_t step = blockDim.x * gridDim.x; | |||
| for (size_t write_index = blockIdx.x * blockDim.x + threadIdx.x; write_index < num; write_index += step) { | |||
| i = write_index / dim2 % dim0; | |||
| k = write_index % dim2; | |||
| offset = i * stride + k; | |||
| for (int j = dim1 - 1; j >= 0; --j) { | |||
| size_t read_index = j * stride2 + offset; | |||
| if (j == dim1 - 1) { | |||
| output[read_index] = input[read_index]; | |||
| } else { | |||
| size_t read_index2 = (j + 1) * stride2 + offset; | |||
| output[read_index] = output[read_index2] + input[read_index]; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| template <typename T> | |||
| __global__ void CumSumKernel(const T *input, T *output, size_t dim0, size_t dim1, size_t dim2, size_t stride, | |||
| size_t stride2) { | |||
| size_t num = dim0 * dim2; | |||
| size_t i, k, offset; | |||
| for (size_t write_index = blockIdx.x * blockDim.x + threadIdx.x; write_index < num; | |||
| write_index += blockDim.x * gridDim.x) { | |||
| size_t step = blockDim.x * gridDim.x; | |||
| for (size_t write_index = blockIdx.x * blockDim.x + threadIdx.x; write_index < num; write_index += step) { | |||
| i = write_index / dim2 % dim0; | |||
| k = write_index % dim2; | |||
| offset = i * stride + k; | |||
| @@ -39,12 +112,32 @@ __global__ void CumSumKernel(T *input, T *output, size_t dim0, size_t dim1, size | |||
| } | |||
| } | |||
| template <typename T> | |||
| void CumSum(T *input, T *output, size_t dim0, size_t dim1, size_t dim2, size_t stride, size_t stride2, | |||
| cudaStream_t stream) { | |||
| void CumSum(const T *input, T *output, T *workspace, size_t dim0, size_t dim1, size_t dim2, size_t stride, | |||
| size_t stride2, bool exclusive_, bool reverse_, cudaStream_t stream) { | |||
| int size = dim0 * dim2; | |||
| CumSumKernel<<<GET_BLOCKS(size), GET_THREADS, 0, stream>>>(input, output, dim0, dim1, dim2, stride, stride2); | |||
| if (exclusive_) { | |||
| if (reverse_) { | |||
| RightMove<<<GET_BLOCKS(size), GET_THREADS, 0, stream>>>(input, output, dim0, dim1, dim2, stride, stride2); | |||
| Copy<<<GET_BLOCKS(size * dim1), GET_THREADS, 0, stream>>>(workspace, output, size * dim1); | |||
| CumSumKernelReverse<<<GET_BLOCKS(size), GET_THREADS, 0, stream>>>(workspace, output, dim0, dim1, dim2, stride, | |||
| stride2); | |||
| } else { | |||
| LeftMove<<<GET_BLOCKS(size), GET_THREADS, 0, stream>>>(input, output, dim0, dim1, dim2, stride, stride2); | |||
| Copy<<<GET_BLOCKS(size * dim1), GET_THREADS, 0, stream>>>(workspace, output, size * dim1); | |||
| CumSumKernel<<<GET_BLOCKS(size), GET_THREADS, 0, stream>>>(workspace, output, dim0, dim1, dim2, stride, stride2); | |||
| } | |||
| } else { | |||
| if (reverse_) { | |||
| CumSumKernelReverse<<<GET_BLOCKS(size), GET_THREADS, 0, stream>>>(input, output, dim0, dim1, dim2, stride, | |||
| stride2); | |||
| } else { | |||
| CumSumKernel<<<GET_BLOCKS(size), GET_THREADS, 0, stream>>>(input, output, dim0, dim1, dim2, stride, stride2); | |||
| } | |||
| } | |||
| return; | |||
| } | |||
| template void CumSum<float>(float *input, float *output, size_t dim0, size_t dim1, size_t dim2, size_t stride, | |||
| size_t stride2, cudaStream_t stream); | |||
| template void CumSum<float>(const float *input, float *output, float *workspace, size_t dim0, size_t dim1, size_t dim2, | |||
| size_t stride, size_t stride2, bool exclusive_, bool reverse_, cudaStream_t stream); | |||
| template void CumSum<half>(const half *input, half *output, half *workspace, size_t dim0, size_t dim1, size_t dim2, | |||
| size_t stride, size_t stride2, bool exclusive_, bool reverse_, cudaStream_t stream); | |||
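The dispatch above composes a shift (LeftMove or RightMove), a copy into the workspace, and the forward or reverse scan. Along the scanned axis this yields the usual exclusive/reverse cumulative sum; a one-dimensional Python sketch of the intended semantics:

```python
def cumsum_1d(xs, exclusive=False, reverse=False):
    """Reference semantics for the exclusive/reverse CumSum variants handled above."""
    if reverse:
        xs = xs[::-1]
    if exclusive:
        xs = [0] + list(xs[:-1])       # shift by one, matching LeftMove/RightMove
    out, running = [], 0
    for x in xs:
        running += x
        out.append(running)
    return out[::-1] if reverse else out

print(cumsum_1d([1, 2, 3, 4]))                                # [1, 3, 6, 10]
print(cumsum_1d([1, 2, 3, 4], exclusive=True))                # [0, 1, 3, 6]
print(cumsum_1d([1, 2, 3, 4], exclusive=True, reverse=True))  # [9, 7, 4, 0]
```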
| @@ -17,6 +17,6 @@ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUMSUM_IMPL_CUH_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUMSUM_IMPL_CUH_ | |||
| template <typename T> | |||
| void CumSum(T *input, T *output, size_t dim0, size_t dim1, size_t dim2, size_t stride, size_t stride2, | |||
| cudaStream_t stream); | |||
| void CumSum(const T *input, T *output, T *workspace, size_t dim0, size_t dim1, size_t dim2, size_t stride, | |||
| size_t stride2, bool exclusive_, bool reverse_, cudaStream_t stream); | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUMSUM_IMPL_CUH_ | |||
| @@ -16,27 +16,26 @@ | |||
| #include "backend/kernel_compiler/gpu/cuda_impl/iou_impl.cuh" | |||
| template <typename T> | |||
| __device__ T CoordinateMax(const T a, const T b) { | |||
| __device__ float CoordinateMax(const float a, const float b) { | |||
| return (a > b ? a : b); | |||
| } | |||
| template <typename T> | |||
| __device__ T CoordinateMin(const T a, const T b) { | |||
| __device__ float CoordinateMin(const float a, const float b) { | |||
| return (a < b ? a : b); | |||
| } | |||
| template <typename T> | |||
| __global__ void IOUKernel(const size_t size, const T *box1, const T *box2, T *iou_results, const size_t mode, | |||
| const size_t input_len_0) { | |||
| T location_coordinate[IOU_LOCATION_NUM][IOU_DIMENSION]; | |||
| T overlaps_coordinate[IOU_DIMENSION]; | |||
| const T epsilon = 1e-10; | |||
| float location_coordinate[IOU_LOCATION_NUM][IOU_DIMENSION]; | |||
| float overlaps_coordinate[IOU_DIMENSION]; | |||
| const float epsilon = 1e-10; | |||
| const float offset = 1.0; | |||
| for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x) { | |||
| for (size_t j = 0; j < IOU_DIMENSION; j++) { | |||
| location_coordinate[0][j] = box1[(i % input_len_0) * IOU_DIMENSION + j]; | |||
| location_coordinate[1][j] = box2[(i / input_len_0) * IOU_DIMENSION + j]; | |||
| location_coordinate[0][j] = static_cast<float>(box1[(i % input_len_0) * IOU_DIMENSION + j]); | |||
| location_coordinate[1][j] = static_cast<float>(box2[(i / input_len_0) * IOU_DIMENSION + j]); | |||
| } | |||
| overlaps_coordinate[0] = CoordinateMax(location_coordinate[0][0], location_coordinate[1][0]); | |||
| @@ -44,18 +43,18 @@ __global__ void IOUKernel(const size_t size, const T *box1, const T *box2, T *io | |||
| overlaps_coordinate[2] = CoordinateMin(location_coordinate[0][2], location_coordinate[1][2]); | |||
| overlaps_coordinate[3] = CoordinateMin(location_coordinate[0][3], location_coordinate[1][3]); | |||
| T overlaps_w = CoordinateMax(0.f, overlaps_coordinate[2] - overlaps_coordinate[0] + 1); | |||
| T overlaps_h = CoordinateMax(0.f, overlaps_coordinate[3] - overlaps_coordinate[1] + 1); | |||
| T overlaps = overlaps_w * overlaps_h; | |||
| float overlaps_w = CoordinateMax(0.0, overlaps_coordinate[2] - overlaps_coordinate[0] + offset); | |||
| float overlaps_h = CoordinateMax(0.0, overlaps_coordinate[3] - overlaps_coordinate[1] + offset); | |||
| float overlaps = overlaps_w * overlaps_h; | |||
| T area1 = (location_coordinate[0][2] - location_coordinate[0][0] + 1) * (location_coordinate[0][3] - | |||
| location_coordinate[0][1] + 1); | |||
| T area2 = (location_coordinate[1][2] - location_coordinate[1][0] + 1) * (location_coordinate[1][3] - | |||
| location_coordinate[1][1] + 1); | |||
| float area1 = (location_coordinate[0][2] - location_coordinate[0][0] + offset) * (location_coordinate[0][3] - | |||
| location_coordinate[0][1] + offset); | |||
| float area2 = (location_coordinate[1][2] - location_coordinate[1][0] + offset) * (location_coordinate[1][3] - | |||
| location_coordinate[1][1] + offset); | |||
| if (mode == 0) { | |||
| iou_results[i] = overlaps / (area1 + area2 - overlaps + epsilon); | |||
| iou_results[i] = static_cast<T>(overlaps / (area1 + area2 - overlaps + epsilon)); | |||
| } else { | |||
| iou_results[i] = overlaps / (area2 + epsilon); | |||
| iou_results[i] = static_cast<T>(overlaps / (area2 + epsilon)); | |||
| } | |||
| } | |||
| @@ -70,3 +69,5 @@ void IOU(const size_t &size, const T *box1, const T *box2, T *iou_results, const | |||
| template void IOU(const size_t &size, const float *box1, const float *box2, float *iou_results, const size_t &mode, | |||
| const size_t &input_len_0, cudaStream_t cuda_stream); | |||
| template void IOU(const size_t &size, const half *box1, const half *box2, half *iou_results, const size_t &mode, | |||
| const size_t &input_len_0, cudaStream_t cuda_stream); | |||
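The IOU kernel now accumulates in float even for half inputs and folds the inclusive-pixel "+1" into a named offset. An equivalent single-pair Python reference (mode 0 is IOU; any other mode normalizes by the second box's area, as in the kernel):

```python
def iou(box1, box2, mode=0, eps=1e-10, offset=1.0):
    """IOU for boxes given as (x1, y1, x2, y2); mirrors IOUKernel above."""
    ix1, iy1 = max(box1[0], box2[0]), max(box1[1], box2[1])
    ix2, iy2 = min(box1[2], box2[2]), min(box1[3], box2[3])
    overlap = max(0.0, ix2 - ix1 + offset) * max(0.0, iy2 - iy1 + offset)
    area1 = (box1[2] - box1[0] + offset) * (box1[3] - box1[1] + offset)
    area2 = (box2[2] - box2[0] + offset) * (box2[3] - box2[1] + offset)
    return overlap / (area1 + area2 - overlap + eps) if mode == 0 else overlap / (area2 + eps)

print(round(iou((0, 0, 3, 3), (2, 2, 5, 5)), 4))  # 0.1429
```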
| @@ -84,6 +84,40 @@ class GpuKernel : public KernelMod { | |||
| } | |||
| } | |||
| // set the tensor descriptor for cudnn/cublas | |||
| void CudnnSetTensorNdDescriptor(const std::vector<size_t> &shape, cudnnTensorDescriptor_t descriptor, | |||
| cudnnDataType_t data_type) { | |||
| if (shape.size() < 3) { | |||
| MS_EXCEPTION(ValueError) << "cudnnSetTensorNdDescriptor doesn't support " << shape.size() << "D."; | |||
| } | |||
| const int nbDims = shape.size(); | |||
| int *dim = new (std::nothrow) int[nbDims]; | |||
| if (dim == nullptr) { | |||
| MS_LOG(EXCEPTION) << "malloc dim failed."; | |||
| } | |||
| int *stride = new (std::nothrow) int[nbDims]; | |||
| if (stride == nullptr) { | |||
| MS_LOG(EXCEPTION) << "malloc stride failed."; | |||
| } | |||
| for (int i = 0; i < nbDims; i++) { | |||
| dim[i] = SizeToInt(shape[i]); | |||
| stride[i] = 1; | |||
| } | |||
| for (int i = nbDims - 2; i >= 0; i--) { | |||
| stride[i] = stride[i + 1] * SizeToInt(shape[i + 1]); | |||
| } | |||
| CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensorNdDescriptor(descriptor, data_type, nbDims, dim, stride), | |||
| "cudnnSetTensorNdDescriptor failed"); | |||
| delete[] dim; | |||
| dim = nullptr; | |||
| delete[] stride; | |||
| stride = nullptr; | |||
| } | |||
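The helper fills `dim` with the shape and `stride` with packed row-major strides before handing both to cudnnSetTensorNdDescriptor. The stride of each dimension is just the product of the trailing dimensions; a small Python illustration:

```python
def packed_strides(shape):
    """Row-major strides as computed in CudnnSetTensorNdDescriptor above."""
    stride = [1] * len(shape)
    for i in range(len(shape) - 2, -1, -1):
        stride[i] = stride[i + 1] * shape[i + 1]
    return stride

print(packed_strides([2, 3, 4, 5]))  # [60, 20, 5, 1]
```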
| // choose the suitable datatype for cudnn/cublas | |||
| inline cudnnDataType_t GetCudnnDataType(const std::string &Type) { | |||
| auto type = kCudnnDtypeMap.find(Type); | |||
| @@ -59,6 +59,9 @@ MS_REG_GPU_KERNEL_TWO( | |||
| AbsGrad, | |||
| KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32), | |||
| BroadcastOpGpuKernel, float, float) | |||
| MS_REG_GPU_KERNEL_TWO( | |||
| Div, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32), | |||
| BroadcastOpGpuKernel, float, float) | |||
| // fp16 | |||
| MS_REG_GPU_KERNEL_TWO( | |||
| @@ -101,6 +104,9 @@ MS_REG_GPU_KERNEL_TWO( | |||
| AbsGrad, | |||
| KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16), | |||
| BroadcastOpGpuKernel, half, half) | |||
| MS_REG_GPU_KERNEL_TWO( | |||
| Div, KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16), | |||
| BroadcastOpGpuKernel, half, half) | |||
| // int32 | |||
| MS_REG_GPU_KERNEL_TWO( | |||
| @@ -118,14 +124,14 @@ MS_REG_GPU_KERNEL_TWO( | |||
| MS_REG_GPU_KERNEL_TWO( | |||
| Mul, KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32), | |||
| BroadcastOpGpuKernel, int, int) | |||
| MS_REG_GPU_KERNEL_TWO( | |||
| RealDiv, KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32), | |||
| BroadcastOpGpuKernel, int, int) | |||
| MS_REG_GPU_KERNEL_TWO( | |||
| FloorDiv, KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32), | |||
| BroadcastOpGpuKernel, int, int) | |||
| MS_REG_GPU_KERNEL_TWO( | |||
| AbsGrad, KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32), | |||
| BroadcastOpGpuKernel, int, int) | |||
| MS_REG_GPU_KERNEL_TWO( | |||
| Div, KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32), | |||
| BroadcastOpGpuKernel, int, int) | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
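With these registrations, Div reuses the broadcast kernel for float32, float16 and int32 on GPU. A hedged usage sketch; it assumes the standard mindspore.ops.operations.Div primitive and a build with the GPU backend:

```python
# Assumption: standard MindSpore GPU build; Div now broadcasts like RealDiv does.
import numpy as np
from mindspore import Tensor, context
from mindspore.ops import operations as P

context.set_context(device_target="GPU")
div = P.Div()
x = Tensor(np.arange(6, dtype=np.float32).reshape(2, 3))
y = Tensor(np.array([[1.0], [2.0]], dtype=np.float32))  # broadcast along the last axis
print(div(x, y).shape)  # (2, 3)
```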
| @@ -27,6 +27,7 @@ | |||
| #include "backend/kernel_compiler/gpu/kernel_constants.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| constexpr int MAX_DIMS = 7; | |||
| template <typename T, typename S> | |||
| class BroadcastOpGpuKernel : public GpuKernel { | |||
| public: | |||
| @@ -45,9 +46,8 @@ class BroadcastOpGpuKernel : public GpuKernel { | |||
| S *output = GetDeviceAddress<S>(outputs, 0); | |||
| if (need_broadcast_) { | |||
| Broadcast(lhs_shape_[0], lhs_shape_[1], lhs_shape_[2], lhs_shape_[3], rhs_shape_[0], rhs_shape_[1], rhs_shape_[2], | |||
| rhs_shape_[3], output_shape_[0], output_shape_[1], output_shape_[2], output_shape_[3], op_type_, lhs, | |||
| rhs, output, reinterpret_cast<cudaStream_t>(stream_ptr)); | |||
| Broadcast(lhs_shape_, rhs_shape_, output_shape_, op_type_, lhs, rhs, output, | |||
| reinterpret_cast<cudaStream_t>(stream_ptr)); | |||
| } else { | |||
| NoBroadcast(output_num_, op_type_, lhs, rhs, output, reinterpret_cast<cudaStream_t>(stream_ptr)); | |||
| } | |||
| @@ -60,10 +60,13 @@ class BroadcastOpGpuKernel : public GpuKernel { | |||
| auto shape2 = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1); | |||
| auto shape3 = AnfAlgo::GetOutputInferShape(kernel_node, 0); | |||
| need_broadcast_ = IsBroadcast(shape1, shape2); | |||
| if (need_broadcast_ && shape1.size() > 4) { | |||
| MS_LOG(EXCEPTION) << "Broadcast operation not support dim greater than 4"; | |||
| if (need_broadcast_ && shape1.size() > 7) { | |||
| MS_LOG(EXCEPTION) << "Broadcast operation not support dim greater than 7"; | |||
| } | |||
| lhs_shape_.resize(MAX_DIMS, 1); | |||
| rhs_shape_.resize(MAX_DIMS, 1); | |||
| output_shape_.resize(MAX_DIMS, 1); | |||
| for (size_t i = 0; i < shape3.size(); i++) { | |||
| output_shape_[i] = shape3[i]; | |||
| output_num_ *= shape3[i]; | |||
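
Note on the resize above: the kernel now carries its shapes as vectors padded out to MAX_DIMS (7), with the real dimensions written from the front and the remaining positions left at 1. A minimal standalone sketch of that padding, using a hypothetical PadToRank helper rather than the kernel's members:

// Sketch only: pad a shape to a fixed rank by leaving trailing positions at 1,
// mirroring how lhs_shape_/rhs_shape_/output_shape_ are resized to MAX_DIMS above.
#include <iostream>
#include <vector>

std::vector<int> PadToRank(const std::vector<int> &shape, std::size_t rank) {
  std::vector<int> padded(rank, 1);
  for (std::size_t i = 0; i < shape.size() && i < rank; ++i) {
    padded[i] = shape[i];
  }
  return padded;
}

int main() {
  for (int d : PadToRank({32, 16, 8}, 7)) std::cout << d << ' ';  // 32 16 8 1 1 1 1
  std::cout << '\n';
  return 0;
}
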
| @@ -99,7 +102,7 @@ class BroadcastOpGpuKernel : public GpuKernel { | |||
| {"Greater", BROADCAST_TYPE_GREATER}, {"Less", BROADCAST_TYPE_LESS}, {"Maximum", BROADCAST_TYPE_MAXIMUM}, | |||
| {"Minimum", BROADCAST_TYPE_MINIMUM}, {"Pow", BROADCAST_TYPE_POWER}, {"RealDiv", BROADCAST_TYPE_REALDIV}, | |||
| {"Mul", BROADCAST_TYPE_MUL}, {"Sub", BROADCAST_TYPE_SUB}, {"TensorAdd", BROADCAST_TYPE_ADD}, | |||
| {"FloorDiv", BROADCAST_TYPE_FLOORDIV}, {"AbsGrad", BROADCAST_TYPE_ABSGRAD}, | |||
| {"FloorDiv", BROADCAST_TYPE_FLOORDIV}, {"AbsGrad", BROADCAST_TYPE_ABSGRAD}, {"Div", BROADCAST_TYPE_DIV}, | |||
| }; | |||
| auto iter = kBroadcastTypeMap.find(kernel_name); | |||
| @@ -127,9 +130,9 @@ class BroadcastOpGpuKernel : public GpuKernel { | |||
| int input1_num_; | |||
| int input2_num_; | |||
| int output_num_; | |||
| int lhs_shape_[4] = {1, 1, 1, 1}; | |||
| int rhs_shape_[4] = {1, 1, 1, 1}; | |||
| int output_shape_[4] = {1, 1, 1, 1}; | |||
| std::vector<int> lhs_shape_; | |||
| std::vector<int> rhs_shape_; | |||
| std::vector<int> output_shape_; | |||
| std::vector<size_t> input_size_list_; | |||
| std::vector<size_t> output_size_list_; | |||
| @@ -20,5 +20,7 @@ namespace mindspore { | |||
| namespace kernel { | |||
| MS_REG_GPU_KERNEL_ONE(CumSum, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32), | |||
| CumSumGpuKernel, float) | |||
| MS_REG_GPU_KERNEL_ONE(CumSum, KernelAttr().AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16), | |||
| CumSumGpuKernel, half) | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -27,7 +27,7 @@ namespace kernel { | |||
| template <typename T> | |||
| class CumSumGpuKernel : public GpuKernel { | |||
| public: | |||
| CumSumGpuKernel() : axis_(0), input_size_0_(0), stride_(0), stride2_(0) {} | |||
| CumSumGpuKernel() : exclusive_(false), reverse_(false), axis_(0), input_size_0_(0), stride_(0), stride2_(0) {} | |||
| ~CumSumGpuKernel() = default; | |||
| const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; } | |||
| @@ -38,7 +38,8 @@ class CumSumGpuKernel : public GpuKernel { | |||
| const std::vector<AddressPtr> &outputs, void *stream_ptr) override { | |||
| T *input_addr = GetDeviceAddress<T>(inputs, 0); | |||
| T *output_addr = GetDeviceAddress<T>(outputs, 0); | |||
| CumSum(input_addr, output_addr, dims_[0], dims_[1], dims_[2], stride_, stride2_, | |||
| T *ws_addr = GetDeviceAddress<T>(workspace, 0); | |||
| CumSum(input_addr, output_addr, ws_addr, dims_[0], dims_[1], dims_[2], stride_, stride2_, exclusive_, reverse_, | |||
| reinterpret_cast<cudaStream_t>(stream_ptr)); | |||
| return true; | |||
| } | |||
| @@ -51,6 +52,8 @@ class CumSumGpuKernel : public GpuKernel { | |||
| input_size_0_ = sizeof(T); | |||
| shape_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| axis_ = GetAttr<int>(kernel_node, "axis"); | |||
| exclusive_ = GetAttr<bool>(kernel_node, "exclusive"); | |||
| reverse_ = GetAttr<bool>(kernel_node, "reverse"); | |||
| int input_dim_length = SizeToInt(shape_.size()); | |||
| if (axis_ >= input_dim_length) { | |||
| MS_LOG(EXCEPTION) << "Axis out of bounds."; | |||
| @@ -70,6 +73,7 @@ class CumSumGpuKernel : public GpuKernel { | |||
| void InitSizeLists() override { | |||
| input_size_list_.push_back(input_size_0_); | |||
| output_size_list_.push_back(input_size_0_); | |||
| workspace_size_list_.push_back(input_size_0_); | |||
| } | |||
| private: | |||
| @@ -87,6 +91,8 @@ class CumSumGpuKernel : public GpuKernel { | |||
| stride2_ = dims_[2]; | |||
| return; | |||
| } | |||
| bool exclusive_; | |||
| bool reverse_; | |||
| int axis_; | |||
| size_t input_size_0_; | |||
| size_t stride_; | |||
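
The two new attributes read in Init give CumSum the usual exclusive/reverse variants. A standalone 1-D sketch of the assumed semantics (illustrative only, not the CUDA kernel): exclusive means output[i] excludes input[i], reverse means accumulation runs from the back of the axis.

#include <iostream>
#include <vector>

std::vector<float> CumSum1D(const std::vector<float> &in, bool exclusive, bool reverse) {
  const int n = static_cast<int>(in.size());
  std::vector<float> out(n, 0.0f);
  float acc = 0.0f;
  for (int k = 0; k < n; ++k) {
    int i = reverse ? n - 1 - k : k;  // walk backwards when reverse is set
    if (exclusive) {
      out[i] = acc;                   // sum of previously visited elements only
      acc += in[i];
    } else {
      acc += in[i];
      out[i] = acc;
    }
  }
  return out;
}

int main() {
  for (float v : CumSum1D({1, 2, 3, 4}, /*exclusive=*/true, /*reverse=*/false)) {
    std::cout << v << ' ';  // 0 1 3 6
  }
  std::cout << '\n';
  return 0;
}
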
| @@ -83,12 +83,19 @@ class ActivationGpuFwdKernel : public GpuKernel { | |||
| return true; | |||
| } | |||
| std::vector<int> shape; | |||
| ShapeNdTo4d(input_shape, &shape); | |||
| CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetActivationDescriptor(activation_desc_, mode_, CUDNN_NOT_PROPAGATE_NAN, 0.0), | |||
| "cudnnSetActivationDescriptor failed"); | |||
| CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensor4dDescriptor(data_descriptor_, CUDNN_TENSOR_NCHW, cudnn_data_type_, | |||
| shape[0], shape[1], shape[2], shape[3]), | |||
| "cudnnSetTensor4dDescriptor failed"); | |||
| const int split_dim = 4; | |||
| if (input_shape.size() <= split_dim) { | |||
| ShapeNdTo4d(input_shape, &shape); | |||
| CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensor4dDescriptor(data_descriptor_, CUDNN_TENSOR_NCHW, cudnn_data_type_, | |||
| shape[0], shape[1], shape[2], shape[3]), | |||
| "cudnnSetTensor4dDescriptor failed"); | |||
| } else { | |||
| CudnnSetTensorNdDescriptor(input_shape, data_descriptor_, cudnn_data_type_); | |||
| } | |||
| InitSizeLists(); | |||
| return true; | |||
| } | |||
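
With the split_dim check above, ranks of at most 4 keep the old 4-D NCHW descriptor path, while higher ranks fall through to CudnnSetTensorNdDescriptor. For reference, a small sketch of the 4-D expansion as assumed here (ShapeNdTo4d is taken to left-pad with 1s; this is a hypothetical stand-in helper, not the project's implementation):

#include <iostream>
#include <vector>

// Left-pad a shape of rank <= 4 with 1s so it can be described as plain NCHW.
std::vector<int> ExpandTo4D(const std::vector<int> &shape) {
  std::vector<int> out(4, 1);
  const std::size_t offset = 4 - shape.size();  // assumes shape.size() <= 4
  for (std::size_t i = 0; i < shape.size(); ++i) {
    out[offset + i] = shape[i];
  }
  return out;
}

int main() {
  for (int d : ExpandTo4D({64, 128})) std::cout << d << ' ';  // 1 1 64 128
  std::cout << '\n';
  return 0;
}

The gradient kernel in the next hunk applies the same rank-4 split before choosing between the 4-D and Nd descriptors.
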
| @@ -90,12 +90,18 @@ class ActivationGradGpuKernel : public GpuKernel { | |||
| return true; | |||
| } | |||
| std::vector<int> shape; | |||
| ShapeNdTo4d(input_shape, &shape); | |||
| CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetActivationDescriptor(activation_desc_, mode_, CUDNN_PROPAGATE_NAN, 0.0), | |||
| "SetActivationDescriptor failed"); | |||
| CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensor4dDescriptor(data_descriptor_, CUDNN_TENSOR_NCHW, cudnn_data_type_, | |||
| shape[0], shape[1], shape[2], shape[3]), | |||
| "SetTensor4dDescriptor failed"); | |||
| const int split_dim = 4; | |||
| if (input_shape.size() <= split_dim) { | |||
| ShapeNdTo4d(input_shape, &shape); | |||
| CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensor4dDescriptor(data_descriptor_, CUDNN_TENSOR_NCHW, cudnn_data_type_, | |||
| shape[0], shape[1], shape[2], shape[3]), | |||
| "SetTensor4dDescriptor failed"); | |||
| } else { | |||
| CudnnSetTensorNdDescriptor(input_shape, data_descriptor_, cudnn_data_type_); | |||
| } | |||
| InitSizeLists(); | |||
| return true; | |||
| @@ -54,12 +54,18 @@ class DropoutGpuFwdKernel : public GpuKernel { | |||
| float *mask_f = GetDeviceAddress<float>(workspace, 0); | |||
| if (!states_init_) { | |||
| curandCreateGenerator(&mask_generator_, CURAND_RNG_PSEUDO_DEFAULT); | |||
| curandSetPseudoRandomGeneratorSeed(mask_generator_, time(NULL)); | |||
| CHECK_CURAND_RET_WITH_EXCEPT(curandCreateGenerator(&mask_generator_, CURAND_RNG_PSEUDO_DEFAULT), | |||
| "Failed to create generator"); | |||
| CHECK_CURAND_RET_WITH_EXCEPT(curandSetPseudoRandomGeneratorSeed(mask_generator_, time(NULL)), | |||
| "Failed to SetPseudoRandomGeneratorSeed"); | |||
| MS_EXCEPTION_IF_NULL(mask_generator_); | |||
| states_init_ = true; | |||
| } | |||
| CHECK_CURAND_RET_WITH_EXCEPT(curandSetStream(mask_generator_, reinterpret_cast<cudaStream_t>(stream_ptr)), | |||
| "Failed to set stream for generator"); | |||
| // curandGen only support float or double for mask. | |||
| curandGenerateUniform(mask_generator_, mask_f, num_count_); | |||
| CHECK_CURAND_RET_WITH_EXCEPT(curandGenerateUniform(mask_generator_, mask_f, num_count_), | |||
| "Failed to generate uniform"); | |||
| DropoutForward(input, mask, output, mask_f, num_count_, keep_prob_, reinterpret_cast<cudaStream_t>(stream_ptr)); | |||
| return true; | |||
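
Each cuRAND call in the forward pass is now routed through CHECK_CURAND_RET_WITH_EXCEPT instead of having its return status dropped. A minimal sketch of that style of status-checking macro (the macro body below is a hypothetical illustration, not the project's definition):

#include <curand.h>
#include <iostream>
#include <stdexcept>
#include <string>

#define CHECK_CURAND_OK(expr, msg)                                        \
  do {                                                                    \
    curandStatus_t _st = (expr);                                          \
    if (_st != CURAND_STATUS_SUCCESS) {                                   \
      throw std::runtime_error(std::string(msg) + ", curand status: " +   \
                               std::to_string(static_cast<int>(_st)));    \
    }                                                                     \
  } while (0)

int main() {
  curandGenerator_t gen = nullptr;
  try {
    CHECK_CURAND_OK(curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT),
                    "Failed to create generator");
    CHECK_CURAND_OK(curandSetPseudoRandomGeneratorSeed(gen, 1234ULL),
                    "Failed to set seed");
  } catch (const std::exception &e) {
    std::cerr << e.what() << '\n';
  }
  if (gen != nullptr) curandDestroyGenerator(gen);
  return 0;
}
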
| @@ -22,5 +22,9 @@ MS_REG_GPU_KERNEL_TWO( | |||
| CheckValid, | |||
| KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeBool), | |||
| CheckValidGpuKernel, float, bool) | |||
| MS_REG_GPU_KERNEL_TWO( | |||
| CheckValid, | |||
| KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeBool), | |||
| CheckValidGpuKernel, half, bool) | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -21,5 +21,8 @@ namespace kernel { | |||
| MS_REG_GPU_KERNEL_ONE( | |||
| IOU, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32), | |||
| IOUGpuKernel, float) | |||
| MS_REG_GPU_KERNEL_ONE( | |||
| IOU, KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16), | |||
| IOUGpuKernel, half) | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
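
The float16 registration above reuses the same IOU kernel. For reference, a scalar sketch of the intersection-over-union each box pair reduces to (boxes taken as [x1, y1, x2, y2]; purely illustrative, not the CUDA kernel):

#include <algorithm>
#include <iostream>

float IoU(const float a[4], const float b[4]) {
  float ix = std::max(0.0f, std::min(a[2], b[2]) - std::max(a[0], b[0]));
  float iy = std::max(0.0f, std::min(a[3], b[3]) - std::max(a[1], b[1]));
  float inter = ix * iy;
  float area_a = (a[2] - a[0]) * (a[3] - a[1]);
  float area_b = (b[2] - b[0]) * (b[3] - b[1]);
  float uni = area_a + area_b - inter;
  return uni > 0.0f ? inter / uni : 0.0f;
}

int main() {
  float a[4] = {0, 0, 2, 2}, b[4] = {1, 1, 3, 3};
  std::cout << IoU(a, b) << '\n';  // 1 / 7, about 0.1429
  return 0;
}
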
| @@ -80,6 +80,7 @@ | |||
| #include "backend/optimizer/ascend/buffer_fusion/conv_single_in_fusion_pass.h" | |||
| #include "backend/optimizer/ascend/buffer_fusion/conv_double_in_fusion_pass.h" | |||
| #include "backend/optimizer/ascend/buffer_fusion/matmul_eltwise_fusion_pass.h" | |||
| #include "backend/optimizer/ascend/buffer_fusion/matmul_confusiontranspose_fusion_pass.h" | |||
| #include "backend/optimizer/ascend/buffer_fusion/depthwiseconv_eltwise_fusion_pass.h" | |||
| #include "backend/optimizer/ascend/buffer_fusion/bnupdate_eltwise_fusion_pass.h" | |||
| #include "backend/optimizer/ascend/buffer_fusion/bnupdate_eltwise_eltwise_fusion_pass.h" | |||
| @@ -124,6 +125,10 @@ void AddAscendIRFusionRulesPass(PassManager *ir_fusion_pm) { | |||
| ir_fusion_pm->AddPass(std::make_shared<LambNextMVRuleCond4>()); | |||
| ir_fusion_pm->AddPass(std::make_shared<LambNextRightRule>()); | |||
| ir_fusion_pm->AddPass(std::make_shared<LambUpdateWithLrV2>()); | |||
| ir_fusion_pm->AddPass(std::make_shared<AdamApplyOneAssignCond1Fusion>()); | |||
| ir_fusion_pm->AddPass(std::make_shared<AdamApplyOneAssignCond2Fusion>()); | |||
| ir_fusion_pm->AddPass(std::make_shared<AdamApplyOneAssignCond3Fusion>()); | |||
| ir_fusion_pm->AddPass(std::make_shared<AdamApplyOneAssignCond4Fusion>()); | |||
| ir_fusion_pm->AddPass(std::make_shared<AdamApplyOneCond1Fusion>()); | |||
| ir_fusion_pm->AddPass(std::make_shared<AdamApplyOneCond2Fusion>()); | |||
| ir_fusion_pm->AddPass(std::make_shared<AdamApplyOneCond3Fusion>()); | |||
| @@ -308,6 +313,7 @@ void RunOpAscendBackendIRFusionOptimization(const std::shared_ptr<session::Kerne | |||
| } | |||
| auto optimizer = std::make_shared<GraphOptimizer>(); | |||
| auto ir_fusion_pm = std::make_shared<PassManager>("ir_fusion_pm"); | |||
| ir_fusion_pm->AddPass(std::make_shared<SplitFission>()); | |||
| ir_fusion_pm->AddPass(std::make_shared<BnSplit>()); | |||
| ir_fusion_pm->AddPass(std::make_shared<LayerNormGradSplit>()); | |||
| ir_fusion_pm->AddPass(std::make_shared<TopKSplit>()); | |||
| @@ -27,15 +27,15 @@ | |||
| namespace mindspore { | |||
| namespace opt { | |||
| void BnupdateEltwiseFusionPass::MatchBnupdateRelu(const CNodePtr &cnode, const AnfNodePtr &relu_input, | |||
| const session::KernelGraph &kernel_graph, | |||
| FusedNodeRecord *candidate_fusion) { | |||
| void BnupdateEltwiseFusionPass::MatchBnupdateDoubleOutputEltwise(const CNodePtr &cnode, const AnfNodePtr &eltwise_input, | |||
| const session::KernelGraph &kernel_graph, | |||
| FusedNodeRecord *candidate_fusion) { | |||
| MS_EXCEPTION_IF_NULL(cnode); | |||
| MS_EXCEPTION_IF_NULL(candidate_fusion); | |||
| auto manager = kernel_graph.manager(); | |||
| MS_EXCEPTION_IF_NULL(manager); | |||
| MS_EXCEPTION_IF_NULL(relu_input); | |||
| auto getitem = relu_input->cast<CNodePtr>(); | |||
| MS_EXCEPTION_IF_NULL(eltwise_input); | |||
| auto getitem = eltwise_input->cast<CNodePtr>(); | |||
| MS_EXCEPTION_IF_NULL(getitem); | |||
| auto bnupdate = getitem->input(1); | |||
| MS_EXCEPTION_IF_NULL(bnupdate); | |||
| @@ -68,10 +68,11 @@ void BnupdateEltwiseFusionPass::MatchSingleFusionPattern(const session::KernelGr | |||
| auto cnode = node->cast<CNodePtr>(); | |||
| MS_EXCEPTION_IF_NULL(cnode); | |||
| if (AnfAlgo::GetKernelType(cnode) == KernelType::TBE_KERNEL && | |||
| AnfAlgo::GetFusionType(cnode) == kernel::FusionType::ELEMWISE) { | |||
| AnfAlgo::GetFusionType(cnode) == kernel::FusionType::ELEMWISE && | |||
| AnfAlgo::GetOutputTensorNum(cnode) == ELTWISE_DOUBLE_OUTPUT_SIZE) { | |||
| auto eltwise_input = cnode->input(1); | |||
| if (eltwise_input->isa<CNode>() && AnfAlgo::CheckPrimitiveType(eltwise_input, prim::kPrimTupleGetItem)) { | |||
| MatchBnupdateRelu(cnode, eltwise_input, kernel_graph, candidate_fusion); | |||
| MatchBnupdateDoubleOutputEltwise(cnode, eltwise_input, kernel_graph, candidate_fusion); | |||
| } | |||
| } | |||
| } | |||
| @@ -39,8 +39,8 @@ class BnupdateEltwiseFusionPass : public FusionBasePass { | |||
| void MatchSingleFusionPattern(const session::KernelGraph &kernel_graph, FusedNodeRecord *candidate_fusion) override; | |||
| private: | |||
| void MatchBnupdateRelu(const CNodePtr &cnode, const AnfNodePtr &relu_input, const session::KernelGraph &kernel_graph, | |||
| FusedNodeRecord *candidate_fusion); | |||
| void MatchBnupdateDoubleOutputEltwise(const CNodePtr &cnode, const AnfNodePtr &eltwise_input, | |||
| const session::KernelGraph &kernel_graph, FusedNodeRecord *candidate_fusion); | |||
| }; | |||
| } // namespace opt | |||
| } // namespace mindspore | |||
| @@ -33,6 +33,7 @@ const int8_t MAX_ELTWISE_NUM = 3; | |||
| const int8_t MIN_ELTWISE_SIZE = 2; | |||
| const int8_t ELTWISE_INPUT_SIZE = 2; | |||
| const int8_t ELTWISE_DOUBLE_IN_INPUT_SIZE = 3; | |||
| const int8_t ELTWISE_DOUBLE_OUTPUT_SIZE = 2; | |||
| const int8_t CONV_DOUBLE_IN_INPUT_SIZE = 3; | |||
| const int8_t CONV_QUART_IN_INPUT_SIZE = 5; | |||
| const int8_t ELTWISE_USE = 1; | |||
| @@ -0,0 +1,66 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/optimizer/ascend/buffer_fusion/matmul_confusiontranspose_fusion_pass.h" | |||
| #include <vector> | |||
| #include <unordered_set> | |||
| #include <memory> | |||
| #include <string> | |||
| #include "backend/kernel_compiler/kernel_fusion.h" | |||
| #include "debug/anf_ir_dump.h" | |||
| #include "backend/session/anf_runtime_algorithm.h" | |||
| #include "frontend/operator/ops.h" | |||
| #include "utils/ms_context.h" | |||
| #include "backend/optimizer/common/fusion_id_allocator.h" | |||
| namespace mindspore { | |||
| namespace opt { | |||
| void MatmulConfusionTranposeFusionPass::MatchMatmulConfusionTranpose(const CNodePtr &cnode, | |||
| const session::KernelGraph &kernel_graph, | |||
| FusedNodeRecord *candidate_fusion) { | |||
| MS_EXCEPTION_IF_NULL(cnode); | |||
| MS_EXCEPTION_IF_NULL(candidate_fusion); | |||
| auto manager = kernel_graph.manager(); | |||
| MS_EXCEPTION_IF_NULL(manager); | |||
| auto matmul = cnode->input(1); | |||
| MS_EXCEPTION_IF_NULL(matmul); | |||
| if (matmul->isa<CNode>() && AnfAlgo::CheckPrimitiveType(matmul, prim::kPrimMatMul)) { | |||
| std::vector<int> output_used_num{SizeToInt(manager->node_users()[matmul].size())}; | |||
| AnfAlgo::SetNodeAttr(kAttrOutputUsedNum, MakeValue(output_used_num), matmul); | |||
| std::unordered_set<AnfNodePtr> record{cnode, matmul}; | |||
| candidate_fusion->push_back(record); | |||
| SetRecordFusionId(record); | |||
| } | |||
| } | |||
| void MatmulConfusionTranposeFusionPass::MatchSingleFusionPattern(const session::KernelGraph &kernel_graph, | |||
| FusedNodeRecord *candidate_fusion) { | |||
| MS_EXCEPTION_IF_NULL(candidate_fusion); | |||
| std::vector<AnfNodePtr> node_list = TopoSort(kernel_graph.get_return()); | |||
| for (auto &node : node_list) { | |||
| if (!AnfAlgo::IsRealCNodeKernel(node) || fusion_id_allocator->HasFusionIdAttr(node) || | |||
| AnfAlgo::CheckPrimitiveType(node, prim::kPrimReturn)) { | |||
| continue; | |||
| } | |||
| auto cnode = node->cast<CNodePtr>(); | |||
| MS_EXCEPTION_IF_NULL(cnode); | |||
| if (AnfAlgo::GetCNodeName(cnode) == kConfusionTransposeDOpName) { | |||
| MatchMatmulConfusionTranpose(cnode, kernel_graph, candidate_fusion); | |||
| } | |||
| } | |||
| } | |||
| } // namespace opt | |||
| } // namespace mindspore | |||
| @@ -0,0 +1,48 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_BUFFER_FUSION_PASS_MATMUL_CONFUSIONTRANSPOSE_FUSION_PASS_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_BUFFER_FUSION_PASS_MATMUL_CONFUSIONTRANSPOSE_FUSION_PASS_H_ | |||
| #include <unordered_set> | |||
| #include <vector> | |||
| #include "backend/optimizer/ascend/buffer_fusion/fusion_base_pass.h" | |||
| #include "ir/anf.h" | |||
| #include "backend/optimizer/common/pass.h" | |||
| #include "backend/optimizer/common/fusion_id_allocator.h" | |||
| #include "runtime/device/kernel_info.h" | |||
| #include "backend/kernel_compiler/kernel.h" | |||
| #include "backend/session/kernel_graph.h" | |||
| namespace mindspore { | |||
| namespace opt { | |||
| using FusedNodeRecord = std::vector<std::unordered_set<AnfNodePtr>>; | |||
| class MatmulConfusionTranposeFusionPass : public FusionBasePass { | |||
| public: | |||
| explicit MatmulConfusionTranposeFusionPass(FusionIdAllocatorPtr idAllocator) | |||
| : FusionBasePass("MatmulConfusionTranposeFusionPass", idAllocator) {} | |||
| ~MatmulConfusionTranposeFusionPass() override = default; | |||
| void MatchSingleFusionPattern(const session::KernelGraph &kernel_graph, FusedNodeRecord *candidate_fusion) override; | |||
| private: | |||
| void MatchMatmulConfusionTranpose(const CNodePtr &cnode, const session::KernelGraph &kernel_graph, | |||
| FusedNodeRecord *candidate_fusion); | |||
| }; | |||
| } // namespace opt | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_BUFFER_FUSION_PASS_MATMUL_CONFUSIONTRANSPOSE_FUSION_PASS_H_ | |||
| @@ -172,7 +172,6 @@ AnfNodePtr MergeCastToNextOp(const FuncGraphPtr &graph, const CNodePtr &node, co | |||
| << "ori kernel info" << ori_kernel_info->ToString() << "alternative kernel info" | |||
| << (*alternative_kernel_info)->ToString(); | |||
| AnfAlgo::SetSelectKernelBuildInfo(*alternative_kernel_info, next_cnode.get()); | |||
| ChangeNodeInferInfo(next_cnode, node, cast_index); | |||
| if (node->inputs().size() < kCastInputNum) { | |||
| MS_LOG(EXCEPTION) << "Op[" << node->DebugString() << "] has wrong input num:"; | |||
| } | |||
| @@ -15,30 +15,9 @@ | |||
| */ | |||
| #include "backend/optimizer/ascend/ir_fusion/adam_apply_one_fusion.h" | |||
| #include "backend/optimizer/common/helper.h" | |||
| #include "backend/session/anf_runtime_algorithm.h" | |||
| namespace mindspore { | |||
| namespace opt { | |||
| AnfNodePtr AdamApplyOneFusion::CreateAdamApplyOneNode(const FuncGraphPtr &func_graph, const EquivPtr &equiv) const { | |||
| MS_EXCEPTION_IF_NULL(func_graph); | |||
| MS_EXCEPTION_IF_NULL(equiv); | |||
| auto prim = std::make_shared<Primitive>(kAdamApplyOneOpName); | |||
| std::vector<AnfNodePtr> new_node_inputs = {NewValueNode(prim)}; | |||
| for (const auto &input_var : input_vars_) { | |||
| auto input_node = utils::cast<AnfNodePtr>((*equiv)[input_var]); | |||
| MS_EXCEPTION_IF_NULL(input_node); | |||
| new_node_inputs.push_back(input_node); | |||
| } | |||
| for (const auto &mul_x_input_var : mul_x_input_vars_) { | |||
| auto mul_x_input_node = utils::cast<AnfNodePtr>((*equiv)[mul_x_input_var]); | |||
| MS_EXCEPTION_IF_NULL(mul_x_input_node); | |||
| new_node_inputs.push_back(mul_x_input_node); | |||
| } | |||
| auto add2_y_node = utils::cast<AnfNodePtr>((*equiv)[add2_y_]); | |||
| MS_EXCEPTION_IF_NULL(add2_y_node); | |||
| new_node_inputs.push_back(add2_y_node); | |||
| auto new_node = func_graph->NewCNode(new_node_inputs); | |||
| return new_node; | |||
| } | |||
| const BaseRef AdamApplyOneFusion::DefinePattern() const { | |||
| const auto prim_sqrt = std::make_shared<Primitive>(kSqrtOpName); | |||
| const auto prim_real_div = std::make_shared<Primitive>(kRealDivOpName); | |||
| @@ -104,16 +83,152 @@ const BaseRef AdamApplyOneCond4Fusion::DefinePattern() const { | |||
| return VectorRef({prim::kPrimSub, input_vars_[3], VectorRef({prim::kPrimMul, true_div0, input_vars_[4]})}); | |||
| } | |||
| const BaseRef AdamApplyOneAssignFusion::DefinePattern() const { | |||
| const auto prim_sqrt = std::make_shared<Primitive>(kSqrtOpName); | |||
| const auto prim_real_div = std::make_shared<Primitive>(kRealDivOpName); | |||
| VectorRef mul2 = VectorRef({prim::kPrimMul, mul_x_input_vars_[2], input_vars_[1]}); | |||
| VectorRef mul3 = VectorRef({prim::kPrimMul, mul_x_input_vars_[3], VectorRef({prim::kPrimSquare, input_vars_[0]})}); | |||
| VectorRef add1 = VectorRef({add1_var_, mul2, mul3}); | |||
| VectorRef sqrt0 = VectorRef({prim_sqrt, add1}); | |||
| VectorRef mul1 = VectorRef({prim::kPrimMul, mul_x_input_vars_[1], input_vars_[0]}); | |||
| VectorRef mul0 = VectorRef({prim::kPrimMul, mul_x_input_vars_[0], input_vars_[2]}); | |||
| VectorRef add0 = VectorRef({add0_var_, mul0, mul1}); | |||
| VectorRef true_div0 = VectorRef({prim_real_div, add0, VectorRef({prim::kPrimTensorAdd, sqrt0, add2_y_})}); | |||
| VectorRef sub0 = VectorRef({sub0_var_, input_vars_[3], VectorRef({prim::kPrimMul, input_vars_[4], true_div0})}); | |||
| VectorRef assign0 = VectorRef({prim::kPrimAssign, input_vars_[3], sub0}); | |||
| VectorRef depend0 = VectorRef({prim::kPrimDepend, sub0, assign0}); | |||
| VectorRef assign1 = VectorRef({prim::kPrimAssign, input_vars_[2], add0}); | |||
| VectorRef depend1 = VectorRef({prim::kPrimDepend, depend0, assign1}); | |||
| VectorRef assign2 = VectorRef({prim::kPrimAssign, input_vars_[1], add1}); | |||
| return VectorRef({prim::kPrimDepend, depend1, assign2}); | |||
| } | |||
| const BaseRef AdamApplyOneAssignCond1Fusion::DefinePattern() const { | |||
| const auto prim_sqrt = std::make_shared<Primitive>(kSqrtOpName); | |||
| const auto prim_real_div = std::make_shared<Primitive>(kRealDivOpName); | |||
| VectorRef mul2 = VectorRef({prim::kPrimMul, mul_x_input_vars_[2], input_vars_[1]}); | |||
| VectorRef mul3 = VectorRef({prim::kPrimMul, mul_x_input_vars_[3], VectorRef({prim::kPrimSquare, input_vars_[0]})}); | |||
| VectorRef add1 = VectorRef({add1_var_, mul2, mul3}); | |||
| VectorRef sqrt0 = VectorRef({prim_sqrt, add1}); | |||
| VectorRef mul1 = VectorRef({prim::kPrimMul, mul_x_input_vars_[1], input_vars_[0]}); | |||
| VectorRef mul0 = VectorRef({prim::kPrimMul, mul_x_input_vars_[0], input_vars_[2]}); | |||
| VectorRef add0 = VectorRef({add0_var_, mul0, mul1}); | |||
| VectorRef true_div0 = VectorRef({prim_real_div, add0, VectorRef({prim::kPrimTensorAdd, add2_y_, sqrt0})}); | |||
| VectorRef sub0 = VectorRef({sub0_var_, input_vars_[3], VectorRef({prim::kPrimMul, input_vars_[4], true_div0})}); | |||
| VectorRef assign0 = VectorRef({prim::kPrimAssign, input_vars_[3], sub0}); | |||
| VectorRef depend0 = VectorRef({prim::kPrimDepend, sub0, assign0}); | |||
| VectorRef assign1 = VectorRef({prim::kPrimAssign, input_vars_[2], add0}); | |||
| VectorRef depend1 = VectorRef({prim::kPrimDepend, depend0, assign1}); | |||
| VectorRef assign2 = VectorRef({prim::kPrimAssign, input_vars_[1], add1}); | |||
| return VectorRef({prim::kPrimDepend, depend1, assign2}); | |||
| } | |||
| const BaseRef AdamApplyOneAssignCond2Fusion::DefinePattern() const { | |||
| const auto prim_sqrt = std::make_shared<Primitive>(kSqrtOpName); | |||
| const auto prim_real_div = std::make_shared<Primitive>(kRealDivOpName); | |||
| VectorRef mul2 = VectorRef({prim::kPrimMul, mul_x_input_vars_[2], input_vars_[1]}); | |||
| VectorRef mul3 = VectorRef({prim::kPrimMul, VectorRef({prim::kPrimSquare, input_vars_[0]}), mul_x_input_vars_[3]}); | |||
| VectorRef add1 = VectorRef({add1_var_, mul2, mul3}); | |||
| VectorRef sqrt0 = VectorRef({prim_sqrt, add1}); | |||
| VectorRef mul1 = VectorRef({prim::kPrimMul, mul_x_input_vars_[1], input_vars_[0]}); | |||
| VectorRef mul0 = VectorRef({prim::kPrimMul, mul_x_input_vars_[0], input_vars_[2]}); | |||
| VectorRef add0 = VectorRef({add0_var_, mul0, mul1}); | |||
| VectorRef true_div0 = VectorRef({prim_real_div, add0, VectorRef({prim::kPrimTensorAdd, sqrt0, add2_y_})}); | |||
| VectorRef sub0 = VectorRef({sub0_var_, input_vars_[3], VectorRef({prim::kPrimMul, true_div0, input_vars_[4]})}); | |||
| VectorRef assign0 = VectorRef({prim::kPrimAssign, input_vars_[3], sub0}); | |||
| VectorRef depend0 = VectorRef({prim::kPrimDepend, sub0, assign0}); | |||
| VectorRef assign1 = VectorRef({prim::kPrimAssign, input_vars_[2], add0}); | |||
| VectorRef depend1 = VectorRef({prim::kPrimDepend, depend0, assign1}); | |||
| VectorRef assign2 = VectorRef({prim::kPrimAssign, input_vars_[1], add1}); | |||
| return VectorRef({prim::kPrimDepend, depend1, assign2}); | |||
| } | |||
| const BaseRef AdamApplyOneAssignCond3Fusion::DefinePattern() const { | |||
| const auto prim_sqrt = std::make_shared<Primitive>(kSqrtOpName); | |||
| const auto prim_real_div = std::make_shared<Primitive>(kRealDivOpName); | |||
| VectorRef mul2 = VectorRef({prim::kPrimMul, mul_x_input_vars_[2], input_vars_[1]}); | |||
| VectorRef mul3 = VectorRef({prim::kPrimMul, mul_x_input_vars_[3], VectorRef({prim::kPrimSquare, input_vars_[0]})}); | |||
| VectorRef add1 = VectorRef({add1_var_, mul2, mul3}); | |||
| VectorRef sqrt0 = VectorRef({prim_sqrt, add1}); | |||
| VectorRef mul1 = VectorRef({prim::kPrimMul, mul_x_input_vars_[1], input_vars_[0]}); | |||
| VectorRef mul0 = VectorRef({prim::kPrimMul, mul_x_input_vars_[0], input_vars_[2]}); | |||
| VectorRef add0 = VectorRef({add0_var_, mul0, mul1}); | |||
| VectorRef true_div0 = VectorRef({prim_real_div, add0, VectorRef({prim::kPrimTensorAdd, sqrt0, add2_y_})}); | |||
| VectorRef sub0 = VectorRef({sub0_var_, input_vars_[3], VectorRef({prim::kPrimMul, true_div0, input_vars_[4]})}); | |||
| VectorRef assign0 = VectorRef({prim::kPrimAssign, input_vars_[3], sub0}); | |||
| VectorRef depend0 = VectorRef({prim::kPrimDepend, sub0, assign0}); | |||
| VectorRef assign1 = VectorRef({prim::kPrimAssign, input_vars_[2], add0}); | |||
| VectorRef depend1 = VectorRef({prim::kPrimDepend, depend0, assign1}); | |||
| VectorRef assign2 = VectorRef({prim::kPrimAssign, input_vars_[1], add1}); | |||
| return VectorRef({prim::kPrimDepend, depend1, assign2}); | |||
| } | |||
| const BaseRef AdamApplyOneAssignCond4Fusion::DefinePattern() const { | |||
| const auto prim_sqrt = std::make_shared<Primitive>(kSqrtOpName); | |||
| const auto prim_real_div = std::make_shared<Primitive>(kRealDivOpName); | |||
| VectorRef mul2 = VectorRef({prim::kPrimMul, mul_x_input_vars_[2], input_vars_[1]}); | |||
| VectorRef mul3 = VectorRef({prim::kPrimMul, mul_x_input_vars_[3], VectorRef({prim::kPrimSquare, input_vars_[0]})}); | |||
| VectorRef add1 = VectorRef({add1_var_, mul2, mul3}); | |||
| VectorRef sqrt0 = VectorRef({prim_sqrt, add1}); | |||
| VectorRef mul1 = VectorRef({prim::kPrimMul, mul_x_input_vars_[1], input_vars_[0]}); | |||
| VectorRef mul0 = VectorRef({prim::kPrimMul, mul_x_input_vars_[0], input_vars_[2]}); | |||
| VectorRef add0 = VectorRef({add0_var_, mul0, mul1}); | |||
| VectorRef true_div0 = VectorRef({prim_real_div, add0, VectorRef({prim::kPrimTensorAdd, add2_y_, sqrt0})}); | |||
| VectorRef sub0 = VectorRef({sub0_var_, input_vars_[3], VectorRef({prim::kPrimMul, true_div0, input_vars_[4]})}); | |||
| VectorRef assign0 = VectorRef({prim::kPrimAssign, input_vars_[3], sub0}); | |||
| VectorRef depend0 = VectorRef({prim::kPrimDepend, sub0, assign0}); | |||
| VectorRef assign1 = VectorRef({prim::kPrimAssign, input_vars_[2], add0}); | |||
| VectorRef depend1 = VectorRef({prim::kPrimDepend, depend0, assign1}); | |||
| VectorRef assign2 = VectorRef({prim::kPrimAssign, input_vars_[1], add1}); | |||
| return VectorRef({prim::kPrimDepend, depend1, assign2}); | |||
| } | |||
| AnfNodePtr AdamApplyOneFusion::CreateAdamApplyOneNode(const FuncGraphPtr &func_graph, const EquivPtr &equiv, | |||
| const AnfNodePtr &final_node) const { | |||
| MS_EXCEPTION_IF_NULL(func_graph); | |||
| MS_EXCEPTION_IF_NULL(equiv); | |||
| PrimitivePtr prim = nullptr; | |||
| if (AnfAlgo::CheckPrimitiveType(final_node, prim::kPrimDepend)) { | |||
| prim = std::make_shared<Primitive>(kAdamApplyOneAssignOpName); | |||
| } else { | |||
| prim = std::make_shared<Primitive>(kAdamApplyOneOpName); | |||
| } | |||
| std::vector<AnfNodePtr> new_node_inputs = {NewValueNode(prim)}; | |||
| for (const auto &input_var : input_vars_) { | |||
| auto input_node = utils::cast<AnfNodePtr>((*equiv)[input_var]); | |||
| MS_EXCEPTION_IF_NULL(input_node); | |||
| new_node_inputs.push_back(input_node); | |||
| } | |||
| for (const auto &mul_x_input_var : mul_x_input_vars_) { | |||
| auto mul_x_input_node = utils::cast<AnfNodePtr>((*equiv)[mul_x_input_var]); | |||
| MS_EXCEPTION_IF_NULL(mul_x_input_node); | |||
| new_node_inputs.push_back(mul_x_input_node); | |||
| } | |||
| auto add2_y_node = utils::cast<AnfNodePtr>((*equiv)[add2_y_]); | |||
| MS_EXCEPTION_IF_NULL(add2_y_node); | |||
| new_node_inputs.push_back(add2_y_node); | |||
| auto new_node = func_graph->NewCNode(new_node_inputs); | |||
| return new_node; | |||
| } | |||
| const AnfNodePtr AdamApplyOneFusion::Process(const FuncGraphPtr &func_graph, const AnfNodePtr &node, | |||
| const EquivPtr &equiv) const { | |||
| MS_EXCEPTION_IF_NULL(func_graph); | |||
| MS_EXCEPTION_IF_NULL(node); | |||
| if (!CheckSupportDataType(node, kFloatDataTypeSet)) { | |||
| auto sub0 = node; | |||
| if (AnfAlgo::CheckPrimitiveType(node, prim::kPrimDepend)) { | |||
| auto iter_sub0 = (*equiv).find(sub0_var_); | |||
| if (iter_sub0 == (*equiv).end()) { | |||
| MS_LOG(EXCEPTION) << "The equiv map is expected to contains the sub0 var after matched."; | |||
| } | |||
| sub0 = utils::cast<AnfNodePtr>(iter_sub0->second); | |||
| } | |||
| MS_EXCEPTION_IF_NULL(sub0); | |||
| if (!CheckSupportDataType(sub0, kFloatDataTypeSet)) { | |||
| return nullptr; | |||
| } | |||
| auto new_node = CreateAdamApplyOneNode(func_graph, equiv); | |||
| auto new_node = CreateAdamApplyOneNode(func_graph, equiv, node); | |||
| MS_EXCEPTION_IF_NULL(new_node); | |||
| new_node->set_scope(node->scope()); | |||
| new_node->set_scope(sub0->scope()); | |||
| // Set abstract of new node | |||
| AbstractBasePtrList new_node_abstract_list; | |||
| auto iter_add0 = (*equiv).find(add0_var_); | |||
| @@ -130,7 +245,7 @@ const AnfNodePtr AdamApplyOneFusion::Process(const FuncGraphPtr &func_graph, con | |||
| MS_EXCEPTION_IF_NULL(add1); | |||
| new_node_abstract_list.push_back(add1->abstract()); | |||
| new_node_abstract_list.push_back(add0->abstract()); | |||
| new_node_abstract_list.push_back(node->abstract()); | |||
| new_node_abstract_list.push_back(sub0->abstract()); | |||
| auto abstract_tuple = std::make_shared<abstract::AbstractTuple>(new_node_abstract_list); | |||
| new_node->set_abstract(abstract_tuple); | |||
| // Create tuple_getitem node for outputs | |||
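
All five AdamApplyOneAssign patterns above match the same arithmetic, differing only in the order of commutative operands and in whether add2_y_ appears before or after the sqrt. Written out as scalars (names inferred from the pattern graph, purely illustrative), the expression they fuse is:

#include <cmath>
#include <iostream>

struct AdamApplyOneOut {
  float new_v;      // add1
  float new_m;      // add0
  float new_param;  // sub0
};

AdamApplyOneOut AdamApplyOne(float grad, float v, float m, float param, float lr,
                             float mul_x0, float mul_x1, float mul_x2, float mul_x3,
                             float add2_y) {
  float add0 = mul_x0 * m + mul_x1 * grad;         // first-moment style update
  float add1 = mul_x2 * v + mul_x3 * grad * grad;  // second-moment style update
  float sub0 = param - lr * add0 / (std::sqrt(add1) + add2_y);
  return {add1, add0, sub0};
}

int main() {
  auto out = AdamApplyOne(0.1f, 0.5f, 0.2f, 1.0f, 0.01f,
                          0.9f, 0.1f, 0.999f, 0.001f, 1e-8f);
  std::cout << out.new_param << '\n';
  return 0;
}

The Assign variants additionally write add0/add1/sub0 back through Assign and chain the writes with Depend, which is why Process above unwraps the Depend node to recover sub0 before building the fused node.
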
| @@ -40,6 +40,7 @@ class AdamApplyOneFusion : public PatternProcessPass { | |||
| add2_y_ = std::make_shared<Var>(); | |||
| add0_var_ = std::make_shared<Var>(std::make_shared<Primitive>(prim::kPrimTensorAdd->name())); | |||
| add1_var_ = std::make_shared<Var>(std::make_shared<Primitive>(prim::kPrimTensorAdd->name())); | |||
| sub0_var_ = std::make_shared<Var>(std::make_shared<Primitive>(prim::kPrimSub->name())); | |||
| } | |||
| ~AdamApplyOneFusion() override = default; | |||
| @@ -47,12 +48,14 @@ class AdamApplyOneFusion : public PatternProcessPass { | |||
| const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override; | |||
| protected: | |||
| AnfNodePtr CreateAdamApplyOneNode(const FuncGraphPtr &func_graph, const EquivPtr &equiv) const; | |||
| AnfNodePtr CreateAdamApplyOneNode(const FuncGraphPtr &func_graph, const EquivPtr &equiv, | |||
| const AnfNodePtr &final_node) const; | |||
| std::vector<VarPtr> input_vars_; | |||
| std::vector<VarPtr> mul_x_input_vars_; | |||
| VarPtr add2_y_; | |||
| VarPtr add0_var_; | |||
| VarPtr add1_var_; | |||
| VarPtr sub0_var_; | |||
| }; | |||
| class AdamApplyOneCond1Fusion : public AdamApplyOneFusion { | |||
| @@ -90,6 +93,51 @@ class AdamApplyOneCond4Fusion : public AdamApplyOneFusion { | |||
| ~AdamApplyOneCond4Fusion() override = default; | |||
| const BaseRef DefinePattern() const override; | |||
| }; | |||
| class AdamApplyOneAssignFusion : public AdamApplyOneFusion { | |||
| public: | |||
| explicit AdamApplyOneAssignFusion(bool multigraph = true) | |||
| : AdamApplyOneFusion("adam_apply_one_assign_fusion", multigraph) {} | |||
| ~AdamApplyOneAssignFusion() override = default; | |||
| const BaseRef DefinePattern() const override; | |||
| }; | |||
| class AdamApplyOneAssignCond1Fusion : public AdamApplyOneFusion { | |||
| public: | |||
| explicit AdamApplyOneAssignCond1Fusion(bool multigraph = true) | |||
| : AdamApplyOneFusion("adam_apply_one_assign_cond1_fusion", multigraph) {} | |||
| ~AdamApplyOneAssignCond1Fusion() override = default; | |||
| const BaseRef DefinePattern() const override; | |||
| }; | |||
| class AdamApplyOneAssignCond2Fusion : public AdamApplyOneFusion { | |||
| public: | |||
| explicit AdamApplyOneAssignCond2Fusion(bool multigraph = true) | |||
| : AdamApplyOneFusion("adam_apply_one_assign_cond2_fusion", multigraph) {} | |||
| ~AdamApplyOneAssignCond2Fusion() override = default; | |||
| const BaseRef DefinePattern() const override; | |||
| }; | |||
| class AdamApplyOneAssignCond3Fusion : public AdamApplyOneFusion { | |||
| public: | |||
| explicit AdamApplyOneAssignCond3Fusion(bool multigraph = true) | |||
| : AdamApplyOneFusion("adam_apply_one_assign_cond3_fusion", multigraph) {} | |||
| ~AdamApplyOneAssignCond3Fusion() override = default; | |||
| const BaseRef DefinePattern() const override; | |||
| }; | |||
| class AdamApplyOneAssignCond4Fusion : public AdamApplyOneFusion { | |||
| public: | |||
| explicit AdamApplyOneAssignCond4Fusion(bool multigraph = true) | |||
| : AdamApplyOneFusion("adam_apply_one_assign_cond4_fusion", multigraph) {} | |||
| ~AdamApplyOneAssignCond4Fusion() override = default; | |||
| const BaseRef DefinePattern() const override; | |||
| }; | |||
| } // namespace opt | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_IR_FUSION_ADAM_APPLY_ONE_FUSION_H_ | |||
| @@ -62,7 +62,14 @@ const AnfNodePtr ConvertTupleOutputToMaketuple::Process(const FuncGraphPtr &func | |||
| auto cnode = node->cast<CNodePtr>(); | |||
| MS_EXCEPTION_IF_NULL(cnode); | |||
| std::unordered_map<AnfNodePtr, AnfNodePtr> transed_nodes; | |||
| if (IsPrimitiveCNode(cnode, prim::kPrimTupleGetItem) || IsPrimitiveCNode(cnode, prim::kPrimControlDepend)) { | |||
| if (IsPrimitiveCNode(cnode, prim::kPrimTupleGetItem)) { | |||
| auto real_input = AnfAlgo::GetTupleGetItemRealInput(cnode); | |||
| MS_EXCEPTION_IF_NULL(real_input); | |||
| if (!real_input->isa<Parameter>() && !real_input->isa<ValueNode>()) { | |||
| return nullptr; | |||
| } | |||
| } | |||
| if (IsPrimitiveCNode(cnode, prim::kPrimControlDepend)) { | |||
| return nullptr; | |||
| } | |||
| bool cnode_input_changed = false; | |||
| @@ -863,7 +863,7 @@ bool TensorNeedSync(const AnfNodePtr ¶meter, const tensor::TensorPtr &tensor | |||
| auto ms_context = MsContext::GetInstance(); | |||
| MS_EXCEPTION_IF_NULL(ms_context); | |||
| auto device_address = AnfAlgo::GetMutableOutputAddr(parameter, 0); | |||
| if (ms_context->enable_pynative_infer()) { | |||
| if (ms_context->execution_mode() == kPynativeMode) { | |||
| return tensor->device_address().get() == nullptr || tensor->device_address() != device_address; | |||
| } | |||
| if (tensor->is_dirty()) { | |||
| @@ -393,40 +393,5 @@ ValuePtr BoolEq(const ValuePtrList &list) { | |||
| MS_LOG(EXCEPTION) << "Unsported Value for BoolEq, x: " << x->ToString() << "."; | |||
| } | |||
| std::vector<int> BroadcastShape_(std::vector<int> shpx, std::vector<int> shpy) { | |||
| int dlen = SizeToInt(shpx.size()) - SizeToInt(shpy.size()); | |||
| if (dlen < 0) { | |||
| for (int i = 0; i < -dlen; ++i) { | |||
| (void)shpx.insert(shpx.begin(), 1); | |||
| } | |||
| } else if (dlen > 0) { | |||
| for (int i = 0; i < dlen; i++) { | |||
| (void)shpy.insert(shpy.begin(), 1); | |||
| } | |||
| } | |||
| if (shpx.size() != shpy.size()) { | |||
| MS_LOG(EXCEPTION) << "Failure: shpx.size() != shpy.size()."; | |||
| } | |||
| std::vector<int> shp; | |||
| for (size_t i = 0; i < shpx.size(); i++) { | |||
| auto a = shpx[i]; | |||
| auto b = shpy[i]; | |||
| if (a == 1) { | |||
| shp.push_back(b); | |||
| } else if (b == 1) { | |||
| shp.push_back(a); | |||
| } else if (a == -1) { | |||
| shp.push_back(b); | |||
| } else if (b == -1) { | |||
| shp.push_back(a); | |||
| } else if (a == b) { | |||
| shp.push_back(a); | |||
| } else { | |||
| return std::vector<int>(); | |||
| } | |||
| } | |||
| return shp; | |||
| } | |||
| } // namespace prim | |||
| } // namespace mindspore | |||
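
The helper removed above encodes the usual rank-aligned broadcasting rule. A standalone restatement with a worked example, for reference:

// Shapes are right-aligned by front-padding the shorter one with 1s; each axis
// pair must then be equal or contain a 1 (or -1 as a wildcard). An empty result
// means the shapes are not broadcastable.
#include <iostream>
#include <vector>

std::vector<int> BroadcastShape(std::vector<int> x, std::vector<int> y) {
  while (x.size() < y.size()) x.insert(x.begin(), 1);
  while (y.size() < x.size()) y.insert(y.begin(), 1);
  std::vector<int> out;
  for (std::size_t i = 0; i < x.size(); ++i) {
    int a = x[i], b = y[i];
    if (a == 1 || a == -1) {
      out.push_back(b);
    } else if (b == 1 || b == -1) {
      out.push_back(a);
    } else if (a == b) {
      out.push_back(a);
    } else {
      return {};  // incompatible axis
    }
  }
  return out;
}

int main() {
  for (int d : BroadcastShape({2, 1, 3}, {4, 3})) std::cout << d << ' ';  // 2 4 3
  std::cout << '\n';
  return 0;
}
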
| @@ -52,7 +52,6 @@ ValuePtr BoolNot(const ValuePtrList &list); | |||
| ValuePtr BoolAnd(const ValuePtrList &list); | |||
| ValuePtr BoolOr(const ValuePtrList &list); | |||
| ValuePtr BoolEq(const ValuePtrList &list); | |||
| std::vector<int> BroadcastShape_(std::vector<int> s1, std::vector<int> s2); | |||
| } // namespace prim | |||
| } // namespace mindspore | |||
| @@ -333,28 +333,28 @@ ArgsPairList HyperMap::Harmonize(const FuncGraphPtr &func_graph, const ArgsPairL | |||
| } | |||
| FuncGraphPtr HyperMap::GenerateFromTypes(const TypePtrList &args_spec_list) { | |||
| FuncGraphPtr ptrGraph = std::make_shared<FuncGraph>(); | |||
| ptrGraph->set_flag(FUNC_GRAPH_FLAG_CORE, true); | |||
| ptrGraph->set_flag(FUNC_GRAPH_FLAG_SPECIALIZE_PARAMETER, true); | |||
| ptrGraph->debug_info()->set_name("hyper_map"); | |||
| FuncGraphPtr ptr_graph = std::make_shared<FuncGraph>(); | |||
| ptr_graph->set_flag(FUNC_GRAPH_FLAG_CORE, true); | |||
| ptr_graph->set_flag(FUNC_GRAPH_FLAG_SPECIALIZE_PARAMETER, true); | |||
| ptr_graph->debug_info()->set_name("hyper_map"); | |||
| AnfNodePtr ptrFnArg = nullptr; | |||
| std::size_t i = 0; | |||
| ArgsPairList argmap; | |||
| ArgsPairList argmap2; | |||
| if (fn_leaf_ == nullptr) { | |||
| ptrFnArg = ptrGraph->add_parameter(); | |||
| ptrFnArg = ptr_graph->add_parameter(); | |||
| i = 1; | |||
| } | |||
| std::size_t size = args_spec_list.size(); | |||
| for (; i < size; ++i) { | |||
| argmap.push_back(std::make_pair(ptrGraph->add_parameter(), args_spec_list[i])); | |||
| argmap.push_back(std::make_pair(ptr_graph->add_parameter(), args_spec_list[i])); | |||
| } | |||
| argmap2 = Harmonize(ptrGraph, argmap); | |||
| ptrGraph->set_output(Make(ptrGraph, ptrFnArg, argmap2)); | |||
| return ptrGraph; | |||
| argmap2 = Harmonize(ptr_graph, argmap); | |||
| ptr_graph->set_output(Make(ptr_graph, ptrFnArg, argmap2)); | |||
| return ptr_graph; | |||
| } | |||
| abstract::AbstractBasePtrList HyperMap::NormalizeArgs(const AbstractBasePtrList &args_spec_list) const { | |||
| @@ -582,30 +582,30 @@ FuncGraphPtr GradOperation::GetGrad(AnfNodePtr node, const AnfNodePtr &weights, | |||
| inputs.push_back(opsTupleItem); | |||
| inputs.push_back(cnode); | |||
| inputs.push_back(NewValueNode(1)); | |||
| AnfNodePtr ptrBprop = ret->NewCNode(inputs); | |||
| AnfNodePtr ptr_bprop = ret->NewCNode(inputs); | |||
| doGetGrad(ret, out, ptrBprop, weights_node, opsTupleItem); | |||
| doGetGrad(ret, out, ptr_bprop, weights_node, opsTupleItem); | |||
| return ret; | |||
| } | |||
| void GradOperation::doGetGrad(const FuncGraphPtr &func_graph, AnfNodePtr out, AnfNodePtr ptrBprop, AnfNodePtr weights, | |||
| void GradOperation::doGetGrad(const FuncGraphPtr &func_graph, AnfNodePtr out, AnfNodePtr ptr_bprop, AnfNodePtr weights, | |||
| ValueNodePtr opsTupleItem) { | |||
| MS_EXCEPTION_IF_NULL(func_graph); | |||
| AnfNodePtr ptrBPropArg = nullptr; | |||
| AnfNodePtr ptr_bprop_arg = nullptr; | |||
| if (sens_param_) { | |||
| ptrBPropArg = func_graph->add_parameter(); | |||
| ptr_bprop_arg = func_graph->add_parameter(); | |||
| } else { | |||
| auto ones_like = prim::GetPythonOps("ones_like"); | |||
| ptrBPropArg = func_graph->NewCNode({NewValueNode(ones_like), out}); | |||
| ptr_bprop_arg = func_graph->NewCNode({NewValueNode(ones_like), out}); | |||
| } | |||
| AnfNodePtr ptrBApp = func_graph->NewCNode({ptrBprop, ptrBPropArg}); | |||
| AnfNodePtr ptr_bapp = func_graph->NewCNode({ptr_bprop, ptr_bprop_arg}); | |||
| CNodePtr fv_bprop = nullptr; | |||
| if (get_by_list_) { | |||
| // python code: grads = hyper_map(F.partial(env_get, env), weights) | |||
| AnfNodePtr env = func_graph->NewCNode({NewValueNode(prim::kPrimTupleGetItem), ptrBApp, NewValueNode(0)}); | |||
| AnfNodePtr env = func_graph->NewCNode({NewValueNode(prim::kPrimTupleGetItem), ptr_bapp, NewValueNode(0)}); | |||
| AnfNodePtr partial_env_get = | |||
| func_graph->NewCNode({NewValueNode(prim::kPrimPartial), NewValueNode(prim::GetPythonOps("env_get")), env}); | |||
| MetaFuncGraphPtr hyper_map = std::make_shared<HyperMap>(); | |||
| @@ -614,7 +614,7 @@ void GradOperation::doGetGrad(const FuncGraphPtr &func_graph, AnfNodePtr out, An | |||
| CNodePtr inputs_bprop = nullptr; | |||
| if (get_all_) { | |||
| inputs_bprop = func_graph->NewCNode({NewValueNode(kTail), ptrBApp}); | |||
| inputs_bprop = func_graph->NewCNode({NewValueNode(kTail), ptr_bapp}); | |||
| } | |||
| // Gradients wrt inputs and parameters | |||
| @@ -636,8 +636,8 @@ void GradOperation::doGetGrad(const FuncGraphPtr &func_graph, AnfNodePtr out, An | |||
| } | |||
| // Gradients wrt first input. | |||
| // ptrBApp returns (EnvInstance(grads wrt params), grads wrt input0, grads wrt input1, ...), so 1 is for first input | |||
| func_graph->set_output(func_graph->NewCNode({opsTupleItem, ptrBApp, NewValueNode(1)})); | |||
| // ptr_bapp returns (EnvInstance(grads wrt params), grads wrt input0, grads wrt input1, ...), so 1 is for first input | |||
| func_graph->set_output(func_graph->NewCNode({opsTupleItem, ptr_bapp, NewValueNode(1)})); | |||
| } | |||
| // Generate the graph. | |||
| @@ -657,35 +657,35 @@ FuncGraphPtr GradOperation::GenerateFuncGraph(const AbstractBasePtrList &args_sp | |||
| auto real_fn = dyn_cast<FuncGraphAbstractClosure>(fn); | |||
| MS_EXCEPTION_IF_NULL(real_fn); | |||
| FuncGraphPtr ptrGraph = real_fn->func_graph(); | |||
| MS_EXCEPTION_IF_NULL(ptrGraph); | |||
| TraceManager::DebugTrace(std::make_shared<TraceGradOperation>(ptrGraph->debug_info())); | |||
| FuncGraphPtr dfBuilder = std::make_shared<FuncGraph>(); | |||
| FuncGraphPtr ptr_graph = real_fn->func_graph(); | |||
| MS_EXCEPTION_IF_NULL(ptr_graph); | |||
| TraceManager::DebugTrace(std::make_shared<TraceGradOperation>(ptr_graph->debug_info())); | |||
| FuncGraphPtr df_builder = std::make_shared<FuncGraph>(); | |||
| TraceManager::EndTrace(); | |||
| auto nparam = ptrGraph->parameters().size(); | |||
| auto nparam = ptr_graph->parameters().size(); | |||
| std::ostringstream ss; | |||
| ss << "grad{" << nparam << "}"; | |||
| dfBuilder->set_flag(FUNC_GRAPH_FLAG_CORE, true); | |||
| dfBuilder->debug_info()->set_name(ss.str()); | |||
| ParameterPtr param_graph = dfBuilder->add_parameter(); | |||
| df_builder->set_flag(FUNC_GRAPH_FLAG_CORE, true); | |||
| df_builder->debug_info()->set_name(ss.str()); | |||
| ParameterPtr param_graph = df_builder->add_parameter(); | |||
| AnfNodePtr weights = nullptr; | |||
| if (get_by_list_) { | |||
| weights = dfBuilder->add_parameter(); | |||
| weights = df_builder->add_parameter(); | |||
| } | |||
| std::vector<AnfNodePtr> inputs; | |||
| inputs.push_back(NewValueNode(prim::kPrimJ)); | |||
| inputs.push_back(param_graph); | |||
| auto jf = dfBuilder->NewCNode(inputs); | |||
| auto jf = df_builder->NewCNode(inputs); | |||
| // df is checked in GetGrad | |||
| TraceManager::DebugTrace(std::make_shared<TraceGradOperation>(ptrGraph->debug_info())); | |||
| auto df = GetGrad(jf, weights, ptrGraph->parameters()); | |||
| TraceManager::DebugTrace(std::make_shared<TraceGradOperation>(ptr_graph->debug_info())); | |||
| auto df = GetGrad(jf, weights, ptr_graph->parameters()); | |||
| TraceManager::EndTrace(); | |||
| dfBuilder->set_output(NewValueNode(df)); | |||
| df_builder->set_output(NewValueNode(df)); | |||
| return dfBuilder; | |||
| return df_builder; | |||
| } | |||
| REGISTER_PYBIND_DEFINE(GradOperation_, ([](const py::module *m) { | |||
| @@ -929,7 +929,7 @@ void GenerateTupleSliceParameter(const AbstractTuplePtr &tuple, const AbstractSl | |||
| *step_value = CheckSliceMember(slice->step(), step_default, step_name); | |||
| if (*step_value == 0) { | |||
| MS_LOG(EXCEPTION) << "TupleSlice require the step value could not be 0, but got 0."; | |||
| MS_EXCEPTION(ValueError) << "TupleSlice require the step value could not be 0, but got 0."; | |||
| } | |||
| if (*step_value < 0) { | |||
| @@ -941,8 +941,8 @@ void GenerateTupleSliceParameter(const AbstractTuplePtr &tuple, const AbstractSl | |||
| *stop_index = CheckSliceMember(slice->stop(), stop_default, stop_name); | |||
| if (!CheckIndexInRange(*start_index, -tuple_size, tuple_size - 1) || | |||
| !CheckIndexInRange(*stop_index, -tuple_size - 1, tuple_size)) { | |||
| MS_LOG(EXCEPTION) << "TupleSlice the start index " << *start_index << " or end end index " << *stop_index | |||
| << " out of range, tuple size " << tuple_size << "."; | |||
| MS_EXCEPTION(ValueError) << "TupleSlice the start index " << *start_index << " or end end index " << *stop_index | |||
| << " out of range, tuple size " << tuple_size << "."; | |||
| } | |||
| *start_index = GetPositiveIndex(*start_index, tuple_size); | |||
| @@ -72,10 +72,15 @@ void SetMaxType(TypeId *max_type_id, size_t *max_type_number, const TypeId type_ | |||
| bool GetTensorOrScalarTypeInfo(AbstractBasePtr arg_value, bool is_write, TypeId *arg_type_id, | |||
| TypeId *arg_type = nullptr) { | |||
| if (arg_value->isa<abstract::AbstractRef>()) { | |||
| if (is_write) { | |||
| arg_value = arg_value->cast<abstract::AbstractRefPtr>()->ref_origin(); | |||
| } else { | |||
| arg_value = arg_value->cast<abstract::AbstractRefPtr>()->ref(); | |||
| auto ref = arg_value->cast<abstract::AbstractRefPtr>(); | |||
| arg_value = ref->ref(); | |||
| if (!is_write && ref->need_cast()) { | |||
| auto tensor_type = ref->target_type(); | |||
| *arg_type_id = tensor_type->type_id(); | |||
| if (arg_type != nullptr) { | |||
| *arg_type = kObjectTypeTensorType; | |||
| } | |||
| return true; | |||
| } | |||
| } | |||
| if (arg_value->isa<abstract::AbstractTensor>()) { | |||
| @@ -248,6 +253,8 @@ void DoAutoCast(const std::string &func_name, const std::vector<Signature> &sign | |||
| if (arg_value->isa<abstract::AbstractTensor>() && arg_type_id == it->second) { | |||
| continue; | |||
| } | |||
| MS_LOG(DEBUG) << "do cast for inputs " << i << " " << (*op_inputs)[i + 1]->ToString() << " " << arg_type_id | |||
| << " to " << it->second; | |||
| (*op_inputs)[i + 1] = DoCast((*op_inputs)[i + 1], it->second, graph); | |||
| } | |||
| } | |||
| @@ -289,16 +296,23 @@ AnfNodePtr BuildNewCNode(const FuncGraphPtr &func_graph, const std::string &func | |||
| TypePtr type = args_spec_list[i]->GetTypeTrack(); | |||
| if (type && type->type_id() == kObjectTypeRef) { | |||
| auto ref_abs = args_spec_list[i]->cast<abstract::AbstractRefPtr>(); | |||
| if (sig == SignatureEnumRW::kRWRead) { | |||
| param = func_graph->NewCNode({NewValueNode(prim::kPrimGetRefValue), param}); | |||
| param = NewCNode({NewValueNode(prim::kPrimGetRefValue), param}, func_graph); | |||
| if (ref_abs && ref_abs->need_cast()) { | |||
| auto cast = prim::GetPythonOps("cast", "mindspore.ops.functional"); | |||
| param = NewCNode({NewValueNode(cast), param, NewValueNode(ref_abs->target_type())}, func_graph); | |||
| } | |||
| } else if (sig == SignatureEnumRW::kRWWrite) { | |||
| param = func_graph->NewCNode({NewValueNode(prim::kPrimGetRefOrigin), param}); | |||
| param = NewCNode({NewValueNode(prim::kPrimGetRefValue), param}, func_graph); | |||
| write_indices.insert(i); | |||
| } | |||
| // If sig is SignatureEnumRW::kRWRef, not do anything. | |||
| } else if (sig == SignatureEnumRW::kRWWrite && type->type_id() != kObjectTypeRefKey) { | |||
| MS_EXCEPTION(TypeError) << "Function " << func_name << "'s input " << i << " should be a Parameter."; | |||
| } | |||
| MS_LOG(DEBUG) << "Function " << func_name << "'s input " << i << " " << param->DebugString(2) << " type " | |||
| << args_spec_list[i]->ToString(); | |||
| op_inputs.push_back(param); | |||
| } | |||
| // process default | |||
| @@ -49,13 +49,14 @@ FuncGraphPtr UnpackCall::GenerateFuncGraph(const AbstractBasePtrList &args_spec_ | |||
| MS_LOG(EXCEPTION) << op_name << " requires at least two args, but got " << arg_length << "."; | |||
| } | |||
| (void)abstract::CheckArg<AbstractFunction>(op_name, args_spec_list, 0); | |||
| // No need to check, check will be done in infer. | |||
| auto ret_graph = std::make_shared<FuncGraph>(); | |||
| ret_graph->set_flag(FUNC_GRAPH_FLAG_CORE, true); | |||
| ret_graph->debug_info()->set_name("UnpackCall"); | |||
| AnfNodePtr fnNode = ret_graph->add_parameter(); | |||
| AnfNodePtr fn_node = ret_graph->add_parameter(); | |||
| std::vector<AnfNodePtr> elems; | |||
| elems.push_back(fnNode); | |||
| elems.push_back(fn_node); | |||
| for (size_t index = 1; index < arg_length; index++) { | |||
| MS_EXCEPTION_IF_NULL(args_spec_list[index]); | |||
| if (args_spec_list[index]->isa<AbstractTuple>()) { | |||
| @@ -31,160 +31,43 @@ ValuePtr GetPythonOps(const std::string &op_name, | |||
| const std::string &module_name = "mindspore._extends.parse.standard_method", | |||
| bool use_signature = false); | |||
| // Arithmetic | |||
| inline const PrimitivePtr kPrimScalarAdd = std::make_shared<Primitive>("scalar_add"); | |||
| inline const PrimitivePtr kPrimScalarSub = std::make_shared<Primitive>("scalar_sub"); | |||
| inline const PrimitivePtr kPrimScalarMul = std::make_shared<Primitive>("scalar_mul"); | |||
| inline const PrimitivePtr kPrimScalarDiv = std::make_shared<Primitive>("scalar_div"); | |||
| inline const PrimitivePtr kPrimScalarFloordiv = std::make_shared<Primitive>("scalar_floordiv"); | |||
| inline const PrimitivePtr kPrimScalarMod = std::make_shared<Primitive>("scalar_mod"); | |||
| inline const PrimitivePtr kPrimScalarPow = std::make_shared<Primitive>("scalar_pow"); | |||
| inline const PrimitivePtr kPrimScalarTrunc = std::make_shared<Primitive>("scalar_trunc"); | |||
| inline const PrimitivePtr kPrimScalarFloor = std::make_shared<Primitive>("scalar_floor"); | |||
| inline const PrimitivePtr kPrimScalarUadd = std::make_shared<Primitive>("scalar_uadd"); | |||
| inline const PrimitivePtr kPrimScalarUsub = std::make_shared<Primitive>("scalar_usub"); | |||
| inline const PrimitivePtr kPrimScalarExp = std::make_shared<Primitive>("scalar_exp"); | |||
| inline const PrimitivePtr kPrimScalarLog = std::make_shared<Primitive>("scalar_log"); | |||
| inline const PrimitivePtr kPrimScalarSin = std::make_shared<Primitive>("scalar_sin"); | |||
| inline const PrimitivePtr kPrimScalarCos = std::make_shared<Primitive>("scalar_cos"); | |||
| inline const PrimitivePtr kPrimScalarTan = std::make_shared<Primitive>("scalar_tan"); | |||
| // Comparisons | |||
| inline const PrimitivePtr kPrimScalarEq = std::make_shared<Primitive>("scalar_eq"); | |||
| inline const PrimitivePtr kPrimScalarLt = std::make_shared<Primitive>("scalar_lt"); | |||
| inline const PrimitivePtr kPrimScalarGt = std::make_shared<Primitive>("scalar_gt"); | |||
| inline const PrimitivePtr kPrimScalarNe = std::make_shared<Primitive>("scalar_ne"); | |||
| inline const PrimitivePtr kPrimScalarLe = std::make_shared<Primitive>("scalar_le"); | |||
| inline const PrimitivePtr kPrimScalarGe = std::make_shared<Primitive>("scalar_ge"); | |||
| inline const PrimitivePtr kPrimBoolNot = std::make_shared<Primitive>("bool_not"); | |||
| inline const PrimitivePtr kPrimBoolAnd = std::make_shared<Primitive>("bool_and"); | |||
| inline const PrimitivePtr kPrimBoolOr = std::make_shared<Primitive>("bool_or"); | |||
| inline const PrimitivePtr kPrimBoolEq = std::make_shared<Primitive>("bool_eq"); | |||
| inline const PrimitivePtr kPrimGreater = std::make_shared<Primitive>("Greater"); | |||
| inline const PrimitivePtr kPrimGreaterEqual = std::make_shared<Primitive>("GreaterEqual"); | |||
| inline const PrimitivePtr kPrimLess = std::make_shared<Primitive>("Less"); | |||
| inline const PrimitivePtr kPrimLessEqual = std::make_shared<Primitive>("LessEqual"); | |||
| inline const PrimitivePtr kPrimEqual = std::make_shared<Primitive>("Equal"); | |||
| inline const PrimitivePtr kPrimNotEqual = std::make_shared<Primitive>("NotEqual"); | |||
| // Primitives only used by frontend; | |||
| // Type introspection | |||
| inline const PrimitivePtr kPrimTypeOf = std::make_shared<Primitive>("typeof"); | |||
| inline const PrimitivePtr kPrimHasType = std::make_shared<Primitive>("hastype"); | |||
| inline const PrimitivePtr kPrimDistribute = std::make_shared<Primitive>("distribute"); | |||
| inline const PrimitivePtr kPrimDot = std::make_shared<Primitive>("dot"); | |||
| inline const PrimitivePtr kPrimIm2Col = std::make_shared<Primitive>("im2col"); | |||
| inline const PrimitivePtr kPrimCol2Im = std::make_shared<Primitive>("col2im"); | |||
| inline const PrimitivePtr kPrimIm2ColV1 = std::make_shared<Primitive>("im2col_v1"); | |||
| inline const PrimitivePtr kPrimCol2ImV1 = std::make_shared<Primitive>("col2im_v1"); | |||
| inline const PrimitivePtr kPrimResolve = std::make_shared<Primitive>("resolve"); | |||
| inline const PrimitivePtr kPrimEmbed = std::make_shared<Primitive>("embed"); | |||
| inline const PrimitivePtr kPrimRefToEmbed = std::make_shared<Primitive>("RefToEmbed"); | |||
| inline const PrimitivePtr kPrimCreateInstance = std::make_shared<Primitive>("create_instance"); | |||
| inline const PrimitivePtr kPrimLabelGoto = std::make_shared<Primitive>("LabelGoto"); | |||
| inline const PrimitivePtr kPrimLabelSwitch = std::make_shared<Primitive>("LabelSwitch"); | |||
| inline const PrimitivePtr kPrimLabelSet = std::make_shared<Primitive>("LabelSet"); | |||
| // Other miscellaneous | |||
| inline const PrimitivePtr kPrimGetRefOrigin = std::make_shared<Primitive>("get_ref_origin"); | |||
| inline const PrimitivePtr kPrimInsertGradientOf = std::make_shared<Primitive>("InsertGradientOf"); | |||
| inline const PrimitivePtr kPrimCheckBprop = std::make_shared<Primitive>("CheckBprop"); | |||
| inline const PrimitivePtr kPrimMixedPrecisionCast = std::make_shared<Primitive>("mixed_precision_cast"); | |||
| inline const PrimitivePtr kPrimMakeRecord = std::make_shared<Primitive>("make_record"); | |||
| // Structures | |||
| inline const PrimitivePtr kPrimListMap = std::make_shared<Primitive>("list_map"); | |||
| inline const PrimitivePtr kPrimListReduce = std::make_shared<Primitive>("list_reduce"); | |||
| inline const PrimitivePtr kPrimTupleReversed = std::make_shared<Primitive>("tuple_reversed"); | |||
| inline const PrimitivePtr kPrimReducedShape = std::make_shared<Primitive>("reduced_shape"); | |||
| inline const PrimitivePtr kPrimTupleDiv = std::make_shared<Primitive>("tuple_div"); | |||
| inline const PrimitivePtr kPrimTupleToArray = std::make_shared<Primitive>("tuple_to_array"); | |||
| inline const PrimitivePtr kPrimShapeMul = std::make_shared<Primitive>("shape_mul"); | |||
| inline const PrimitivePtr kPrimTupleEqual = std::make_shared<Primitive>("tuple_equal"); | |||
| inline const PrimitivePtr kPrimListEqual = std::make_shared<Primitive>("list_equal"); | |||
| inline const PrimitivePtr kPrimMakeRange = std::make_shared<Primitive>("make_range"); | |||
| inline const PrimitivePtr kPrimStopGradient = std::make_shared<Primitive>("stop_gradient"); | |||
| inline const PrimitivePtr kPrimStringEqual = std::make_shared<Primitive>("string_equal"); | |||
| inline const PrimitivePtr kPrimStringConcat = std::make_shared<Primitive>("string_concat"); | |||
| inline const PrimitivePtr kPrimDictLen = std::make_shared<Primitive>("dict_len"); | |||
| // Arrays | |||
| inline const PrimitivePtr kPrimScalarToArray = std::make_shared<Primitive>("scalar_to_array"); | |||
| inline const PrimitivePtr kPrimArrayToScalar = std::make_shared<Primitive>("array_to_scalar"); | |||
| inline const PrimitivePtr kPrimBroadcastShape = std::make_shared<Primitive>("broadcast_shape"); | |||
| inline const PrimitivePtr kPrimArrayMap = std::make_shared<Primitive>("array_map"); | |||
| inline const PrimitivePtr kPrimArrayReduce = std::make_shared<Primitive>("array_reduce"); | |||
| inline const PrimitivePtr kPrimCast = std::make_shared<Primitive>("Cast"); | |||
| inline const PrimitivePtr kPrimConcat = std::make_shared<Primitive>("Concat"); | |||
| inline const PrimitivePtr kPrimSqueeze = std::make_shared<Primitive>("Squeeze"); | |||
| inline const PrimitivePtr kPrimTranspose = std::make_shared<Primitive>("Transpose"); | |||
| inline const PrimitivePtr kPrimGatherV2 = std::make_shared<Primitive>("GatherV2"); | |||
| inline const PrimitivePtr kPrimEmbeddingLookup = std::make_shared<Primitive>("EmbeddingLookup"); | |||
| inline const PrimitivePtr kPrimEmbeddingLookupCommGrad = std::make_shared<Primitive>("EmbeddingLookupCommGrad"); | |||
| inline const PrimitivePtr kPrimSize = std::make_shared<Primitive>("Size"); | |||
| inline const PrimitivePtr kPrimArgMax = std::make_shared<Primitive>("Argmax"); | |||
| inline const PrimitivePtr kPrimPack = std::make_shared<Primitive>("Pack"); | |||
| inline const PrimitivePtr kPrimUnsortedSegmentSum = std::make_shared<Primitive>("UnsortedSegmentSum"); | |||
| inline const PrimitivePtr kPrimUnsortedSegmentMin = std::make_shared<Primitive>("UnsortedSegmentMin"); | |||
| inline const PrimitivePtr kPrimConcatOffset = std::make_shared<Primitive>("ConcatOffset"); | |||
| inline const PrimitivePtr kPrimReshape = std::make_shared<Primitive>("Reshape"); | |||
| inline const PrimitivePtr kPrimTile = std::make_shared<Primitive>("Tile"); | |||
| inline const PrimitivePtr kPrimAddN = std::make_shared<Primitive>("AddN"); | |||
| inline const PrimitivePtr KPrimTransData = std::make_shared<Primitive>("TransData"); | |||
| inline const PrimitivePtr kPrimNMSWithMask = std::make_shared<Primitive>("NMSWithMask"); | |||
| inline const PrimitivePtr kPrimPad = std::make_shared<Primitive>("Pad"); | |||
| inline const PrimitivePtr kPrimArgMaxWithValue = std::make_shared<Primitive>("ArgMaxWithValue"); | |||
| inline const PrimitivePtr kPrimUnique = std::make_shared<Primitive>("Unique"); | |||
| inline const PrimitivePtr kPrimUniqueGrad = std::make_shared<Primitive>("UniqueGrad"); | |||
| // NN | |||
| inline const PrimitivePtr kPrimFlatten = std::make_shared<Primitive>("Flatten"); | |||
| inline const PrimitivePtr kPrimSoftmax = std::make_shared<Primitive>("Softmax"); | |||
| inline const PrimitivePtr kPrimLogSoftmax = std::make_shared<Primitive>("LogSoftmax"); | |||
| inline const PrimitivePtr kPrimLogSoftmaxGrad = std::make_shared<Primitive>("LogSoftmaxGrad"); | |||
| inline const PrimitivePtr kPrimTanh = std::make_shared<Primitive>("Tanh"); | |||
| inline const PrimitivePtr kPrimTanhGrad = std::make_shared<Primitive>("TanhGrad"); | |||
| inline const PrimitivePtr kPrimPooling = std::make_shared<Primitive>("Pooling"); | |||
| inline const PrimitivePtr kPrimPoolingGrad = std::make_shared<Primitive>("PoolingGrad"); | |||
| inline const PrimitivePtr kPrimMaxPool = std::make_shared<Primitive>("MaxPool"); | |||
| inline const PrimitivePtr kPrimMaxPoolGrad = std::make_shared<Primitive>("MaxPoolGrad"); | |||
| inline const PrimitivePtr kPrimApplyCenteredRMSProp = std::make_shared<Primitive>("ApplyCenteredRMSProp"); | |||
| inline const PrimitivePtr kPrimAvgPoolGrad = std::make_shared<Primitive>("AvgPoolGrad"); | |||
| inline const PrimitivePtr kPrimAvgPoolGradVm = std::make_shared<Primitive>("AvgPoolGradVm"); | |||
| inline const PrimitivePtr kPrimFusedBatchNorm = std::make_shared<Primitive>("FusedBatchNorm"); | |||
| inline const PrimitivePtr kPrimConv2D = std::make_shared<Primitive>("Conv2D"); | |||
| inline const PrimitivePtr kPrimFusedBatchNormGrad = std::make_shared<Primitive>("FusedBatchNormGrad"); | |||
| inline const PrimitivePtr kPrimBatchNorm = std::make_shared<Primitive>("BatchNorm"); | |||
| inline const PrimitivePtr kPrimBatchNormGrad = std::make_shared<Primitive>("BatchNormGrad"); | |||
| inline const PrimitivePtr kPrimReluGrad = std::make_shared<Primitive>("ReluGrad"); | |||
| inline const PrimitivePtr kPrimConv2DBackpropInput = std::make_shared<Primitive>("Conv2DBackpropInput"); | |||
| inline const PrimitivePtr kPrimConv2DBackpropFilter = std::make_shared<Primitive>("Conv2DBackpropFilter"); | |||
| inline const PrimitivePtr kPrimDepthwiseConv2dNative = std::make_shared<Primitive>("DepthwiseConv2dNative"); | |||
| inline const PrimitivePtr kPrimDepthwiseConv2dNativeBackpropFilter = | |||
| std::make_shared<Primitive>("DepthwiseConv2dNativeBackpropFilter"); | |||
| inline const PrimitivePtr kPrimDepthwiseConv2dNativeBackpropInput = | |||
| std::make_shared<Primitive>("DepthwiseConv2dNativeBackpropInput"); | |||
| inline const PrimitivePtr kPrimBiasAddGrad = std::make_shared<Primitive>("BiasAddGrad"); | |||
| inline const PrimitivePtr kPrimSoftmaxCrossEntropyWithLogits = | |||
| std::make_shared<Primitive>("SoftmaxCrossEntropyWithLogits"); | |||
| inline const PrimitivePtr kPrimSparseSoftmaxCrossEntropyWithLogits = | |||
| std::make_shared<Primitive>("SparseSoftmaxCrossEntropyWithLogits"); | |||
| inline const PrimitivePtr kPrimMomentum = std::make_shared<Primitive>("Momentum"); | |||
| inline const PrimitivePtr kPrimApplyMomentum = std::make_shared<Primitive>("ApplyMomentum"); | |||
| inline const PrimitivePtr kPrimLayerNorm = std::make_shared<Primitive>("LayerNorm"); | |||
| inline const PrimitivePtr kPrimLayerNormGrad = std::make_shared<Primitive>("LayerNormGrad"); | |||
| inline const PrimitivePtr kPrimLayerNormXBackprop = std::make_shared<Primitive>("LayerNormXBackprop"); | |||
| inline const PrimitivePtr kPrimLayerNormBetaGammaBackprop = std::make_shared<Primitive>("LayerNormBetaGammaBackprop"); | |||
| inline const PrimitivePtr kPrimDropoutGenMask = std::make_shared<Primitive>("DropoutGenMask"); | |||
| inline const PrimitivePtr kPrimDropoutDoMask = std::make_shared<Primitive>("DropoutDoMask"); | |||
| inline const PrimitivePtr kPrimOneHot = std::make_shared<Primitive>("OneHot"); | |||
| inline const PrimitivePtr kPrimGelu = std::make_shared<Primitive>("Gelu"); | |||
| inline const PrimitivePtr kPrimGeluGrad = std::make_shared<Primitive>("GeluGrad"); | |||
| inline const PrimitivePtr kPrimRelu = std::make_shared<Primitive>("ReLU"); | |||
| inline const PrimitivePtr kPrimReluV2 = std::make_shared<Primitive>("ReLUV2"); | |||
| inline const PrimitivePtr kPrimZerosLike = std::make_shared<Primitive>("ZerosLike"); | |||
| inline const PrimitivePtr kPrimFakeBprop = std::make_shared<Primitive>("fake_bprop"); | |||
| inline const PrimitivePtr kPrimBpropCut = std::make_shared<Primitive>("bprop_cut"); | |||
| inline const PrimitivePtr kPrimFakeQuantPerLayer = std::make_shared<Primitive>("FakeQuantPerLayer"); | |||
| inline const PrimitivePtr kPrimFakeQuantPerChannel = std::make_shared<Primitive>("FakeQuantPerChannel"); | |||
| inline const PrimitivePtr kPrimApplyRMSProp = std::make_shared<Primitive>("ApplyRMSProp"); | |||
| // Comm ops | |||
| inline const PrimitivePtr kPrimMirror = std::make_shared<Primitive>("_MirrorOperator"); | |||
| inline const PrimitivePtr kPrimVirtualDiv = std::make_shared<Primitive>("_VirtualDiv"); | |||
| inline const PrimitivePtr kPrimVirtualDataset = std::make_shared<Primitive>("_VirtualDataset"); | |||
| inline const PrimitivePtr kPrimAllReduce = std::make_shared<Primitive>("AllReduce"); | |||
| // RowTensor | |||
| inline const PrimitivePtr kPrimMakeRowTensor = std::make_shared<Primitive>("MakeRowTensor"); | |||
| inline const PrimitivePtr kPrimRowTensorGetValues = std::make_shared<Primitive>("RowTensorGetValues"); | |||
| inline const PrimitivePtr kPrimRowTensorGetIndices = std::make_shared<Primitive>("RowTensorGetIndices"); | |||
| inline const PrimitivePtr kPrimRowTensorGetDenseShape = std::make_shared<Primitive>("RowTensorGetDenseShape"); | |||
| // SparseTensor | |||
| inline const PrimitivePtr kPrimMakeSparseTensor = std::make_shared<Primitive>("MakeSparseTensor"); | |||
| inline const PrimitivePtr kPrimSparseTensorGetValues = std::make_shared<Primitive>("SparseTensorGetValues"); | |||
| inline const PrimitivePtr kPrimSparseTensorGetIndices = std::make_shared<Primitive>("SparseTensorGetIndices"); | |||
| inline const PrimitivePtr kPrimSparseTensorGetDenseShape = std::make_shared<Primitive>("SparseTensorGetDenseShape"); | |||
| inline const PrimitivePtr kPrimBroadcastGradientArgs = std::make_shared<Primitive>("BroadcastGradientArgs"); | |||
| class UnpackGraphPrimitive : public Primitive { | |||
| public: | |||
| @@ -1,6 +1,4 @@ | |||
| /** | |||
| * This is the C++ adaptation and derivative work of Myia (https://github.com/mila-iqia/myia/). | |||
| * | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| @@ -15,360 +13,266 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "frontend/operator/ops_front_infer_function.h" | |||
| #include <set> | |||
| #include <string> | |||
| #include <vector> | |||
| #include <memory> | |||
| #include <algorithm> | |||
| #include "abstract/abstract_value.h" | |||
| #include "pipeline/jit/static_analysis/prim.h" | |||
| #include "abstract/utils.h" | |||
| #include "abstract/param_validator.h" | |||
| #include "frontend/operator/ops.h" | |||
| #include "utils/convert_utils.h" | |||
| #include "utils/tensor_py.h" | |||
| using mindspore::tensor::TensorPy; | |||
| #include "frontend/operator/ops.h" | |||
| #include "abstract/infer_functions.h" | |||
| namespace mindspore { | |||
| namespace abstract { | |||
| enum State { | |||
| SAME, | |||
| X_ONE, | |||
| Y_ONE, | |||
| }; | |||
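| // Each State value describes one dimension when the two shapes are compared right-aligned: | |||
| // SAME means the dims match, X_ONE means x's dim is 1 (x was broadcast there), and Y_ONE means | |||
| // y's dim is 1 (y was broadcast there); see ComputeReduceIndex below. | |||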
| AbstractBasePtr InferImplStringEqual(const AnalysisEnginePtr &, const PrimitivePtr &primitive, | |||
| const AbstractBasePtrList &args_spec_list) { | |||
| // Inputs: two scalars whose value is a string. | |||
| const std::string op_name = primitive->name(); | |||
| CheckArgsSize(op_name, args_spec_list, 2); | |||
| AbstractScalarPtr scalar_x = CheckArg<AbstractScalar>(op_name, args_spec_list, 0); | |||
| AbstractScalarPtr scalar_y = CheckArg<AbstractScalar>(op_name, args_spec_list, 1); | |||
| ValuePtr value_x = scalar_x->BuildValue(); | |||
| ValuePtr value_y = scalar_y->BuildValue(); | |||
| if (!value_x->isa<StringImm>() || !value_y->isa<StringImm>()) { | |||
| MS_LOG(EXCEPTION) << op_name << " requires 2 parameters are string, but got param0: " << value_x->ToString() | |||
| << ", param1: " << value_y->ToString(); | |||
| } | |||
| bool ret = (value_x->cast<StringImmPtr>()->value() == value_y->cast<StringImmPtr>()->value()); | |||
| return std::make_shared<AbstractScalar>(ret); | |||
| } | |||
| struct SlideInfo { | |||
| int start; | |||
| int step; | |||
| int stop; | |||
| }; | |||
| AbstractBasePtr InferImplStringConcat(const AnalysisEnginePtr &, const PrimitivePtr &primitive, | |||
| const AbstractBasePtrList &args_spec_list) { | |||
| // Inputs: two scalars whose value is a string. | |||
| const std::string op_name = primitive->name(); | |||
| template <typename T> | |||
| AbstractBasePtr InferImplTupleOrListEqual(const std::string &op_name, const AbstractBasePtrList &args_spec_list) { | |||
| // Inputs: two tuples or two lists. | |||
| CheckArgsSize(op_name, args_spec_list, 2); | |||
| AbstractScalarPtr scalar_x = CheckArg<AbstractScalar>(op_name, args_spec_list, 0); | |||
| AbstractScalarPtr scalar_y = CheckArg<AbstractScalar>(op_name, args_spec_list, 1); | |||
| ValuePtr value_x = scalar_x->BuildValue(); | |||
| ValuePtr value_y = scalar_y->BuildValue(); | |||
| if (!value_x->isa<StringImm>() || !value_y->isa<StringImm>()) { | |||
| MS_LOG(EXCEPTION) << op_name << " requires 2 parameters are string, but got param0: " << value_x->ToString() | |||
| << ", param1: " << value_y->ToString(); | |||
| } | |||
| std::string ret = (value_x->cast<StringImmPtr>()->value() + value_y->cast<StringImmPtr>()->value()); | |||
| return std::make_shared<AbstractScalar>(ret); | |||
| } | |||
| AbstractBasePtr InferImplMakeTuple(const AnalysisEnginePtr &, const PrimitivePtr &, | |||
| const AbstractBasePtrList &args_spec_list) { | |||
| return std::make_shared<AbstractTuple>(args_spec_list); | |||
| } | |||
| auto input_x = CheckArg<T>(op_name, args_spec_list, 0); | |||
| auto input_y = CheckArg<T>(op_name, args_spec_list, 1); | |||
| AbstractBasePtr InferImplMakeList(const AnalysisEnginePtr &, const PrimitivePtr &, | |||
| const AbstractBasePtrList &args_spec_list) { | |||
| return std::make_shared<AbstractList>(args_spec_list); | |||
| ValuePtr x_value = input_x->BuildValue(); | |||
| ValuePtr y_value = input_y->BuildValue(); | |||
| return std::make_shared<AbstractScalar>(*x_value == *y_value); | |||
| } | |||
| AbstractBasePtr InferImplMakeDict(const AnalysisEnginePtr &, const PrimitivePtr &primitive, | |||
| const AbstractBasePtrList &args_spec_list) { | |||
| // Inputs: two tuples. | |||
| const std::string op_name = primitive->name(); | |||
| CheckArgsSize(op_name, args_spec_list, 2); | |||
| AbstractTuplePtr keys = CheckArg<AbstractTuple>(op_name, args_spec_list, 0); | |||
| AbstractTuplePtr values = CheckArg<AbstractTuple>(op_name, args_spec_list, 1); | |||
| size_t keys_size = keys->size(); | |||
| if (values->size() != keys_size) { | |||
| MS_LOG(EXCEPTION) << op_name << " evaluator keys' size is not equal with values' size"; | |||
| } | |||
| std::vector<AbstractAttribute> key_value; | |||
| AbstractScalarPtr key; | |||
| AbstractBasePtrList key_list = keys->elements(); | |||
| AbstractBasePtrList value_list = values->elements(); | |||
| for (size_t index = 0; index < keys_size; index++) { | |||
| key = CheckArg<AbstractScalar>(op_name + "key", key_list, index); | |||
| ValuePtr keyPtr = key->BuildValue(); | |||
| MS_EXCEPTION_IF_NULL(keyPtr); | |||
| if (!keyPtr->isa<StringImm>()) { | |||
| MS_LOG(EXCEPTION) << op_name << " evaluator keys should be string, but got " << keyPtr->ToString(); | |||
| void CalcSlidePara(const AbstractBasePtrList &args_spec_list, SlideInfo *slide) { | |||
| int arg1 = 0; | |||
| int arg2 = 0; | |||
| if (!args_spec_list.empty()) { | |||
| MS_EXCEPTION_IF_NULL(args_spec_list[0]); | |||
| auto arg_value = args_spec_list[0]->BuildValue(); | |||
| if (!arg_value->isa<Int32Imm>()) { | |||
| MS_LOG(EXCEPTION) << "Only supported input an int32 number."; | |||
| } | |||
| std::string key_string = GetValue<std::string>(keyPtr); | |||
| key_value.emplace_back(key_string, value_list[index]); | |||
| arg1 = GetValue<int>(arg_value); | |||
| } | |||
| return std::make_shared<AbstractDictionary>(key_value); | |||
| } | |||
| AbstractBasePtr InferImplMakeKwarg(const AnalysisEnginePtr &, const PrimitivePtr &primitive, | |||
| const AbstractBasePtrList &args_spec_list) { | |||
| // Inputs: a string and an object of a subclass of AbstractBase. | |||
| const std::string op_name = primitive->name(); | |||
| CheckArgsSize(op_name, args_spec_list, 2); | |||
| AbstractScalarPtr key = CheckArg<AbstractScalar>(op_name, args_spec_list, 0); | |||
| ValuePtr keyPtr = key->BuildValue(); | |||
| if (!keyPtr->isa<StringImm>()) { | |||
| MS_LOG(EXCEPTION) << op_name << " evaluator key should be string, but got " << keyPtr->ToString(); | |||
| if (args_spec_list.size() >= 2) { | |||
| MS_EXCEPTION_IF_NULL(args_spec_list[1]); | |||
| auto arg_value = args_spec_list[1]->BuildValue(); | |||
| if (!arg_value->isa<Int32Imm>()) { | |||
| MS_LOG(EXCEPTION) << "Only supported input an int32 number."; | |||
| } | |||
| arg2 = GetValue<int>(arg_value); | |||
| } | |||
| std::string key_string = GetValue<std::string>(keyPtr); | |||
| return std::make_shared<AbstractKeywordArg>(key_string, args_spec_list[1]); | |||
| } | |||
| AbstractBasePtr InferImplExtractKwarg(const AnalysisEnginePtr &, const PrimitivePtr &primitive, | |||
| const AbstractBasePtrList &args_spec_list) { | |||
| // Inputs: a string and a keyword. | |||
| const std::string op_name = primitive->name(); | |||
| CheckArgsSize(op_name, args_spec_list, 2); | |||
| AbstractScalarPtr key = CheckArg<AbstractScalar>(op_name, args_spec_list, 0); | |||
| AbstractKeywordArgPtr kwarg = CheckArg<AbstractKeywordArg>(op_name, args_spec_list, 1); | |||
| if (args_spec_list.size() == 3) { | |||
| MS_EXCEPTION_IF_NULL(args_spec_list[2]); | |||
| auto arg_value = args_spec_list[2]->BuildValue(); | |||
| if (!arg_value->isa<Int32Imm>()) { | |||
| MS_LOG(EXCEPTION) << "Only supported input an int32 number."; | |||
| } | |||
| slide->step = GetValue<int>(arg_value); | |||
| slide->start = arg1; | |||
| slide->stop = arg2; | |||
| } | |||
| ValuePtr key_value = key->BuildValue(); | |||
| if (!key_value->isa<StringImm>()) { | |||
| MS_LOG(EXCEPTION) << op_name << " evaluator key should be string, but got " << key_value->ToString(); | |||
| if (args_spec_list.size() == 2) { | |||
| slide->start = arg1; | |||
| slide->stop = arg2; | |||
| } | |||
| std::string key_input = GetValue<std::string>(key_value); | |||
| std::string key_actual = kwarg->get_key(); | |||
| if (key_actual != key_input) { | |||
| MS_LOG(EXCEPTION) << op_name << " evaluator input key should be same as AbstractKeywordArg' key, but input is " | |||
| << key_input << ", AbstractKeywordArg' key is " << key_actual; | |||
| if (args_spec_list.size() == 1) { | |||
| slide->stop = arg1; | |||
| } | |||
| return kwarg->get_arg(); | |||
| } | |||
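| // CalcSlidePara follows Python's range() convention: one argument is treated as stop, two as | |||
| // (start, stop), and three as (start, stop, step), so e.g. make_range(1, 7, 2) should describe | |||
| // the values 1, 3, 5. | |||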
| AbstractBasePtr InferImplMakeSlice(const AnalysisEnginePtr &, const PrimitivePtr &primitive, | |||
| const AbstractBasePtrList &args_spec_list) { | |||
| // Inputs: three scalars whose value is an int32 number. | |||
| CheckArgsSize(primitive->name(), args_spec_list, 3); | |||
| size_t args_size = args_spec_list.size(); | |||
| for (size_t index = 0; index < args_size; index++) { | |||
| MS_EXCEPTION_IF_NULL(args_spec_list[index]); | |||
| if (!args_spec_list[index]->isa<AbstractScalar>() && !args_spec_list[index]->isa<AbstractNone>()) { | |||
| MS_EXCEPTION(TypeError) << "MakeSlice eval " << index << " parameter is neither AbstractScalar nor AbstractNone."; | |||
| void ComputeReduceIndex(const std::vector<int> &reverse_x, const std::vector<int> &reverse_y, | |||
| std::vector<int> *grad_x_reduce_idx, std::vector<int> *grad_y_reduce_idy) { | |||
| const size_t n = reverse_x.size(); | |||
| for (size_t i = 0; i < n; ++i) { | |||
| State curr; | |||
| const int32_t x_i = reverse_x[i]; | |||
| const int32_t y_i = reverse_y[i]; | |||
| const int reduce_idx = SizeToInt(n - 1 - i); | |||
| if (x_i == y_i) { | |||
| curr = SAME; | |||
| } else if (x_i == 1) { | |||
| grad_x_reduce_idx->push_back(reduce_idx); | |||
| curr = X_ONE; | |||
| } else if (y_i == 1) { | |||
| grad_y_reduce_idy->push_back(reduce_idx); | |||
| curr = Y_ONE; | |||
| } else { | |||
| MS_LOG(EXCEPTION) << "not compatible shape input for BroadcastGradientArgs"; | |||
| } | |||
| if (args_spec_list[index]->isa<AbstractScalar>() && | |||
| !dyn_cast<AbstractScalar>(args_spec_list[index])->BuildValue()->isa<Int32Imm>()) { | |||
| MS_EXCEPTION(TypeError) << "MakeSlice eval " << index | |||
| << " parameter is an AbstractScalar, but is not an int32 number."; | |||
| if (curr == SAME && x_i == 1) { | |||
| grad_x_reduce_idx->push_back(reduce_idx); | |||
| grad_y_reduce_idy->push_back(reduce_idx); | |||
| continue; | |||
| } | |||
| } | |||
| // Slice: start, end, step | |||
| return std::make_shared<AbstractSlice>(args_spec_list[0], args_spec_list[1], args_spec_list[2]); | |||
| std::reverse(grad_x_reduce_idx->begin(), grad_x_reduce_idx->end()); | |||
| std::reverse(grad_y_reduce_idy->begin(), grad_y_reduce_idy->end()); | |||
| } | |||
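| // Rough trace: for reverse_x = (1, 6, 1, 8) and reverse_y = (5, 1, 7, 1), x holds a 1 at | |||
| // reversed positions 0 and 2 and y at positions 1 and 3, so after mapping back to forward | |||
| // order the result is grad_x_reduce_idx = (1, 3) and grad_y_reduce_idy = (0, 2). | |||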
| // Eval the return type of make_record | |||
| AbstractBasePtr InferImplMakeRecord(const AnalysisEnginePtr &, const PrimitivePtr &primitive, | |||
| const AbstractBasePtrList &args_spec_list) { | |||
| // Inputs: at least two objects of a subclass of AbstractBase. | |||
| if (args_spec_list.size() < 2) { | |||
| MS_LOG(EXCEPTION) << "Typeof evaluator requires more than 1 parameter, while the input size is " | |||
| << args_spec_list.size() << "."; | |||
| } | |||
| AbstractBasePtr BroadcastGradientArgsDiff(const std::vector<ValuePtr> &x_shape, const std::vector<ValuePtr> &y_shape) { | |||
| std::vector<int> reverse_x; | |||
| std::vector<int> reverse_y; | |||
| // args_spec_list[0] may be an AbstractScalarPtr or an AbstractTypePtr | |||
| MS_EXCEPTION_IF_NULL(args_spec_list[0]); | |||
| TypePtr type = args_spec_list[0]->GetTypeTrack(); | |||
| MS_EXCEPTION_IF_NULL(type); | |||
| if (type->type_id() != kMetaTypeTypeType) { | |||
| MS_LOG(EXCEPTION) << "Can not make type(" << type->ToString() << ")not TypeType"; | |||
| } | |||
| (void)std::transform(x_shape.rbegin(), x_shape.rend(), std::back_inserter(reverse_x), | |||
| [](const ValuePtr &v) { return v->cast<Int32ImmPtr>()->value(); }); | |||
| (void)std::transform(y_shape.rbegin(), y_shape.rend(), std::back_inserter(reverse_y), | |||
| [](const ValuePtr &v) { return v->cast<Int32ImmPtr>()->value(); }); | |||
| ValuePtr value_track = args_spec_list[0]->GetValueTrack(); | |||
| MS_EXCEPTION_IF_NULL(value_track); | |||
| TypePtr type_ptr = value_track->cast<TypePtr>(); | |||
| if (type_ptr == nullptr) { | |||
| MS_LOG(EXCEPTION) << "Value type error, not Me type:" << value_track->ToString(); | |||
| if (reverse_x.size() > reverse_y.size()) { | |||
| reverse_y.resize(reverse_x.size(), 1); | |||
| } else { | |||
| reverse_x.resize(reverse_y.size(), 1); | |||
| } | |||
| auto cls = dyn_cast<Class>(type_ptr); | |||
| MS_EXCEPTION_IF_NULL(cls); | |||
| ClassAttrVector attributes = cls->GetAttributes(); | |||
| CheckArgsSize(primitive->name(), args_spec_list, attributes.size() + 1); | |||
| std::vector<int> grad_x_reduce_idx; | |||
| std::vector<int> grad_y_reduce_idy; | |||
| ComputeReduceIndex(reverse_x, reverse_y, &grad_x_reduce_idx, &grad_y_reduce_idy); | |||
| std::vector<AbstractAttribute> abs_attributes; | |||
| for (size_t i = 0; i < attributes.size(); i++) { | |||
| AbstractAttribute elem(attributes[i].first, args_spec_list[i + 1]); | |||
| abs_attributes.push_back(elem); | |||
| } | |||
| AbstractBasePtrList abs_list_x; | |||
| AbstractBasePtrList abs_list_y; | |||
| (void)std::transform(grad_x_reduce_idx.begin(), grad_x_reduce_idx.end(), std::back_inserter(abs_list_x), | |||
| [](int v) { return abstract::FromValue(v); }); | |||
| (void)std::transform(grad_y_reduce_idy.begin(), grad_y_reduce_idy.end(), std::back_inserter(abs_list_y), | |||
| [](int v) { return abstract::FromValue(v); }); | |||
| auto x_reduce_idx = std::make_shared<AbstractTuple>(abs_list_x); | |||
| auto y_reduce_idx = std::make_shared<AbstractTuple>(abs_list_y); | |||
| AbstractBasePtrList elem_list; | |||
| elem_list.push_back(x_reduce_idx); | |||
| elem_list.push_back(y_reduce_idx); | |||
| return std::make_shared<AbstractClass>(cls->tag(), abs_attributes, cls->methods()); | |||
| return std::make_shared<AbstractTuple>(elem_list); | |||
| } | |||
| template <typename T> | |||
| AbstractBasePtr InferTupleOrListGetItem(const std::string &op_name, const AbstractBasePtrList &args_spec_list) { | |||
| // Inputs: a tuple or list and a scalar whose value is an int32 number. | |||
| CheckArgsSize(op_name, args_spec_list, 2); | |||
| auto queue = CheckArg<T>(op_name, args_spec_list, 0); | |||
| AbstractScalarPtr index = CheckArg<AbstractScalar>(op_name, args_spec_list, 1); | |||
| ValuePtr index_value = index->BuildValue(); | |||
| if (!index_value->isa<Int32Imm>()) { | |||
| // When index_value is an AnyValue and the first element of the sequence is a scalar, return | |||
| // that element's type instead of raising, and continue. | |||
| if (dyn_cast<AbstractScalar>(queue->elements()[0]) != nullptr) { | |||
| return std::make_shared<AbstractScalar>(queue->elements()[0]->BuildType()); | |||
| } | |||
| MS_EXCEPTION(IndexError) << op_name << " evaluator index should be an int32 number, but got " | |||
| << index_value->ToString(); | |||
| } | |||
| int idx_v = GetValue<int>(index_value); | |||
| std::size_t nelems = queue->elements().size(); | |||
| if (idx_v >= SizeToInt(nelems) || idx_v < -SizeToInt(nelems)) { | |||
| MS_EXCEPTION(IndexError) << op_name << " evaluator index should be in range[-" << SizeToInt(nelems) << ", " | |||
| << SizeToInt(nelems) << "), but got " << idx_v << "."; | |||
| } | |||
| std::size_t uidx_v = 0; | |||
| if (idx_v >= 0) { | |||
| uidx_v = IntToSize(idx_v); | |||
| } else { | |||
| uidx_v = IntToSize(idx_v + SizeToInt(nelems)); | |||
| AbstractBasePtr InferImplTypeof(const AnalysisEnginePtr &, const PrimitivePtr &, | |||
| const AbstractBasePtrList &args_spec_list) { | |||
| // Inputs: a pointer to an AbstractBase object | |||
| if (args_spec_list.size() != 1) { | |||
| MS_LOG(EXCEPTION) << "Typeof evaluator requires 1 parameter, while the input size is " << args_spec_list.size() | |||
| << "."; | |||
| } | |||
| return queue->elements()[uidx_v]; | |||
| AbstractBasePtr abs_base = args_spec_list[0]; | |||
| MS_EXCEPTION_IF_NULL(abs_base); | |||
| TypePtr type = abs_base->BuildType(); | |||
| return std::make_shared<AbstractType>(type); | |||
| } | |||
| template <typename T> | |||
| AbstractBasePtr InferTupleOrListSetItem(const std::string &op_name, const AbstractBasePtrList &args_spec_list) { | |||
| // Inputs: a tuple or list, a scalar whose value is an int32 number and an object of a subclass of AbstractBase. | |||
| CheckArgsSize(op_name, args_spec_list, 3); | |||
| auto queue = CheckArg<T>(op_name, args_spec_list, 0); | |||
| AbstractScalarPtr index = CheckArg<AbstractScalar>(op_name, args_spec_list, 1); | |||
| ValuePtr index_value = index->BuildValue(); | |||
| if (!index_value->isa<Int32Imm>()) { | |||
| MS_EXCEPTION(IndexError) << op_name << " evaluator index should be an int32 number, but got " | |||
| << index_value->ToString(); | |||
| } | |||
| int idx_v = GetValue<int>(index_value); | |||
| if (idx_v < 0) { | |||
| MS_EXCEPTION(IndexError) << "The index of " << typeid(T).name() << " should be positive number, but got " << idx_v | |||
| << "."; | |||
| } | |||
| AbstractBasePtr InferImplHasType(const AnalysisEnginePtr &, const PrimitivePtr &primitive, | |||
| const AbstractBasePtrList &args_spec_list) { | |||
| // Inputs: a pointer to an AbstractBase object and a pointer to a Type | |||
| const std::string op_name = primitive->name(); | |||
| CheckArgsSize(op_name, args_spec_list, 2); | |||
| AbstractTypePtr abs_type = CheckArg<AbstractType>(op_name, args_spec_list, 1); | |||
| size_t uidx_v = IntToSize(idx_v); | |||
| AbstractBasePtrList elements = queue->elements(); | |||
| std::size_t nelems = elements.size(); | |||
| if (uidx_v >= nelems) { | |||
| MS_EXCEPTION(IndexError) << op_name << " evaluator the index: " << uidx_v << " to set out of range: " << nelems - 1 | |||
| << "."; | |||
| auto mode_v = abs_type->GetValueTrack(); | |||
| MS_EXCEPTION_IF_NULL(mode_v); | |||
| if (!mode_v->isa<Type>()) { | |||
| MS_LOG(EXCEPTION) << "Get the type from AbstractType value failed."; | |||
| } | |||
| elements[uidx_v] = args_spec_list[2]; | |||
| return std::make_shared<T>(elements); | |||
| } | |||
| AbstractBasePtr InferImplTupleGetItem(const AnalysisEnginePtr &, const PrimitivePtr &primitive, | |||
| const AbstractBasePtrList &args_spec_list) { | |||
| return InferTupleOrListGetItem<AbstractTuple>(primitive->name(), args_spec_list); | |||
| TypePtr mode_t = mode_v->cast<TypePtr>(); | |||
| MS_EXCEPTION_IF_NULL(args_spec_list[0]); | |||
| bool v = IsSubtype(args_spec_list[0], mode_t); | |||
| return std::make_shared<AbstractScalar>(std::make_shared<BoolImm>(v), kBool); | |||
| } | |||
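| // For example, hastype(x, t) folds to a compile-time boolean that is true when x's inferred | |||
| // abstract type is a subtype of t, e.g. an int32 scalar checked against kInt32. | |||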
| AbstractBasePtr InferImplListGetItem(const AnalysisEnginePtr &, const PrimitivePtr &primitive, | |||
| const AbstractBasePtrList &args_spec_list) { | |||
| return InferTupleOrListGetItem<AbstractList>(primitive->name(), args_spec_list); | |||
| } | |||
| bool CompareShape(const std::vector<ValuePtr> &x_shape, const std::vector<ValuePtr> &y_shape) { | |||
| if (x_shape.size() != y_shape.size()) { | |||
| return false; | |||
| } | |||
| AbstractBasePtr InferImplTupleSetItem(const AnalysisEnginePtr &, const PrimitivePtr &primitive, | |||
| const AbstractBasePtrList &args_spec_list) { | |||
| return InferTupleOrListSetItem<AbstractTuple>(primitive->name(), args_spec_list); | |||
| } | |||
| for (size_t i = 0; i < x_shape.size(); ++i) { | |||
| if (GetValue<int>(x_shape[i]) != GetValue<int>(y_shape[i])) { | |||
| return false; | |||
| } | |||
| } | |||
| AbstractBasePtr InferImplListSetItem(const AnalysisEnginePtr &, const PrimitivePtr &primitive, | |||
| const AbstractBasePtrList &args_spec_list) { | |||
| return InferTupleOrListSetItem<AbstractList>(primitive->name(), args_spec_list); | |||
| return true; | |||
| } | |||
| AbstractBasePtr InferImplDictGetItem(const AnalysisEnginePtr &, const PrimitivePtr &primitive, | |||
| const AbstractBasePtrList &args_spec_list) { | |||
| // Inputs: a dict and a scalar whose value is a string. | |||
| const std::string op_name = primitive->name(); | |||
| CheckArgsSize(op_name, args_spec_list, 2); | |||
| AbstractDictionaryPtr dict = CheckArg<AbstractDictionary>(op_name, args_spec_list, 0); | |||
| AbstractScalarPtr key = CheckArg<AbstractScalar>(op_name, args_spec_list, 1); | |||
| ValuePtr key_value = key->BuildValue(); | |||
| if (!key_value->isa<StringImm>()) { | |||
| MS_LOG(EXCEPTION) << op_name << " evaluator key should be string, but got " << key_value->ToString(); | |||
| AbstractBasePtr DoInferReduceShape(const AbstractTuplePtr &x_shape, const ValuePtr &x_shp_value, | |||
| const ValueTuplePtr &axis_value_ptr, const PrimitivePtr &primitive) { | |||
| size_t x_rank = x_shape->size(); | |||
| std::set<int> axis_set; | |||
| auto axis_data = axis_value_ptr->value(); | |||
| if (axis_data.empty()) { | |||
| int size = 1; | |||
| AbstractBasePtrList values(x_rank, std::make_shared<AbstractScalar>(size)); | |||
| return std::make_shared<AbstractTuple>(values); | |||
| } | |||
| auto key_str = GetValue<std::string>(key_value); | |||
| std::vector<AbstractAttribute> dict_elems = dict->elements(); | |||
| auto it = std::find_if(dict_elems.begin(), dict_elems.end(), | |||
| [key_str](const AbstractAttribute &item) { return item.first == key_str; }); | |||
| if (it == dict_elems.end()) { | |||
| MS_LOG(EXCEPTION) << "The key " << key_str << " does not exist in the dict:" << args_spec_list[0]->ToString(); | |||
| for (auto &elem : axis_data) { | |||
| int e_value = CheckAxis(primitive->name(), elem, -SizeToInt(x_rank), SizeToInt(x_rank) - 1); | |||
| (void)axis_set.insert(e_value); | |||
| } | |||
| return it->second; | |||
| } | |||
| AbstractBasePtr InferImplDictSetItem(const AnalysisEnginePtr &, const PrimitivePtr &primitive, | |||
| const AbstractBasePtrList &args_spec_list) { | |||
| // Inputs: a dict and a scalar whose value is a string and an object of a subclass of AbstractBase. | |||
| const std::string op_name = primitive->name(); | |||
| CheckArgsSize(op_name, args_spec_list, 3); | |||
| AbstractDictionaryPtr dict = CheckArg<AbstractDictionary>(op_name, args_spec_list, 0); | |||
| AbstractScalarPtr key = CheckArg<AbstractScalar>(op_name, args_spec_list, 1); | |||
| ValuePtr key_value = key->BuildValue(); | |||
| if (!key_value->isa<StringImm>()) { | |||
| MS_LOG(EXCEPTION) << op_name << " evaluator key should be string, but got " << key_value->ToString(); | |||
| } | |||
| std::string key_str = GetValue<std::string>(key_value); | |||
| std::vector<AbstractAttribute> dict_elems = dict->elements(); | |||
| auto it = std::find_if(dict_elems.begin(), dict_elems.end(), | |||
| [key_str](AbstractAttribute &item) { return item.first == key_str; }); | |||
| MS_EXCEPTION_IF_NULL(args_spec_list[2]); | |||
| auto new_ele = std::make_pair(key_str, args_spec_list[2]); | |||
| if (it != dict_elems.end()) { | |||
| int index = it - dict_elems.begin(); | |||
| dict_elems[IntToSize(index)] = new_ele; | |||
| } else { | |||
| dict_elems.push_back(new_ele); | |||
| auto x_shp_data = x_shp_value->cast<ValueTuplePtr>()->value(); | |||
| if (x_shp_data.size() < x_rank) { | |||
| MS_LOG(EXCEPTION) << "x_shape_data.size() " << x_shp_data.size() << " less than x_shape.size() " << x_rank; | |||
| } | |||
| AbstractBasePtrList values; | |||
| for (size_t i = 0; i < x_rank; i++) { | |||
| if (axis_set.count(SizeToInt(i)) || axis_set.count(SizeToInt(i) - SizeToInt(x_rank))) { | |||
| auto axis_v = MakeValue(1); | |||
| values.push_back(std::make_shared<AbstractScalar>(axis_v, axis_v->type())); | |||
| } else { | |||
| int dim_value = x_shp_data[i]->cast<Int32ImmPtr>()->value(); | |||
| auto dim = MakeValue(dim_value); | |||
| values.push_back(std::make_shared<AbstractScalar>(dim, dim->type())); | |||
| } | |||
| } | |||
| return std::make_shared<AbstractDictionary>(dict_elems); | |||
| return std::make_shared<AbstractTuple>(values); | |||
| } | |||
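| // Rough example: for x_shape = (2, 3, 4) and axis = (1,), the reduced shape is (2, 1, 4); an | |||
| // empty axis tuple reduces every dimension and yields (1, 1, 1). | |||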
| AbstractBasePtr InferImplListAppend(const AnalysisEnginePtr &, const PrimitivePtr &primitive, | |||
| const AbstractBasePtrList &args_spec_list) { | |||
| // Inputs: a list and an object of a subclass of AbstractBase. | |||
| AbstractBasePtr InferImplBroadcastGradientArgs(const AnalysisEnginePtr &, const PrimitivePtr &primitive, | |||
| const AbstractBasePtrList &args_spec_list) { | |||
| // This primitive gets the indices that need to be reduced. | |||
| // Input: x's shape and y's shape; both inputs should be tuples. | |||
| // Output: a tuple of x's and y's reduce indices; each reduce index is itself a tuple. | |||
| const std::string op_name = primitive->name(); | |||
| CheckArgsSize(op_name, args_spec_list, 2); | |||
| AbstractListPtr list = CheckArg<AbstractList>(op_name, args_spec_list, 0); | |||
| (void)AbstractJoin(list->elements()); | |||
| return list; | |||
| } | |||
| auto arg_x = CheckArg<AbstractTuple>(op_name, args_spec_list, 0); | |||
| auto arg_y = CheckArg<AbstractTuple>(op_name, args_spec_list, 1); | |||
| template <typename T> | |||
| AbstractBasePtr InferTupleOrListOrDictLen(const std::string &op_name, const AbstractBasePtrList &args_spec_list) { | |||
| // Inputs: a tuple or list or dict. | |||
| CheckArgsSize(op_name, args_spec_list, 1); | |||
| auto arg = CheckArg<T>(op_name, args_spec_list, 0); | |||
| return std::make_shared<AbstractScalar>(SizeToInt(arg->size())); | |||
| } | |||
| ValueTuplePtr arg_x_value = arg_x->BuildValue()->cast<ValueTuplePtr>(); | |||
| MS_EXCEPTION_IF_NULL(arg_x_value); | |||
| AbstractBasePtr InferImplTupleLen(const AnalysisEnginePtr &, const PrimitivePtr &primitive, | |||
| const AbstractBasePtrList &args_spec_list) { | |||
| return InferTupleOrListOrDictLen<AbstractTuple>(primitive->name(), args_spec_list); | |||
| } | |||
| ValueTuplePtr arg_y_value = arg_y->BuildValue()->cast<ValueTuplePtr>(); | |||
| MS_EXCEPTION_IF_NULL(arg_y_value); | |||
| AbstractBasePtr InferImplListLen(const AnalysisEnginePtr &, const PrimitivePtr &primitive, | |||
| const AbstractBasePtrList &args_spec_list) { | |||
| return InferTupleOrListOrDictLen<AbstractList>(primitive->name(), args_spec_list); | |||
| } | |||
| const std::vector<ValuePtr> x_shape = arg_x_value->value(); | |||
| const std::vector<ValuePtr> y_shape = arg_y_value->value(); | |||
| bool is_same_shape = CompareShape(x_shape, y_shape); | |||
| // If the shapes are identical, no reduction is needed; return empty tuples. | |||
| if (is_same_shape) { | |||
| AbstractBasePtrList empty_list; | |||
| auto x_reduce_idx = std::make_shared<AbstractTuple>(empty_list); | |||
| auto y_reduce_idx = std::make_shared<AbstractTuple>(empty_list); | |||
| AbstractBasePtr InferImplDictLen(const AnalysisEnginePtr &, const PrimitivePtr &primitive, | |||
| const AbstractBasePtrList &args_spec_list) { | |||
| return InferTupleOrListOrDictLen<AbstractDictionary>(primitive->name(), args_spec_list); | |||
| } | |||
| AbstractBasePtrList elem_list; | |||
| elem_list.push_back(x_reduce_idx); | |||
| elem_list.push_back(y_reduce_idx); | |||
| AbstractBasePtr InferImplArrayLen(const AnalysisEnginePtr &, const PrimitivePtr &, | |||
| const AbstractBasePtrList &args_spec_list) { | |||
| return std::make_shared<AbstractScalar>(kAnyValue, kInt32); | |||
| return std::make_shared<AbstractTuple>(elem_list); | |||
| } | |||
| return BroadcastGradientArgsDiff(x_shape, y_shape); | |||
| } | |||
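| // Rough end-to-end example: BroadcastGradientArgs((8, 1, 6, 1), (7, 1, 5)) should infer | |||
| // ((1, 3), (0, 2)), while identical shapes produce two empty tuples since nothing is reduced. | |||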
| AbstractBasePtr InferImplListMap(const AnalysisEnginePtr &engine, const PrimitivePtr &primitive, | |||
| @@ -430,41 +334,6 @@ AbstractBasePtr InferImplTupleReversed(const AnalysisEnginePtr &, const Primitiv | |||
| return std::make_shared<AbstractTuple>(elem_list); | |||
| } | |||
| AbstractBasePtr DoInferReduceShape(const AbstractTuplePtr &x_shape, const ValuePtr &x_shp_value, | |||
| const ValueTuplePtr &axis_value_ptr, const PrimitivePtr &primitive) { | |||
| size_t x_rank = x_shape->size(); | |||
| std::set<int> axis_set; | |||
| auto axis_data = axis_value_ptr->value(); | |||
| if (axis_data.empty()) { | |||
| int size = 1; | |||
| AbstractBasePtrList values(x_rank, std::make_shared<AbstractScalar>(size)); | |||
| return std::make_shared<AbstractTuple>(values); | |||
| } | |||
| for (auto &elem : axis_data) { | |||
| int e_value = CheckAxis(primitive->name(), elem, -SizeToInt(x_rank), SizeToInt(x_rank) - 1); | |||
| (void)axis_set.insert(e_value); | |||
| } | |||
| auto x_shp_data = x_shp_value->cast<ValueTuplePtr>()->value(); | |||
| if (x_shp_data.size() < x_rank) { | |||
| MS_LOG(EXCEPTION) << "x_shape_data.size() " << x_shp_data.size() << " less than x_shape.size() " << x_rank; | |||
| } | |||
| AbstractBasePtrList values; | |||
| for (size_t i = 0; i < x_rank; i++) { | |||
| if (axis_set.count(SizeToInt(i)) || axis_set.count(SizeToInt(i) - SizeToInt(x_rank))) { | |||
| auto axis_v = MakeValue(1); | |||
| values.push_back(std::make_shared<AbstractScalar>(axis_v, axis_v->type())); | |||
| } else { | |||
| int dim_value = x_shp_data[i]->cast<Int32ImmPtr>()->value(); | |||
| auto dim = MakeValue(dim_value); | |||
| values.push_back(std::make_shared<AbstractScalar>(dim, dim->type())); | |||
| } | |||
| } | |||
| return std::make_shared<AbstractTuple>(values); | |||
| } | |||
| AbstractBasePtr InferImplReduceShape(const AnalysisEnginePtr &, const PrimitivePtr &primitive, | |||
| const AbstractBasePtrList &args_spec_list) { | |||
| // Inputs: x_shape, axis | |||
| @@ -563,7 +432,7 @@ AbstractBasePtr InferImplTuple2Array(const AnalysisEnginePtr &, const PrimitiveP | |||
| py::tuple data_tuple = ValuePtrToPyData(input->BuildValue()); | |||
| py::array data = py::array(data_tuple); | |||
| auto tensor = TensorPy::MakeTensor(data); | |||
| auto tensor = tensor::TensorPy::MakeTensor(data); | |||
| auto ret = tensor->ToAbstract(); | |||
| ret->set_value(tensor); | |||
| MS_LOG(DEBUG) << "Tuple2arry result AbstractTensor: " << ret->ToString(); | |||
| @@ -596,76 +465,6 @@ AbstractBasePtr InferImplShapeMul(const AnalysisEnginePtr &, const PrimitivePtr | |||
| return std::make_shared<AbstractScalar>(result_v, result_v->type()); | |||
| } | |||
| template <typename T> | |||
| AbstractBasePtr InferImplTupleOrListEqual(const std::string &op_name, const AbstractBasePtrList &args_spec_list) { | |||
| // Inputs: two tuples or two lists. | |||
| CheckArgsSize(op_name, args_spec_list, 2); | |||
| auto input_x = CheckArg<T>(op_name, args_spec_list, 0); | |||
| auto input_y = CheckArg<T>(op_name, args_spec_list, 1); | |||
| ValuePtr x_value = input_x->BuildValue(); | |||
| ValuePtr y_value = input_y->BuildValue(); | |||
| return std::make_shared<AbstractScalar>(*x_value == *y_value); | |||
| } | |||
| AbstractBasePtr InferImplTupleEqual(const AnalysisEnginePtr &, const PrimitivePtr &primitive, | |||
| const AbstractBasePtrList &args_spec_list) { | |||
| return InferImplTupleOrListEqual<AbstractTuple>(primitive->name(), args_spec_list); | |||
| } | |||
| AbstractBasePtr InferImplListEqual(const AnalysisEnginePtr &, const PrimitivePtr &primitive, | |||
| const AbstractBasePtrList &args_spec_list) { | |||
| return InferImplTupleOrListEqual<AbstractList>(primitive->name(), args_spec_list); | |||
| } | |||
| struct SlideInfo { | |||
| int start; | |||
| int step; | |||
| int stop; | |||
| }; | |||
| void CalcSlidePara(const AbstractBasePtrList &args_spec_list, SlideInfo *slide) { | |||
| int arg1 = 0; | |||
| int arg2 = 0; | |||
| if (!args_spec_list.empty()) { | |||
| MS_EXCEPTION_IF_NULL(args_spec_list[0]); | |||
| auto arg_value = args_spec_list[0]->BuildValue(); | |||
| if (!arg_value->isa<Int32Imm>()) { | |||
| MS_LOG(EXCEPTION) << "Only supported input an int32 number."; | |||
| } | |||
| arg1 = GetValue<int>(arg_value); | |||
| } | |||
| if (args_spec_list.size() >= 2) { | |||
| MS_EXCEPTION_IF_NULL(args_spec_list[1]); | |||
| auto arg_value = args_spec_list[1]->BuildValue(); | |||
| if (!arg_value->isa<Int32Imm>()) { | |||
| MS_LOG(EXCEPTION) << "Only supported input an int32 number."; | |||
| } | |||
| arg2 = GetValue<int>(arg_value); | |||
| } | |||
| if (args_spec_list.size() == 3) { | |||
| MS_EXCEPTION_IF_NULL(args_spec_list[2]); | |||
| auto arg_value = args_spec_list[2]->BuildValue(); | |||
| if (!arg_value->isa<Int32Imm>()) { | |||
| MS_LOG(EXCEPTION) << "Only supported input an int32 number."; | |||
| } | |||
| slide->step = GetValue<int>(arg_value); | |||
| slide->start = arg1; | |||
| slide->stop = arg2; | |||
| } | |||
| if (args_spec_list.size() == 2) { | |||
| slide->start = arg1; | |||
| slide->stop = arg2; | |||
| } | |||
| if (args_spec_list.size() == 1) { | |||
| slide->stop = arg1; | |||
| } | |||
| } | |||
| AbstractBasePtr InferImplMakeRange(const AnalysisEnginePtr &, const PrimitivePtr &, | |||
| const AbstractBasePtrList &args_spec_list) { | |||
| if (args_spec_list.empty()) { | |||
| @@ -709,5 +508,145 @@ AbstractBasePtr InferImplStopGradient(const AnalysisEnginePtr &, const Primitive | |||
| CheckArgsSize(primitive->name(), args_spec_list, 1); | |||
| return args_spec_list[0]->Clone(); | |||
| } | |||
| AbstractBasePtr InferImplTupleEqual(const AnalysisEnginePtr &, const PrimitivePtr &primitive, | |||
| const AbstractBasePtrList &args_spec_list) { | |||
| return InferImplTupleOrListEqual<AbstractTuple>(primitive->name(), args_spec_list); | |||
| } | |||
| AbstractBasePtr InferImplListEqual(const AnalysisEnginePtr &, const PrimitivePtr &primitive, | |||
| const AbstractBasePtrList &args_spec_list) { | |||
| return InferImplTupleOrListEqual<AbstractList>(primitive->name(), args_spec_list); | |||
| } | |||
| AbstractBasePtr InferImplStringEqual(const AnalysisEnginePtr &, const PrimitivePtr &primitive, | |||
| const AbstractBasePtrList &args_spec_list) { | |||
| // Inputs: two scalars whose value is a string. | |||
| const std::string op_name = primitive->name(); | |||
| CheckArgsSize(op_name, args_spec_list, 2); | |||
| AbstractScalarPtr scalar_x = CheckArg<AbstractScalar>(op_name, args_spec_list, 0); | |||
| AbstractScalarPtr scalar_y = CheckArg<AbstractScalar>(op_name, args_spec_list, 1); | |||
| ValuePtr value_x = scalar_x->BuildValue(); | |||
| ValuePtr value_y = scalar_y->BuildValue(); | |||
| if (!value_x->isa<StringImm>() || !value_y->isa<StringImm>()) { | |||
| MS_LOG(EXCEPTION) << op_name << " requires 2 parameters are string, but got param0: " << value_x->ToString() | |||
| << ", param1: " << value_y->ToString(); | |||
| } | |||
| bool ret = (value_x->cast<StringImmPtr>()->value() == value_y->cast<StringImmPtr>()->value()); | |||
| return std::make_shared<AbstractScalar>(ret); | |||
| } | |||
| AbstractBasePtr InferImplStringConcat(const AnalysisEnginePtr &, const PrimitivePtr &primitive, | |||
| const AbstractBasePtrList &args_spec_list) { | |||
| // Inputs: two scalars whose value is a string. | |||
| const std::string op_name = primitive->name(); | |||
| CheckArgsSize(op_name, args_spec_list, 2); | |||
| AbstractScalarPtr scalar_x = CheckArg<AbstractScalar>(op_name, args_spec_list, 0); | |||
| AbstractScalarPtr scalar_y = CheckArg<AbstractScalar>(op_name, args_spec_list, 1); | |||
| ValuePtr value_x = scalar_x->BuildValue(); | |||
| ValuePtr value_y = scalar_y->BuildValue(); | |||
| if (!value_x->isa<StringImm>() || !value_y->isa<StringImm>()) { | |||
| MS_LOG(EXCEPTION) << op_name << " requires 2 parameters are string, but got param0: " << value_x->ToString() | |||
| << ", param1: " << value_y->ToString(); | |||
| } | |||
| std::string ret = (value_x->cast<StringImmPtr>()->value() + value_y->cast<StringImmPtr>()->value()); | |||
| return std::make_shared<AbstractScalar>(ret); | |||
| } | |||
| AbstractBasePtr InferImplDictLen(const AnalysisEnginePtr &, const PrimitivePtr &primitive, | |||
| const AbstractBasePtrList &args_spec_list) { | |||
| return InferTupleOrListOrDictLen<AbstractDictionary>(primitive->name(), args_spec_list); | |||
| } | |||
| AbstractBasePtr InferImplJ(const AnalysisEnginePtr &, const PrimitivePtr &primitive, | |||
| const AbstractBasePtrList &args_spec_list) { | |||
| // args: An object of AbstractFunction. | |||
| CheckArgsSize(primitive->name(), args_spec_list, 1); | |||
| MS_LOG(DEBUG) << "evaluate J: " << args_spec_list[0]->ToString(); | |||
| AbstractFunctionPtr x = dyn_cast<AbstractFunction>(args_spec_list[0]); | |||
| if (x == nullptr) { | |||
| return std::make_shared<AbstractJTagged>(args_spec_list[0]); | |||
| } | |||
| AbstractFuncAtomPtrList jv; | |||
| auto build_jv = [&jv](const AbstractFuncAtomPtr &func) { | |||
| auto j_closure = std::make_shared<JTransformedAbstractClosure>(func); | |||
| jv.push_back(j_closure); | |||
| }; | |||
| x->Visit(build_jv); | |||
| return AbstractFunction::MakeAbstractFunction(jv); | |||
| } | |||
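| // J marks a value for automatic differentiation: applied to a function it produces the | |||
| // J-transformed closure consumed by grad, and applied to plain data it just tags the value | |||
| // as AbstractJTagged. | |||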
| AbstractBasePtr InferImplFakeBprop(const AnalysisEnginePtr &, const PrimitivePtr &primitive, | |||
| const AbstractBasePtrList &args_spec_list) { | |||
| // Inputs: a tensor. | |||
| CheckArgsSize(primitive->name(), args_spec_list, 1); | |||
| return args_spec_list[0]->Broaden(); | |||
| } | |||
| // Eval the return type of make_record | |||
| AbstractBasePtr InferImplMakeRecord(const AnalysisEnginePtr &, const PrimitivePtr &primitive, | |||
| const AbstractBasePtrList &args_spec_list) { | |||
| // Inputs: at least two objects of a subclass of AbstractBase. | |||
| if (args_spec_list.size() < 2) { | |||
| MS_LOG(EXCEPTION) << "Typeof evaluator requires more than 1 parameter, while the input size is " | |||
| << args_spec_list.size() << "."; | |||
| } | |||
| // args_spec_list[0] may be an AbstractScalarPtr or an AbstractTypePtr | |||
| MS_EXCEPTION_IF_NULL(args_spec_list[0]); | |||
| TypePtr type = args_spec_list[0]->GetTypeTrack(); | |||
| MS_EXCEPTION_IF_NULL(type); | |||
| if (type->type_id() != kMetaTypeTypeType) { | |||
| MS_LOG(EXCEPTION) << "Can not make type(" << type->ToString() << ")not TypeType"; | |||
| } | |||
| ValuePtr value_track = args_spec_list[0]->GetValueTrack(); | |||
| MS_EXCEPTION_IF_NULL(value_track); | |||
| TypePtr type_ptr = value_track->cast<TypePtr>(); | |||
| if (type_ptr == nullptr) { | |||
| MS_LOG(EXCEPTION) << "Value type error, not Me type:" << value_track->ToString(); | |||
| } | |||
| auto cls = dyn_cast<Class>(type_ptr); | |||
| MS_EXCEPTION_IF_NULL(cls); | |||
| ClassAttrVector attributes = cls->GetAttributes(); | |||
| CheckArgsSize(primitive->name(), args_spec_list, attributes.size() + 1); | |||
| std::vector<AbstractAttribute> abs_attributes; | |||
| for (size_t i = 0; i < attributes.size(); i++) { | |||
| AbstractAttribute elem(attributes[i].first, args_spec_list[i + 1]); | |||
| abs_attributes.push_back(elem); | |||
| } | |||
| return std::make_shared<AbstractClass>(cls->tag(), abs_attributes, cls->methods()); | |||
| } | |||
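| // For illustration: make_record(SomeClassType, attr_1, ..., attr_n) must pass exactly one | |||
| // argument per class attribute after the type and infers an AbstractClass carrying those | |||
| // attribute specs (SomeClassType here is a placeholder name). | |||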
| REGISTER_FRONTENT_PRIMITIVE_EVAL_IMPL(TypeOf, prim::kPrimTypeOf, InferImplTypeof); | |||
| REGISTER_FRONTENT_PRIMITIVE_EVAL_IMPL(HasType, prim::kPrimHasType, InferImplHasType); | |||
| REGISTER_FRONTENT_PRIMITIVE_EVAL_IMPL(MakeRecord, prim::kPrimMakeRecord, InferImplMakeRecord); | |||
| REGISTER_FRONTENT_PRIMITIVE_EVAL_IMPL(ListMap, prim::kPrimListMap, InferImplListMap); | |||
| REGISTER_FRONTENT_PRIMITIVE_EVAL_IMPL(ListReduce, prim::kPrimListReduce, InferImplListReduce); | |||
| REGISTER_FRONTENT_PRIMITIVE_EVAL_IMPL(TupleReversed, prim::kPrimTupleReversed, InferImplTupleReversed); | |||
| REGISTER_FRONTENT_PRIMITIVE_EVAL_IMPL(ReducedShape, prim::kPrimReducedShape, InferImplReduceShape); | |||
| REGISTER_FRONTENT_PRIMITIVE_EVAL_IMPL(TupleDiv, prim::kPrimTupleDiv, InferImplTupleDiv); | |||
| REGISTER_FRONTENT_PRIMITIVE_EVAL_IMPL(TupleToArray, prim::kPrimTupleToArray, InferImplTuple2Array); | |||
| REGISTER_FRONTENT_PRIMITIVE_EVAL_IMPL(ShapeMul, prim::kPrimShapeMul, InferImplShapeMul); | |||
| REGISTER_FRONTENT_PRIMITIVE_EVAL_IMPL(TupleEqual, prim::kPrimTupleEqual, InferImplTupleEqual); | |||
| REGISTER_FRONTENT_PRIMITIVE_EVAL_IMPL(ListEqual, prim::kPrimListEqual, InferImplListEqual); | |||
| REGISTER_FRONTENT_PRIMITIVE_EVAL_IMPL(MakeRange, prim::kPrimMakeRange, InferImplMakeRange); | |||
| REGISTER_FRONTENT_PRIMITIVE_EVAL_IMPL(StopGradient, prim::kPrimStopGradient, InferImplStopGradient); | |||
| REGISTER_FRONTENT_PRIMITIVE_EVAL_IMPL(StringEqual, prim::kPrimStringEqual, InferImplStringEqual); | |||
| REGISTER_FRONTENT_PRIMITIVE_EVAL_IMPL(StringConcat, prim::kPrimStringConcat, InferImplStringConcat); | |||
| REGISTER_FRONTENT_PRIMITIVE_EVAL_IMPL(DictLen, prim::kPrimDictLen, InferImplDictLen); | |||
| REGISTER_FRONTENT_PRIMITIVE_EVAL_IMPL(FakeBprop, prim::kPrimFakeBprop, InferImplFakeBprop); | |||
| REGISTER_FRONTENT_PRIMITIVE_EVAL_IMPL(J, prim::kPrimJ, InferImplJ); | |||
| REGISTER_FRONTENT_PRIMITIVE_EVAL_IMPL(BroadcastGradientArgs, prim::kPrimBroadcastGradientArgs, | |||
| InferImplBroadcastGradientArgs); | |||
| } // namespace abstract | |||
| } // namespace mindspore | |||
| @@ -0,0 +1,77 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_FRONTEND_OPERATE_OPS_FRONT_INFER_FUNCTION_H_ | |||
| #define MINDSPORE_CCSRC_FRONTEND_OPERATE_OPS_FRONT_INFER_FUNCTION_H_ | |||
| #include "abstract/abstract_value.h" | |||
| #include "abstract/primitive_infer_map.h" | |||
| namespace mindspore { | |||
| namespace abstract { | |||
| AbstractBasePtr InferImplTypeof(const AnalysisEnginePtr &, const PrimitivePtr &, | |||
| const AbstractBasePtrList &args_spec_list); | |||
| AbstractBasePtr InferImplHasType(const AnalysisEnginePtr &, const PrimitivePtr &primitive, | |||
| const AbstractBasePtrList &args_spec_list); | |||
| AbstractBasePtr InferImplBroadcastGradientArgs(const AnalysisEnginePtr &, const PrimitivePtr &primitive, | |||
| const AbstractBasePtrList &args_spec_list); | |||
| AbstractBasePtr InferImplListMap(const AnalysisEnginePtr &, const PrimitivePtr &primitive, | |||
| const AbstractBasePtrList &args_spec_list); | |||
| AbstractBasePtr InferImplListReduce(const AnalysisEnginePtr &, const PrimitivePtr &primitive, | |||
| const AbstractBasePtrList &args_spec_list); | |||
| AbstractBasePtr InferImplTupleReversed(const AnalysisEnginePtr &, const PrimitivePtr &primitive, | |||
| const AbstractBasePtrList &args_spec_list); | |||
| AbstractBasePtr InferImplReduceShape(const AnalysisEnginePtr &, const PrimitivePtr &primitive, | |||
| const AbstractBasePtrList &args_spec_list); | |||
| AbstractBasePtr InferImplTupleDiv(const AnalysisEnginePtr &, const PrimitivePtr &primitive, | |||
| const AbstractBasePtrList &args_spec_list); | |||
| AbstractBasePtr InferImplTuple2Array(const AnalysisEnginePtr &, const PrimitivePtr &primitive, | |||
| const AbstractBasePtrList &args_spec_list); | |||
| AbstractBasePtr InferImplShapeMul(const AnalysisEnginePtr &, const PrimitivePtr &primitive, | |||
| const AbstractBasePtrList &args_spec_list); | |||
| AbstractBasePtr InferImplTupleEqual(const AnalysisEnginePtr &, const PrimitivePtr &primitive, | |||
| const AbstractBasePtrList &args_spec_list); | |||
| AbstractBasePtr InferImplListEqual(const AnalysisEnginePtr &, const PrimitivePtr &primitive, | |||
| const AbstractBasePtrList &args_spec_list); | |||
| AbstractBasePtr InferImplMakeRange(const AnalysisEnginePtr &, const PrimitivePtr &primitive, | |||
| const AbstractBasePtrList &args_spec_list); | |||
| AbstractBasePtr InferImplStopGradient(const AnalysisEnginePtr &, const PrimitivePtr &primitive, | |||
| const AbstractBasePtrList &args_spec_list); | |||
| AbstractBasePtr InferImplStringEqual(const AnalysisEnginePtr &, const PrimitivePtr &primitive, | |||
| const AbstractBasePtrList &args_spec_list); | |||
| AbstractBasePtr InferImplStringConcat(const AnalysisEnginePtr &, const PrimitivePtr &primitive, | |||
| const AbstractBasePtrList &args_spec_list); | |||
| AbstractBasePtr InferImplDictLen(const AnalysisEnginePtr &, const PrimitivePtr &primitive, | |||
| const AbstractBasePtrList &args_spec_list); | |||
| AbstractBasePtr InferImplJ(const AnalysisEnginePtr &, const PrimitivePtr &primitive, | |||
| const AbstractBasePtrList &args_spec_list); | |||
| AbstractBasePtr InferImplFakeBprop(const AnalysisEnginePtr &, const PrimitivePtr &primitive, | |||
| const AbstractBasePtrList &args_spec_list); | |||
| AbstractBasePtr InferImplMakeRecord(const AnalysisEnginePtr &, const PrimitivePtr &primitive, | |||
| const AbstractBasePtrList &args_spec_list); | |||
| class RegisterFrontendPrimitiveEvalHelper { | |||
| public: | |||
| RegisterFrontendPrimitiveEvalHelper(const PrimitivePtr &primitive, const StandardPrimitiveEvalImpl &impl) { | |||
| const StandardPrimitiveImplReg impl_reg{impl, false}; | |||
| RegisterStandardPrimitiveImpl(primitive, impl_reg); | |||
| } | |||
| ~RegisterFrontendPrimitiveEvalHelper() = default; | |||
| }; | |||
| #define REGISTER_FRONTENT_PRIMITIVE_EVAL_IMPL(name, primitive, impl) \ | |||
| static auto helper_##name = RegisterFrontendPrimitiveEvalHelper(primitive, impl) | |||
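| // Each REGISTER_FRONTENT_PRIMITIVE_EVAL_IMPL(name, primitive, impl) use expands to a static | |||
| // RegisterFrontendPrimitiveEvalHelper whose constructor registers impl for the primitive at | |||
| // start-up; this is how the InferImpl* declarations above are wired into the evaluator. | |||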
| } // namespace abstract | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_FRONTEND_OPERATE_OPS_FRONT_INFER_FUNCTION_H_ | |||
| @@ -95,10 +95,10 @@ OptimizeIRPassLib::OptimizeIRPassLib() { | |||
| // Ref eliminate | |||
| make_ref_eliminate_ = | |||
| MakeSubstitution(std::make_shared<MakeRefEliminater>(), "make_ref_eliminate", prim::kPrimMakeRef); | |||
| get_ref_param_eliminate_ = MakeSubstitution(std::make_shared<GetRefParamEliminater>(), "get_ref_param_eliminate", | |||
| {prim::kPrimGetRefValue, prim::kPrimGetRefOrigin}); | |||
| get_ref_param_eliminate_ = | |||
| MakeSubstitution(std::make_shared<GetRefParamEliminater>(), "get_ref_param_eliminate", {prim::kPrimGetRefValue}); | |||
| get_make_ref_eliminate_ = MakeSubstitution(std::make_shared<GetMakeRefEliminater>(), "get_make_ref_eliminate", | |||
| {prim::kPrimGetRefKey, prim::kPrimGetRefValue, prim::kPrimGetRefOrigin}); | |||
| {prim::kPrimGetRefKey, prim::kPrimGetRefValue}); | |||
| replace_refkey_by_param_ = MakeSubstitution(std::make_shared<ReplaceRefkeyByParam>(), "replace_refkey_by_param", | |||
| IsValueNode<RefKey>, opt::FORCE_RENORM); | |||
| @@ -20,9 +20,6 @@ namespace mindspore { | |||
| namespace opt { | |||
| namespace irpass { | |||
| AnfNodePtr ArithmeticSimplify::operator()(const OptimizerPtr &, const AnfNodePtr &node) { | |||
| if (MsContext::GetInstance()->execution_mode() == kPynativeMode) { | |||
| return nullptr; | |||
| } | |||
| PatternNode x, y, z, xs; | |||
| PConstant one_(node, false, 1); | |||
| PConstant one_scalar_(node, false, 1, true); | |||
| @@ -32,16 +29,21 @@ AnfNodePtr ArithmeticSimplify::operator()(const OptimizerPtr &, const AnfNodePtr | |||
| PConstant const_2(node); | |||
| PConstant any_const(node); | |||
| MATCH_REPLACE(node, x + zero_, x); // Add by zero | |||
| MATCH_REPLACE(node, x + zero_scalar_, x); // Add by zero | |||
| MATCH_REPLACE(node, PBinOperation(prim::kPrimScalarAdd, x, zero_scalar_, true), x); // Scalar Add by zero | |||
| MATCH_REPLACE_IF(node, x * one_, any_const.WithValueOf(x), !one_.CheckFunc(IsParam, node)); // Multiply by one | |||
| MATCH_REPLACE(node, PBinOperation(prim::kPrimScalarMul, x, one_scalar_, true), x); // Scalar Mul by one | |||
| if (MsContext::GetInstance()->execution_mode() != kPynativeMode) { | |||
| MATCH_REPLACE(node, x + zero_, x); // Add by zero | |||
| MATCH_REPLACE(node, x + zero_scalar_, x); // Add by zero | |||
| MATCH_REPLACE(node, PBinOperation(prim::kPrimScalarAdd, x, zero_scalar_, true), x); // Scalar Add by zero | |||
| MATCH_REPLACE_IF(node, x * one_, any_const.WithValueOf(x), !one_.CheckFunc(IsParam, node)); // Multiply by one | |||
| MATCH_REPLACE(node, PBinOperation(prim::kPrimScalarMul, x, one_scalar_, true), x); // Scalar Mul by one | |||
| // Scalar Mul by zero | |||
| MATCH_REPLACE(node, PBinOperation(prim::kPrimScalarMul, x, zero_scalar_, true), zero_scalar_.NewValue()); | |||
| // Scalar Mul by zero | |||
| MATCH_REPLACE(node, PBinOperation(prim::kPrimScalarMul, x, zero_scalar_, true), zero_scalar_.NewValue()); | |||
| } | |||
| // Prim Eliminate (identity) | |||
| MATCH_REPLACE(node, PPrimitive(prim::kPrimIdentity, x), x); | |||
| if (MsContext::GetInstance()->execution_mode() == kPynativeMode) { | |||
| return nullptr; | |||
| } | |||
| // ConstantDuplicateMul | |||
| auto const_dup_lambda = [&node, &x, &const_, &const_2]() -> AnfNodePtr { | |||
| @@ -95,37 +97,37 @@ AnfNodePtr ArithmeticSimplify2::operator()(const OptimizerPtr &, const AnfNodePt | |||
| // {prim::kPrimAddN, {prim::kPrimMakeTuple, {prim::kPrimMul, {prim::kPrimAllReduce, X}, Y}, Z}} -> | |||
| // {prim::kPrimMul, {prim::kPrimAllReduce, {prim::kPrimAddN,{prim::kPrimMakeTuple, Z, X}}}, Y} | |||
| AnfNodePtr AdjustAllReduceMulAdd::operator()(const OptimizerPtr &, const AnfNodePtr &node) { | |||
| Reset(); | |||
| // {prim::kPrimAddN, Zs} | |||
| if (!IsPrimitiveCNode(node, prim::kPrimAddN)) { | |||
| return nullptr; | |||
| } | |||
| auto addn = node->cast<CNodePtr>(); | |||
| if (addn->size() != 2) { | |||
| return nullptr; | |||
| } | |||
| AnfVisitor::Match(prim::kPrimMakeTuple, {IsNode, IsNode})(addn->input(1)); | |||
| if (x_ == nullptr || y_ == nullptr || z_ == nullptr || all_reduce_fg_ == nullptr) { | |||
| return nullptr; | |||
| } | |||
| auto addn_maketuple = addn->input(1); | |||
| auto fg = all_reduce_fg_; | |||
| // addn inputs cross the graph, make the inputs same as allreduce node. | |||
| if (z_->isa<CNode>() && fg != z_->func_graph()) { | |||
| auto cnode_z = z_->cast<CNodePtr>(); | |||
| z_ = NewCNode(cnode_z->inputs(), fg); | |||
| } | |||
| auto addn_op_node = addn->input(0); | |||
| auto make_tuple_op_node = addn->input(1)->cast<CNodePtr>()->input(0); | |||
| PatternNode x, y, z; | |||
| auto all_reduce_pat = PPrimitive(prim::kPrimAllReduce, x); | |||
| auto mul_pat = PBinOperation(prim::kPrimMul, all_reduce_pat, y, true); | |||
| auto admktup_pat = PBinOperation(prim::kPrimMakeTuple, mul_pat, z, true); | |||
| auto addn_pat = PPrimitive(prim::kPrimAddN, admktup_pat); | |||
| auto adjust_lambda = [&node, &x, &y, &z, &addn_pat, &all_reduce_pat, &admktup_pat, &mul_pat, this]() -> AnfNodePtr { | |||
| auto fg = all_reduce_pat.GetFuncGraph(); | |||
| auto z_ = z.GetNode(node); | |||
| // If addn inputs cross the graph, make the inputs same as allreduce node. | |||
| if (z_->isa<CNode>() && fg != z_->func_graph()) { | |||
| auto cnode_z = z_->cast<CNodePtr>(); | |||
| z_ = NewCNode(cnode_z->inputs(), fg); | |||
| } | |||
| AnfNodePtr tuple = NewCNode({make_tuple_op_node, z_, x_}, fg); | |||
| AnfNodePtr add = NewCNode({addn_op_node, tuple}, fg); | |||
| AnfNodePtr all_reduce = NewCNode({all_reduce_, add}, fg); | |||
| AnfNodePtr mul = NewCNode({mul_, all_reduce, y_}, fg); | |||
| ProcessDependEdge(fg, addn_maketuple, all_reduce); | |||
| return mul; | |||
| auto addn_cnode = addn_pat.GetOriginalNode()->cast<CNodePtr>(); | |||
| auto addn_op_node = addn_cnode->input(0); | |||
| auto make_tuple_op_node = addn_cnode->input(1)->cast<CNodePtr>()->input(0); | |||
| auto all_reduce_prim = all_reduce_pat.GetOriginalNode()->cast<CNodePtr>()->input(0); | |||
| mul_cnode_ = mul_pat.GetOriginalNode(); | |||
| auto mul_prim = mul_cnode_->cast<CNodePtr>()->input(0); | |||
| auto addn_maketuple = admktup_pat.GetOriginalNode(); | |||
| AnfNodePtr tuple = NewCNode({make_tuple_op_node, z_, x.GetNode(node)}, fg); | |||
| AnfNodePtr add = NewCNode({addn_op_node, tuple}, fg); | |||
| AnfNodePtr all_reduce = NewCNode({all_reduce_prim, add}, fg); | |||
| AnfNodePtr mul = NewCNode({mul_prim, all_reduce, y.GetNode(node)}, fg); | |||
| ProcessDependEdge(fg, addn_maketuple, all_reduce); | |||
| return mul; | |||
| }; | |||
| MATCH_REPLACE_LAMBDA(node, addn_pat, adjust_lambda); | |||
| return nullptr; | |||
| } | |||
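| // Worked example (sketch, tensors illustrative): a fragment | |||
| //   t   = Mul(AllReduce(X), Y) | |||
| //   out = AddN(MakeTuple(t, Z)) | |||
| // is rewritten by the pattern above into | |||
| //   out = Mul(AllReduce(AddN(MakeTuple(Z, X))), Y) | |||
| // so the local accumulation happens before the AllReduce and the scaling by Y is applied once | |||
| // to the reduced result. | |||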
| void AdjustAllReduceMulAdd::ProcessDependEdge(const FuncGraphPtr &fg, const AnfNodePtr &addn_maketuple, | |||
| @@ -146,48 +148,6 @@ void AdjustAllReduceMulAdd::ProcessDependEdge(const FuncGraphPtr &fg, const AnfN | |||
| } | |||
| } | |||
| void AdjustAllReduceMulAdd::Visit(const AnfNodePtr &node) { | |||
| if (level_ == 0) { | |||
| level_ = 1; | |||
| is_reduce_match_ = false; | |||
| // {prim::kPrimMul, {prim::kPrimAllReduce, X}, Y} | |||
| AnfVisitor::Match(prim::kPrimMul)(node); | |||
| level_ = 0; | |||
| if (is_reduce_match_) { | |||
| mul_ = node->cast<CNodePtr>()->input(0); | |||
| mul_cnode_ = node->cast<CNodePtr>(); | |||
| y_ = tmp_; | |||
| } else { | |||
| z_ = node; | |||
| } | |||
| } | |||
| if (level_ == 1) { | |||
| // {prim::kPrimAllReduce, X} | |||
| if (IsPrimitiveCNode(node, prim::kPrimAllReduce)) { | |||
| auto cnode = node->cast<CNodePtr>(); | |||
| if (cnode->size() > 1) { | |||
| all_reduce_ = cnode->input(0); | |||
| x_ = cnode->input(1); | |||
| is_reduce_match_ = true; | |||
| all_reduce_fg_ = cnode->func_graph(); | |||
| } | |||
| } else { | |||
| tmp_ = node; | |||
| } | |||
| } | |||
| } | |||
| void AdjustAllReduceMulAdd::Reset() { | |||
| level_ = 0; | |||
| is_reduce_match_ = false; | |||
| x_ = nullptr; | |||
| y_ = nullptr; | |||
| z_ = nullptr; | |||
| tmp_ = nullptr; | |||
| all_reduce_fg_ = nullptr; | |||
| } | |||
| } // namespace irpass | |||
| } // namespace opt | |||
| } // namespace mindspore | |||
| @@ -38,20 +38,14 @@ namespace irpass { | |||
| // {prim::kPrimAddN, {prim::kPrimMakeTuple, {prim::kPrimMul, {prim::kPrimAllReduce, X}, Y}, Z}} -> | |||
| // {prim::kPrimMul, {prim::kPrimAllReduce, {prim::kPrimAddN,{prim::kPrimMakeTuple, Z, X}}}, Y} | |||
| class AdjustAllReduceMulAdd : public AnfVisitor { | |||
| class AdjustAllReduceMulAdd : public OptimizerCaller { | |||
| public: | |||
| AnfNodePtr operator()(const OptimizerPtr &, const AnfNodePtr &node) override; | |||
| void ProcessDependEdge(const FuncGraphPtr &fg, const AnfNodePtr &addn_maketuple, const AnfNodePtr &new_node); | |||
| void Visit(const AnfNodePtr &node) override; | |||
| void Reset(); | |||
| private: | |||
| int level_{0}; | |||
| bool is_reduce_match_{false}; | |||
| AnfNodePtr x_{nullptr}, y_{nullptr}, z_{nullptr}, tmp_{nullptr}; | |||
| AnfNodePtr all_reduce_{nullptr}, mul_{nullptr}, mul_cnode_{nullptr}; | |||
| FuncGraphPtr all_reduce_fg_{nullptr}; | |||
| AnfNodePtr mul_cnode_{nullptr}; | |||
| }; | |||
| class ArithmeticSimplify : public OptimizerCaller { | |||
| @@ -37,27 +37,23 @@ class MakeRefEliminater : public OptimizerCaller { | |||
| }; | |||
| // {prim::kPrimGetRefValue, Parameter} -> Parameter | |||
| // {prim::kPrimGetRefOrigin, Parameter} -> Parameter | |||
| class GetRefParamEliminater : public OptimizerCaller { | |||
| public: | |||
| AnfNodePtr operator()(const OptimizerPtr &, const AnfNodePtr &node) override { | |||
| PatternNode<AnfNodePtr> x; | |||
| MATCH_REPLACE(node, PPrimitive(prim::kPrimGetRefValue, x), x); | |||
| MATCH_REPLACE(node, PPrimitive(prim::kPrimGetRefOrigin, x), x); | |||
| return nullptr; | |||
| } | |||
| }; | |||
| // {prim::kPrimGetRefKey, {prim::kPrimMakeRef, X, Y, Z}} -> X | |||
| // {prim::kPrimGetRefValue, {prim::kPrimMakeRef, X, Y, Z}} -> Y | |||
| // {prim::kPrimGetRefOrigin, {prim::kPrimMakeRef, X, Y, Z}} -> Z | |||
| class GetMakeRefEliminater : public OptimizerCaller { | |||
| public: | |||
| AnfNodePtr operator()(const OptimizerPtr &, const AnfNodePtr &node) override { | |||
| PatternNode<AnfNodePtr> x, y, z; | |||
| MATCH_REPLACE(node, PPrimitive(prim::kPrimGetRefKey, PPrimitive(prim::kPrimMakeRef, x, y, z)), x); | |||
| MATCH_REPLACE(node, PPrimitive(prim::kPrimGetRefValue, PPrimitive(prim::kPrimMakeRef, x, y, z)), y); | |||
| MATCH_REPLACE(node, PPrimitive(prim::kPrimGetRefOrigin, PPrimitive(prim::kPrimMakeRef, x, y, z)), z); | |||
| return nullptr; | |||
| } | |||
| @@ -197,6 +197,9 @@ class CostGraph { | |||
| inputs_tensor_name_list_.push_back(inputs_tensor_name); | |||
| } | |||
| const std::vector<std::vector<std::string>> get_inputs_tensor_name_list() const { return inputs_tensor_name_list_; } | |||
| void set_inputs_tensor_name_list(const std::vector<std::vector<std::string>> &inputs_tensor_name_list) { | |||
| inputs_tensor_name_list_ = inputs_tensor_name_list; | |||
| } | |||
| void add_tuple_getitem(const std::pair<std::string, std::string> &tuple_getitem) { | |||
| auto ret = tuple_getitem_list_.insert(tuple_getitem); | |||
| if (ret.second == false) { | |||
| @@ -199,6 +199,8 @@ class SoftmaxCost : public OperatorCost { | |||
| using SoftmaxCostPtr = std::shared_ptr<SoftmaxCost>; | |||
| using TileCost = SoftmaxCost; | |||
| using TileCostPtr = std::shared_ptr<TileCost>; | |||
| using ConcatCost = TileCost; | |||
| using ConcatCostPtr = std::shared_ptr<ConcatCost>; | |||
| class TmpIdentityCost : public OperatorCost { | |||
| public: | |||
| @@ -136,6 +136,7 @@ REGISTER(EmbeddingLookupInfo); | |||
| REGISTER(TileInfo); | |||
| REGISTER(StridedSliceInfo); | |||
| REGISTER(DropoutInfo); | |||
| REGISTER(ConcatInfo); | |||
| } // namespace parallel | |||
| } // namespace mindspore | |||
| @@ -24,7 +24,6 @@ | |||
| namespace mindspore { | |||
| namespace parallel { | |||
| const std::set<std::string> BLACK_LIST = {TUPLE_GETITEM, | |||
| MAKE_TUPLE, | |||
| J, | |||
| LIST_GETITEM, | |||
| ARRAY_GETITEM, | |||
| @@ -0,0 +1,268 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "frontend/parallel/ops_info/concat_info.h" | |||
| #include <algorithm> | |||
| #include <memory> | |||
| #include <utility> | |||
| #include <vector> | |||
| #include "frontend/parallel/device_matrix.h" | |||
| #include "frontend/parallel/strategy.h" | |||
| #include "frontend/parallel/tensor_layout/tensor_redistribution.h" | |||
| #include "pipeline/jit/resource.h" | |||
| namespace mindspore { | |||
| namespace parallel { | |||
| Status ConcatInfo::GetAttrs() { | |||
| int axis = 0; | |||
| auto axis_iter = attrs_.find(AXIS); | |||
| if (axis_iter != attrs_.end()) { | |||
| MS_EXCEPTION_IF_NULL(axis_iter->second); | |||
| if (axis_iter->second->isa<Int32Imm>()) { | |||
| axis = axis_iter->second->cast<Int32ImmPtr>()->value(); | |||
| } else { | |||
| MS_LOG(ERROR) << name_ << ": The value of axis is not int"; | |||
| return FAILED; | |||
| } | |||
| } else { | |||
| MS_LOG(ERROR) << name_ << ": Can not find the axis attr"; | |||
| return FAILED; | |||
| } | |||
| if (inputs_shape_.empty()) { | |||
| MS_LOG(ERROR) << name_ << ": The inputs shape is empty"; | |||
| return FAILED; | |||
| } | |||
| int dim = SizeToInt(inputs_shape_[0].size()); | |||
| if (axis < 0) { | |||
| axis = axis + dim; | |||
| } | |||
| axis_ = IntToSize(axis); | |||
| return SUCCESS; | |||
| } | |||
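| // Example (illustrative): for rank-4 inputs with attribute axis = -1, dim is 4, so the negative | |||
| // axis is normalized to 3 before being stored in axis_. | |||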
| Status ConcatInfo::CheckStrategy(const StrategyPtr &strategy) { | |||
| MS_EXCEPTION_IF_NULL(strategy); | |||
| if (CheckStrategyValue(strategy, inputs_shape_, is_auto_parallel_) != SUCCESS) { | |||
| MS_LOG(ERROR) << name_ << ": Invalid strategy"; | |||
| return FAILED; | |||
| } | |||
| std::vector<Dimensions> stra = strategy->GetInputDim(); | |||
| if (stra.empty()) { | |||
| MS_LOG(ERROR) << name_ << ": The strategy is empty"; | |||
| return FAILED; | |||
| } | |||
| if (stra.size() != inputs_shape_.size()) { | |||
| MS_LOG(ERROR) << name_ << ": The size of strategy must be equal to the size of inputs shape"; | |||
| return FAILED; | |||
| } | |||
| for (size_t i = 0; i < stra.size(); ++i) { | |||
| auto strategy_ele = stra[i]; | |||
| auto input_shape_ele = inputs_shape_[i]; | |||
| if (strategy_ele.size() != input_shape_ele.size()) { | |||
| MS_LOG(ERROR) << name_ << ": The size of strategy element must be equal to the size of input shape"; | |||
| return FAILED; | |||
| } | |||
| if (axis_ >= strategy_ele.size()) { | |||
| MS_LOG(ERROR) << name_ << ": The axis is out of range, the axis is " << axis_; | |||
| return FAILED; | |||
| } | |||
| if (strategy_ele[axis_] != 1) { | |||
| MS_LOG(ERROR) << name_ << ": The axis can not be split"; | |||
| return FAILED; | |||
| } | |||
| for (size_t j = 0; j < strategy_ele.size(); ++j) { | |||
| if (strategy_ele[j] != stra[0][j]) { | |||
| MS_LOG(ERROR) << name_ << ": The strategy of each input tensor must be equal"; | |||
| return FAILED; | |||
| } | |||
| } | |||
| } | |||
| return SUCCESS; | |||
| } | |||
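| // Example (illustrative): concatenating two [64, 32] tensors along axis_ = 1 on 8 devices, | |||
| // strategy ((8, 1), (8, 1)) passes the checks above, while ((4, 2), (4, 2)) fails because the | |||
| // concat axis is split and ((8, 1), (4, 1)) fails because the per-input strategies differ. | |||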
| Status ConcatInfo::InferDevMatrixShape() { | |||
| MS_EXCEPTION_IF_NULL(strategy_); | |||
| std::vector<Dimensions> stra = strategy_->GetInputDim(); | |||
| if (stra.empty()) { | |||
| MS_LOG(ERROR) << name_ << "The strategy is empty"; | |||
| return FAILED; | |||
| } | |||
| dev_matrix_shape_ = stra[0]; | |||
| return SUCCESS; | |||
| } | |||
| Status ConcatInfo::InferTensorMap() { | |||
| TensorMap tensor_map; | |||
| if (inputs_shape_.empty()) { | |||
| MS_LOG(ERROR) << name_ << "The inputs shape is empty"; | |||
| return FAILED; | |||
| } | |||
| // cannot use dev_matrix_shape_ to replace inputs_shape_[0], because it may not be fully split across all devices. | |||
| int32_t size = SizeToInt(inputs_shape_[0].size()); | |||
| for (int i = 0; i < size; ++i) { | |||
| tensor_map.push_back(size - i - 1); | |||
| } | |||
| for (size_t i = 0; i < inputs_shape_.size(); ++i) { | |||
| inputs_tensor_map_.push_back(tensor_map); | |||
| } | |||
| outputs_tensor_map_.push_back(tensor_map); | |||
| return SUCCESS; | |||
| } | |||
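| // Example (illustrative): for rank-3 inputs the tensor map is (2, 1, 0); the same map is | |||
| // recorded once per input and once for the output, tying each tensor dimension to the matching | |||
| // device-matrix dimension. | |||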
| Status ConcatInfo::InferMirrorOps() { | |||
| mirror_ops_.clear(); | |||
| if (inputs_tensor_map_.empty()) { | |||
| MS_LOG(ERROR) << name_ << ": The inputs tensor map is empty"; | |||
| return FAILED; | |||
| } | |||
| Shape input_tensor_map = inputs_tensor_map_[0]; | |||
| std::vector<Group> group; | |||
| if (CreateGroupByTensorMap(input_tensor_map, &group) != SUCCESS) { | |||
| MS_LOG(ERROR) << name_ << ": Create group for input failed."; | |||
| return FAILED; | |||
| } | |||
| if (group.empty()) { | |||
| MS_LOG(INFO) << name_ << ": The mirror group is empty."; | |||
| return SUCCESS; | |||
| } | |||
| OperatorVector input_op; | |||
| input_op = CreateMirrorOps(group[0].name(), group[0].GetDevNum()); | |||
| for (size_t i = 0; i < inputs_shape_.size(); ++i) { | |||
| mirror_ops_.push_back(input_op); | |||
| } | |||
| return SUCCESS; | |||
| } | |||
| Status ConcatInfo::InferTensorInfo() { | |||
| if (inputs_shape_.empty() || outputs_shape_.empty() || inputs_tensor_map_.empty() || outputs_tensor_map_.empty()) { | |||
| MS_LOG(ERROR) << name_ << ": Invalid args"; | |||
| return FAILED; | |||
| } | |||
| TensorLayout input_layout, output_layout; | |||
| for (size_t i = 0; i < inputs_shape_.size(); ++i) { | |||
| // infer tensor layout | |||
| if (input_layout.InitFromVector(dev_matrix_shape_, inputs_tensor_map_[i], inputs_shape_[i]) != SUCCESS) { | |||
| MS_LOG(ERROR) << name_ << ": Infer input tensor layout failed."; | |||
| return FAILED; | |||
| } | |||
| TensorInfo input_tensor_info(input_layout); | |||
| inputs_tensor_info_.push_back(input_tensor_info); | |||
| } | |||
| if (output_layout.InitFromVector(dev_matrix_shape_, outputs_tensor_map_[0], outputs_shape_[0]) != SUCCESS) { | |||
| MS_LOG(ERROR) << name_ << ": Infer output tensor layout failed."; | |||
| return FAILED; | |||
| } | |||
| TensorInfo output_tensor_info(output_layout); | |||
| outputs_tensor_info_.push_back(output_tensor_info); | |||
| return SUCCESS; | |||
| } | |||
| void ConcatInfo::ReComputeBatchSplitFlagList() { | |||
| for (size_t i = 0; i < inputs_shape_.size(); i++) { | |||
| split_flag_list_[i] = true; | |||
| } | |||
| } | |||
| Status ConcatInfo::SetCostUnderStrategy(const StrategyPtr &strategy) { | |||
| if (SetCostUnderStrategyBase(strategy) != SUCCESS) { | |||
| MS_LOG(ERROR) << name_ << ": Set cost under strategy failed."; | |||
| return FAILED; | |||
| } | |||
| return SUCCESS; | |||
| } | |||
| Status ConcatInfo::GenerateStrategies(int32_t stage_id) { | |||
| if (InferAttrs() != SUCCESS) { | |||
| MS_LOG(ERROR) << name_ << ": Infer attrs failed"; | |||
| return FAILED; | |||
| } | |||
| if (inputs_shape_.empty()) { | |||
| MS_LOG(ERROR) << name_ << ": The inputs shape is empty"; | |||
| return FAILED; | |||
| } | |||
| Shape input_split; | |||
| for (size_t i = 0; i < inputs_shape_[0].size(); ++i) { | |||
| if (i == axis_) { | |||
| input_split.push_back(0); | |||
| } else { | |||
| input_split.push_back(1); | |||
| } | |||
| } | |||
| Shapes splittable_inputs; | |||
| for (size_t i = 0; i < inputs_shape_.size(); ++i) { | |||
| splittable_inputs.push_back(input_split); | |||
| } | |||
| std::vector<StrategyPtr> sp_vector; | |||
| is_auto_parallel_ = true; | |||
| if (GenerateStrategiesWithBroadcast(stage_id, inputs_shape_, splittable_inputs, &sp_vector) != SUCCESS) { | |||
| return FAILED; | |||
| } | |||
| size_t success = 0; | |||
| for (auto &sp : sp_vector) { | |||
| PrintStrategy(sp); | |||
| if (SetCostUnderStrategy(sp) == SUCCESS) { | |||
| success++; | |||
| MS_LOG(INFO) << name_ << ": Successfully generated " << success << " strategy."; | |||
| PrintStrategy(sp); | |||
| } | |||
| } | |||
| return SUCCESS; | |||
| } | |||
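| // Example (illustrative): for rank-3 inputs with axis_ = 1, input_split is {1, 0, 1}, so every | |||
| // candidate strategy generated for auto-parallel leaves the concat axis unsplit. | |||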
| Status ConcatInfo::Init(const StrategyPtr &strategy) { | |||
| if (InitWithAutoRepeatCalc(strategy) != SUCCESS) { | |||
| MS_LOG(ERROR) << name_ << ": Init failed."; | |||
| return FAILED; | |||
| } | |||
| MS_LOG(INFO) << name_ << ": Init success."; | |||
| return SUCCESS; | |||
| } | |||
| Status ConcatInfo::InitForCostModel(const StrategyPtr &strategy) { | |||
| if (InitForCostModelWithAutoRepeatCalc(strategy) != SUCCESS) { | |||
| MS_LOG(ERROR) << name_ << ": Init for cost model failed."; | |||
| return FAILED; | |||
| } | |||
| MS_LOG(INFO) << name_ << ": Init for cost model success."; | |||
| return SUCCESS; | |||
| } | |||
| } // namespace parallel | |||
| } // namespace mindspore | |||
| @@ -0,0 +1,62 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_FRONTEND_PARALLEL_OPS_INFO_CONCAT_INFO_H_ | |||
| #define MINDSPORE_CCSRC_FRONTEND_PARALLEL_OPS_INFO_CONCAT_INFO_H_ | |||
| #include <string> | |||
| #include <memory> | |||
| #include <unordered_map> | |||
| #include <vector> | |||
| #include "ir/value.h" | |||
| #include "frontend/parallel/auto_parallel/operator_costmodel.h" | |||
| #include "frontend/parallel/ops_info/operator_info.h" | |||
| #include "frontend/parallel/strategy.h" | |||
| namespace mindspore { | |||
| namespace parallel { | |||
| class ConcatInfo : public OperatorInfo { | |||
| public: | |||
| ConcatInfo(const std::string &operator_name, const Shapes &inputs_shape, const Shapes &outputs_shape, | |||
| const PrimitiveAttrs &attrs) | |||
| : OperatorInfo(operator_name, inputs_shape, outputs_shape, attrs, std::make_shared<ConcatCost>(false)) {} | |||
| ~ConcatInfo() override = default; | |||
| Status Init(const StrategyPtr &strategy) override; | |||
| Status InitForCostModel(const StrategyPtr &strategy) override; | |||
| Status GenerateStrategies(int32_t) override; | |||
| Status SetCostUnderStrategy(const StrategyPtr &) override; | |||
| void ReComputeBatchSplitFlagList() override; | |||
| protected: | |||
| Status GetAttrs() override; | |||
| Status CheckStrategy(const StrategyPtr &strategy) override; | |||
| Status InferMirrorOps() override; | |||
| Status InferForwardCommunication() override { return SUCCESS; } | |||
| Status InferTensorInfo() override; | |||
| Status InferDevMatrixShape() override; | |||
| Status InferTensorMap() override; | |||
| private: | |||
| size_t axis_ = 0; | |||
| }; | |||
| using ConcatInfoPtr = std::shared_ptr<ConcatInfo>; | |||
| } // namespace parallel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_FRONTEND_PARALLEL_OPS_INFO_CONCAT_INFO_H_ | |||
| @@ -39,5 +39,6 @@ | |||
| #include "frontend/parallel/ops_info/gather_v2_p_info.h" | |||
| #include "frontend/parallel/ops_info/tile_info.h" | |||
| #include "frontend/parallel/ops_info/strided_slice_info.h" | |||
| #include "frontend/parallel/ops_info/concat_info.h" | |||
| #endif // MINDSPORE_CCSRC_FRONTEND_PARALLEL_OPS_INFO_HEAD_FILES_H_ | |||
| @@ -56,9 +56,11 @@ constexpr char kMomentum[] = "momentum"; | |||
| constexpr char kApplyMomentum[] = "ApplyMomentum"; | |||
| constexpr char kSparseAdam[] = "Adam"; | |||
| constexpr char kSparseLazyAdam[] = "LazyAdam"; | |||
| constexpr char kSparseFtrl[] = "Ftrl"; | |||
| constexpr char kApplyMomentumOp[] = "Momentum"; | |||
| constexpr char kSparseAdamOp[] = "Adam"; | |||
| constexpr char kSparseLazyAdamOp[] = "LazyAdam"; | |||
| constexpr char kSparseFtrlOp[] = "FTRL"; | |||
| constexpr int kInitWeightsCmd = 10; | |||
| @@ -126,6 +126,15 @@ MomentumOptimInfo::MomentumOptimInfo(const AddressPtr &weight, const AddressPtr | |||
| inputs_.push_back(momentum); | |||
| } | |||
| void MomentumOptimInfo::Update(const Values &values, const Lengths &lens) { | |||
| size_t lr_offset = 0; | |||
| float *lr = values.data() + lr_offset; | |||
| auto ret = memcpy_s(inputs_[2]->addr, sizeof(float), lr, sizeof(float)); | |||
| if (ret != 0) { | |||
| MS_LOG(EXCEPTION) << "memcpy_s error, errorno(" << ret << ")"; | |||
| } | |||
| } | |||
| const AddressPtr &MomentumOptimInfo::gradient() { return inputs_[3]; } | |||
| const AddressPtr &MomentumOptimInfo::indices() { return inputs_[3]; } | |||
| @@ -82,6 +82,7 @@ class MomentumOptimInfo : public DenseOptimInfo { | |||
| const AddressPtr &gradient, const AddressPtr &momentum); | |||
| ~MomentumOptimInfo() override = default; | |||
| void Update(const Values &values, const Lengths &lens) override; | |||
| const AddressPtr &gradient(); | |||
| const AddressPtr &indices(); | |||
| size_t grad_index() override; | |||
| @@ -42,6 +42,7 @@ | |||
| #include "backend/kernel_compiler/kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| #include "backend/kernel_compiler/cpu/ps/pserver_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/ps/sparse_apply_adam_ps_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/ps/sparse_apply_lazy_adam_ps_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/ps/sparse_apply_ftrl_ps_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/ps/apply_momentum_ps_kernel.h" | |||
| @@ -374,6 +375,11 @@ void ParameterServer<T>::InitOptimInputsShape(const Keys &keys, const Values &va | |||
| const CNodePtr cnode = GetCNode(optim_op_name); | |||
| MS_EXCEPTION_IF_NULL(cnode); | |||
| if (optim_name == kSparseAdam) { | |||
| std::shared_ptr<PServerKernel> optimizer = | |||
| std::make_shared<kernel::ps::SparseApplyAdamPSKernel>(rank_id_, pserver_num_); | |||
| optimizer->InitKernel(cnode, optim_inputs_shape_[key]); | |||
| optimizers_[key] = optimizer; | |||
| } else if (optim_name == kSparseLazyAdam) { | |||
| std::shared_ptr<PServerKernel> optimizer = | |||
| std::make_shared<kernel::ps::SparseApplyLazyAdamPSKernel>(rank_id_, pserver_num_); | |||
| optimizer->InitKernel(cnode, optim_inputs_shape_[key]); | |||
| @@ -25,19 +25,22 @@ namespace ps { | |||
| std::unordered_map<std::string, int> Util::optimizer_to_ids{ | |||
| {kApplyMomentum, 0}, | |||
| {kSparseAdam, 1}, | |||
| {kSparseFtrl, 2}, | |||
| {kSparseLazyAdam, 2}, | |||
| {kSparseFtrl, 3}, | |||
| }; | |||
| std::unordered_map<int, std::string> Util::id_to_optimizers{ | |||
| {0, kApplyMomentum}, | |||
| {1, kSparseAdam}, | |||
| {2, kSparseFtrl}, | |||
| {2, kSparseLazyAdam}, | |||
| {3, kSparseFtrl}, | |||
| }; | |||
| std::unordered_map<int, std::string> Util::id_to_optimizer_nodes{ | |||
| {0, kApplyMomentumOp}, | |||
| {1, kSparseAdamOp}, | |||
| {2, kSparseFtrlOp}, | |||
| {2, kSparseLazyAdamOp}, | |||
| {3, kSparseFtrlOp}, | |||
| }; | |||
| bool Util::IsParamServerMode() { return IsRoleOfWorker() || IsRoleOfPServer() || IsRoleOfScheduler(); } | |||
| @@ -118,6 +118,9 @@ bool StepAutoParallel(const FuncGraphPtr &root, const opt::OptimizerPtr &) { | |||
| std::vector<bool> ExtractInputParameterByNode(const CNodePtr &node) { | |||
| std::vector<bool> is_parameter; | |||
| std::vector<AnfNodePtr> node_inputs{node->inputs()}; | |||
| if ((node_inputs.size() == 2) && AnfNodeIsPrimitive(node_inputs[1], MAKE_TUPLE)) { | |||
| node_inputs = node_inputs[1]->cast<CNodePtr>()->inputs(); | |||
| } | |||
| for (size_t i = 1; i < node_inputs.size(); ++i) { | |||
| auto input = node_inputs[i]; | |||
| @@ -192,6 +195,10 @@ std::vector<size_t> ExtractInputTypeLengthByNode(const CNodePtr &node) { | |||
| std::vector<size_t> inputs_type_len; | |||
| std::vector<AnfNodePtr> node_inputs{node->inputs()}; | |||
| if ((node_inputs.size() == 2) && AnfNodeIsPrimitive(node_inputs[1], MAKE_TUPLE)) { | |||
| node_inputs = node_inputs[1]->cast<CNodePtr>()->inputs(); | |||
| } | |||
| // extract input element length | |||
| for (auto &input : node_inputs) { | |||
| if (IsValueNode<RefKey>(input)) { | |||
| @@ -255,7 +262,7 @@ bool IsSplittableOperator(const std::string &op_name) { | |||
| FLOORDIV, L2_NORMALIZE, TENSOR_ADD, MAXPOOL, MAXPOOLV2, VIRTUAL_DATA_SET, RELU, ONEHOT, DROPOUT_DO_MASK, | |||
| REDUCE_MAX, REDUCE_MIN, ARGMAXWITHVALUE, ARGMINWITHVALUE, REDUCE_SUM, CONV2D, FUSE_BATCH_NORM, POOLING, | |||
| MAX_POOL_WITH_ARGMAX, SIMPLE_MEAN, FLATTEN, BATCH_NORM, LAYER_NORM, BIAS_ADD, ASSIGN_SUB, COS, ACOS, EXP, | |||
| LOG, REDUCE_MEAN, REAL_DIV, SIGMOID, POW, MAXIMUM, MINIMUM, EQUAL, NOT_EQUAL, LOGICALNOT, GATHERV2, SQRT, | |||
| LOG, REDUCE_MEAN, REAL_DIV, SIGMOID, POW, MAXIMUM, MINIMUM, EQUAL, NOT_EQUAL, LOGICALNOT, GATHERV2, SQRT, CONCAT, | |||
| STRIDEDSLICE, GET_NEXT, CAST, NEG, SQUARE, BATCH_MATMUL, EXPAND_DIMS, SQUEEZE, SPARSE_GATHERV2, TILE, DROPOUT, | |||
| SOFTMAX_CROSS_ENTROPY_WITH_LOGITS, SIGMOID_CROSS_ENTROPY_WITH_LOGITS, SPARSE_SOFTMAX_CROSS_ENTROPY_WITH_LOGITS}; | |||
| // clang-format on | |||
| @@ -275,7 +282,7 @@ bool IsAutoParallelCareNode(const CNodePtr &cnode) { | |||
| return false; | |||
| } | |||
| bool bool_result = IsParallelCareNode(cnode) && !IsSplittableOperator(prim->name()); | |||
| if (bool_result) { | |||
| if (bool_result && (prim->name() != MAKE_TUPLE)) { | |||
| MS_LOG(EXCEPTION) << "Should implementing OperatorInfo for: " << prim->name(); | |||
| } else if (prim->name() == CAST) { | |||
| if (cnode->fullname_with_scope().find(OPTIMIZER_SUB_STRING) != std::string::npos) { | |||
| @@ -520,6 +527,10 @@ Status ConstructCostGraphNodesByUniqueIdTC(const std::vector<AnfNodePtr> &all_no | |||
| MS_LOG(EXCEPTION) << "The OperatorInfo: " << current_op_ptr->name() | |||
| << " does not match the Prim: " << prim->name(); | |||
| } | |||
| // Needed by rec_parser | |||
| ModifyInputsTensorNameListIfOperatorInfoCreated(current_op_ptr->name(), cnode->UniqueId()); | |||
| cnode->set_user_data<OperatorInfo>(current_op_ptr); | |||
| MS_LOG(INFO) << "The CNode with UniqueId: " << cnode->UniqueId() | |||
| << " and UniqueIdThroughCopy: " << cnode->UniqueIdThroughCopy() | |||
| @@ -1117,6 +1128,27 @@ CNodePtr GetInternalOperatorInfo(const CNodePtr &cnode, const ValueNodePtr &prim | |||
| return nullptr; | |||
| } | |||
| void ModifyInputsTensorNameListIfOperatorInfoCreated(const std::string &name, const std::string &uniqueid) { | |||
| size_t iter_ops = 0; | |||
| for (auto op : entire_costgraph->GetOperators()) { | |||
| if (op->name() == name) { | |||
| break; | |||
| } | |||
| iter_ops = iter_ops + 1; | |||
| } | |||
| std::vector<std::vector<std::string>> input_tensor_names = entire_costgraph->get_inputs_tensor_name_list(); | |||
| for (size_t i = 0; i < input_tensor_names.size(); i++) { | |||
| for (size_t j = 0; j < input_tensor_names[i].size(); j++) { | |||
| if (input_tensor_names[i][j] == uniqueid) { | |||
| input_tensor_names[i][j] = input_tensor_names[iter_ops][0]; | |||
| } | |||
| } | |||
| } | |||
| entire_costgraph->set_inputs_tensor_name_list(input_tensor_names); | |||
| } | |||
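| // Sketch of the effect (values illustrative): if the operator just created for the CNode with | |||
| // UniqueId "123" is the k-th operator in the cost graph, every occurrence of "123" in the | |||
| // recorded input tensor name list is rewritten to the first tensor name of that k-th row, so | |||
| // the rec_parser resolves raw node ids to operator tensor names. | |||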
| Status ParallelStrategyRecSearch(const std::vector<AnfNodePtr> &all_nodes, const FuncGraphPtr &root) { | |||
| if (CostModelContext::GetInstance()->is_multi_subgraphs()) { | |||
| if (ConstructCostGraphNodesByUniqueIdTC(all_nodes, root) == SUCCESS) { | |||
| @@ -59,6 +59,8 @@ std::vector<std::vector<std::string>> RecInputTensorNames(const std::map<std::st | |||
| std::vector<std::vector<std::string>> input_tensor_names); | |||
| CNodePtr GetInternalOperatorInfo(const CNodePtr &cnode, const ValueNodePtr &prim_anf_node); | |||
| void ModifyInputsTensorNameListIfOperatorInfoCreated(const std::string &name, const std::string &uniqueid); | |||
| } // namespace parallel | |||
| } // namespace mindspore | |||
| #endif // PARALLEL_STEP_AUTO_PARALLEL_H_ | |||
| @@ -267,6 +267,33 @@ TensorLayout GetTensorInLayout(const CNodePtr &middle_node, const PrimitivePtr & | |||
| return tensorinfo_in.tensor_layout(); | |||
| } | |||
| bool AnfNodeIsPrimitive(const AnfNodePtr &anf_node, const std::string &prim_name) { | |||
| MS_EXCEPTION_IF_NULL(anf_node); | |||
| auto cnode = anf_node->cast<CNodePtr>(); | |||
| if ((cnode == nullptr) || !IsValueNode<Primitive>(cnode->input(0))) { | |||
| return false; | |||
| } | |||
| auto value_node = cnode->input(0)->cast<ValueNodePtr>(); | |||
| auto prim = GetValueNode<PrimitivePtr>(value_node); | |||
| MS_EXCEPTION_IF_NULL(prim); | |||
| if (prim->name() == prim_name) { | |||
| return true; | |||
| } | |||
| return false; | |||
| } | |||
| std::string GetPrimName(const CNodePtr &node) { | |||
| MS_EXCEPTION_IF_NULL(node); | |||
| if (!IsValueNode<Primitive>(node->input(0))) { | |||
| MS_LOG(EXCEPTION) << "The node is not a primitive"; | |||
| } | |||
| auto value_node = node->input(0)->cast<ValueNodePtr>(); | |||
| auto prim = GetValueNode<PrimitivePtr>(value_node); | |||
| MS_EXCEPTION_IF_NULL(prim); | |||
| return prim->name(); | |||
| } | |||
| OperatorInfoPtr GetDistributeOperator(const CNodePtr &node) { | |||
| MS_EXCEPTION_IF_NULL(node); | |||
| if (!IsParallelCareNode(node)) { | |||
| @@ -274,7 +301,7 @@ OperatorInfoPtr GetDistributeOperator(const CNodePtr &node) { | |||
| } | |||
| OperatorInfoPtr distribute_operator = node->user_data<OperatorInfo>(); | |||
| if (distribute_operator == nullptr) { | |||
| MS_LOG(EXCEPTION) << "GetDistributeOperator:distribute_operator is nullptr"; | |||
| MS_LOG(EXCEPTION) << "Distribute operator is nullptr, the prim is " << GetPrimName(node); | |||
| } | |||
| return distribute_operator; | |||
| } | |||
| @@ -423,6 +450,11 @@ void StepRedistribution(const CNodePtr &node, const OperatorInfoPtr &distribute_ | |||
| MS_EXCEPTION_IF_NULL(manager); | |||
| AnfNodeIndexSet node_set = manager->node_users()[node]; | |||
| CNodePtr insert_node_new; | |||
| if (AnfNodeIsPrimitive(node, MAKE_TUPLE)) { | |||
| MS_LOG(INFO) << "No need to insert redistribution op betweend make_tuple node and the next node"; | |||
| return; | |||
| } | |||
| if (IsValueNode<Primitive>(node->input(0))) { | |||
| auto current_value = node->input(0)->cast<ValueNodePtr>(); | |||
| MS_EXCEPTION_IF_NULL(current_value); | |||
| @@ -875,9 +907,15 @@ void InsertMirrorOps(const MirrorOps &mirror_ops, const CNodePtr &node) { | |||
| MS_EXCEPTION_IF_NULL(func_graph); | |||
| FuncGraphManagerPtr manager = func_graph->manager(); | |||
| MS_EXCEPTION_IF_NULL(manager); | |||
| if ((node->inputs().size() == 2) && AnfNodeIsPrimitive(node->input(1), MAKE_TUPLE)) { | |||
| MS_LOG(INFO) << "The mirror for " << GetPrimName(node) << " has handle by make_tuple node"; | |||
| return; | |||
| } | |||
| if (mirror_ops.size() != node_size - 1) { | |||
| MS_LOG(EXCEPTION) << "Failure:Mirrorops's size is wrong! mirror_ops size is " << mirror_ops.size() | |||
| << ", node_size is " << node_size; | |||
| MS_LOG(EXCEPTION) << "Mirrorops's size is wrong! mirror_ops size is " << mirror_ops.size() << ", node_size is " | |||
| << node_size - 1; | |||
| } | |||
| for (size_t index = 1; index < node_size; ++index) { | |||
| OperatorVector backward_op = mirror_ops[index - 1]; | |||
| @@ -993,7 +1031,7 @@ OperatorInfoPtr OperatorInstance(const PrimitivePtr &prim, const PrimitiveAttrs | |||
| const std::vector<Shapes> &shape_list) { | |||
| MS_EXCEPTION_IF_NULL(prim); | |||
| OperatorInfoPtr operator_ = OperatorInstanceByName(prim->name(), attrs, shape_list); | |||
| if (operator_ == nullptr) { | |||
| if ((operator_ == nullptr) && (prim->name() != MAKE_TUPLE)) { | |||
| MS_LOG(INFO) << "Creat " << prim->name() << " failed, use batch parallel"; | |||
| operator_ = OperatorInstanceByName(BATCH_PARALLEL, attrs, shape_list); | |||
| MS_EXCEPTION_IF_NULL(operator_); | |||
| @@ -1177,7 +1215,12 @@ std::vector<Shapes> ExtractShape(const CNodePtr &node) { | |||
| continue; | |||
| } | |||
| if (input_shapes.size() != 1) { | |||
| MS_LOG(EXCEPTION) << "ExtractShape:Get input shape failed"; | |||
| if (inputs_size == 2) {  // e.g. Concat, whose single input is a tuple of tensors | |||
| shape_inputs = input_shapes; | |||
| break; | |||
| } else { | |||
| MS_LOG(EXCEPTION) << "ExtractShape: Get input shape failed"; | |||
| } | |||
| } | |||
| shape_inputs.push_back(input_shapes[0]); | |||
| } | |||
| @@ -1269,8 +1312,8 @@ void SetParallelShape(const AnfNodePtr ¶meter, const std::pair<AnfNodePtr, i | |||
| } | |||
| TensorInfo tensorinfo_in = distribute_operator->inputs_tensor_info()[IntToSize(res.second - 1)]; | |||
| Shape slice_shape = tensorinfo_in.slice_shape(); | |||
| MS_LOG(DEBUG) << "SetParallelShape slice_shape " << parameter->ToString() << " shape " | |||
| << MakeValue(slice_shape)->ToString(); | |||
| MS_LOG(INFO) << "SetParallelShape slice_shape " << parameter->ToString() << " shape " | |||
| << MakeValue(slice_shape)->ToString() << ", op name is " << distribute_operator->name(); | |||
| std::shared_ptr<abstract::BaseShape> parallel_shape = std::make_shared<abstract::Shape>(slice_shape); | |||
| MS_EXCEPTION_IF_NULL(parallel_shape); | |||
| // Don't modify it in-place as the pointer of this AbstractValue may used as cache key in StaticAnalysis. | |||
| @@ -1450,6 +1493,9 @@ void ExtractInformation(const std::vector<AnfNodePtr> &all_nodes) { | |||
| SetVirtualDatasetStrategy(cnode); | |||
| ValueNodePtr prim_anf_node = cnode->input(0)->cast<ValueNodePtr>(); | |||
| PrimitivePtr prim = GetValueNode<PrimitivePtr>(prim_anf_node); | |||
| if (prim->name() == MAKE_TUPLE) { | |||
| continue; | |||
| } | |||
| auto attrs = prim->attrs(); | |||
| MS_LOG(INFO) << "extract information: node: " << node->ToString() << " prim " << prim->name(); | |||
| if (IsParallelCareNode(cnode)) { | |||
| @@ -2045,13 +2091,13 @@ void ParallelCommunication(const FuncGraphPtr &root, const std::vector<AnfNodePt | |||
| MS_EXCEPTION_IF_NULL(node); | |||
| if (node->isa<CNode>()) { | |||
| auto cnode = node->cast<CNodePtr>(); | |||
| if (!IsValueNode<Primitive>(cnode->input(0))) { | |||
| // the make_tuple is a parallel care node, but it may not have operator info | |||
| if (!IsParallelCareNode(cnode) || !cnode->has_user_data<OperatorInfo>()) { | |||
| continue; | |||
| } | |||
| OperatorInfoPtr distribute_operator = GetDistributeOperator(cnode); | |||
| if (distribute_operator == nullptr) { | |||
| continue; | |||
| } | |||
| MS_EXCEPTION_IF_NULL(distribute_operator); | |||
| // insert forward ops | |||
| InsertForwardOps(distribute_operator, cnode); | |||
| @@ -2074,13 +2120,12 @@ void ParallelCommunication(const FuncGraphPtr &root, const std::vector<AnfNodePt | |||
| MS_EXCEPTION_IF_NULL(node); | |||
| if (node->isa<CNode>()) { | |||
| auto cnode = node->cast<CNodePtr>(); | |||
| if (!IsValueNode<Primitive>(cnode->input(0))) { | |||
| if (!IsParallelCareNode(cnode) || !cnode->has_user_data<OperatorInfo>()) { | |||
| continue; | |||
| } | |||
| OperatorInfoPtr distribute_operator = GetDistributeOperator(cnode); | |||
| if (distribute_operator == nullptr) { | |||
| continue; | |||
| } | |||
| MS_EXCEPTION_IF_NULL(distribute_operator); | |||
| // StepReplace | |||
| StepReplace(distribute_operator, cnode); | |||
| } | |||
| @@ -2330,6 +2375,44 @@ Status ParallelInit() { | |||
| return SUCCESS; | |||
| } | |||
| void HandleForwardMakeTuple(const std::vector<AnfNodePtr> &all_nodes) { | |||
| for (auto &node : all_nodes) { | |||
| if (!AnfNodeIsPrimitive(node, MAKE_TUPLE)) { | |||
| continue; | |||
| } | |||
| auto cnode = node->cast<CNodePtr>(); | |||
| MS_EXCEPTION_IF_NULL(cnode); | |||
| if (!cnode->in_forward_flag()) { | |||
| continue; | |||
| } | |||
| FuncGraphManagerPtr manager = cnode->func_graph()->manager(); | |||
| MS_EXCEPTION_IF_NULL(manager); | |||
| auto make_tuple_user = manager->node_users()[cnode]; | |||
| if (make_tuple_user.size() != 1) { | |||
| MS_LOG(EXCEPTION) << "Now the make_tuple's user must be 1, but got " << make_tuple_user.size(); | |||
| } | |||
| CNodePtr make_tuple_next_cnode = make_tuple_user.pop().first->cast<CNodePtr>(); | |||
| MS_EXCEPTION_IF_NULL(make_tuple_next_cnode); | |||
| std::string make_tuple_user_prim_name = GetPrimName(make_tuple_next_cnode); | |||
| if (!IsParallelCareNode(make_tuple_next_cnode)) { | |||
| MS_LOG(INFO) << "The make_tuple's user is " << make_tuple_user_prim_name << ", no need to set operator info"; | |||
| continue; | |||
| } | |||
| if (make_tuple_next_cnode->inputs().size() != 2) { | |||
| MS_LOG(EXCEPTION) << "Now the make_tuple's user only support 1 input, but got " | |||
| << make_tuple_next_cnode->inputs().size() - 1; | |||
| } | |||
| MS_LOG(INFO) << "Set the make_tuple's operator info, and the op name is " << make_tuple_user_prim_name; | |||
| OperatorInfoPtr op_info = GetDistributeOperator(make_tuple_next_cnode); | |||
| MS_EXCEPTION_IF_NULL(op_info); | |||
| cnode->set_user_data<OperatorInfo>(op_info); | |||
| } | |||
| } | |||
| bool StepParallel(const FuncGraphPtr &root, const opt::OptimizerPtr &optimizer) { | |||
| MS_EXCEPTION_IF_NULL(root); | |||
| MS_EXCEPTION_IF_NULL(optimizer); | |||
| @@ -2383,6 +2466,9 @@ bool StepParallel(const FuncGraphPtr &root, const opt::OptimizerPtr &optimizer) | |||
| ExtractInformation(all_nodes); | |||
| ReshapeInit(all_nodes); | |||
| } | |||
| HandleForwardMakeTuple(all_nodes); | |||
| // save strategy as checkpoint for multi-train | |||
| if (StrategyCheckpoint::GetInstance().SaveCheckPointOn()) { | |||
| CheckpointStrategy(root); | |||
| @@ -149,6 +149,8 @@ Status ParallelInit(); | |||
| std::vector<std::string> ExtractInputsTensorName(const CNodePtr &node); | |||
| std::set<FuncGraphPtr> ForwardGraph(const FuncGraphPtr &root); | |||
| bool AnfNodeIsPrimitive(const AnfNodePtr &anf_node, const std::string &prim_name); | |||
| } // namespace parallel | |||
| } // namespace mindspore | |||
| @@ -1,6 +0,0 @@ | |||
| file(GLOB_RECURSE MS_GVAR_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} *.cc) | |||
| set_property(SOURCE ${MS_GVAR_SRC_LIST} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_COMMON) | |||
| add_library(mindspore_gvar SHARED ${MS_GVAR_SRC_LIST}) | |||
| if (APPLE) | |||
| set_target_properties(mindspore_gvar PROPERTIES MACOSX_RPATH ON) | |||
| endif () | |||
| @@ -62,12 +62,15 @@ add_subdirectory(text) | |||
| add_dependencies(utils core) | |||
| add_dependencies(kernels-image core) | |||
| add_dependencies(kernels-data core) | |||
| add_dependencies(kernels-soft-dvpp-image core soft-dvpp-utils) | |||
| add_dependencies(kernels core) | |||
| add_dependencies(engine-datasetops-source core) | |||
| add_dependencies(engine-datasetops-source-sampler core) | |||
| add_dependencies(engine-datasetops core) | |||
| add_dependencies(engine-datasetops-mapop core) | |||
| add_dependencies(engine-opt core) | |||
| add_dependencies(engine-cache-client core) | |||
| add_dependencies(engine-cache-server core) | |||
| add_dependencies(engine-perf core) | |||
| add_dependencies(engine-gnn core) | |||
| add_dependencies(engine core) | |||
| @@ -88,6 +91,8 @@ set(submodules | |||
| $<TARGET_OBJECTS:kernels-image> | |||
| $<TARGET_OBJECTS:kernels-data> | |||
| $<TARGET_OBJECTS:cpp-API> | |||
| $<TARGET_OBJECTS:kernels-soft-dvpp-image> | |||
| $<TARGET_OBJECTS:soft-dvpp-utils> | |||
| $<TARGET_OBJECTS:engine-datasetops-source> | |||
| $<TARGET_OBJECTS:engine-datasetops-source-sampler> | |||
| $<TARGET_OBJECTS:engine-datasetops-mapop> | |||
| @@ -126,7 +131,7 @@ endif() | |||
| ###################################################################### | |||
| ################# Link with external libraries ######################## | |||
| target_link_libraries(_c_dataengine PRIVATE mindspore mindspore_gvar) | |||
| target_link_libraries(_c_dataengine PRIVATE mindspore) | |||
| if (${CMAKE_SYSTEM_NAME} MATCHES "Windows") | |||
| if (ENABLE_PYTHON) | |||
| target_link_libraries(_c_dataengine PRIVATE mindspore::pybind11_module ${PYTHON_LIBRARIES} mindspore::protobuf ${SECUREC_LIBRARY}) | |||
| @@ -141,7 +146,7 @@ else() | |||
| target_link_libraries(_c_dataengine PRIVATE -ldl mindspore::protobuf ${SECUREC_LIBRARY}) | |||
| endif() | |||
| endif() | |||
| target_link_libraries(_c_dataengine PUBLIC mindspore::jpeg_turbo mindspore::opencv_core mindspore::opencv_imgcodecs | |||
| target_link_libraries(_c_dataengine PUBLIC mindspore::jpeg_turbo mindspore::turbojpeg mindspore::opencv_core mindspore::opencv_imgcodecs | |||
| mindspore::opencv_imgproc mindspore::tinyxml2 mindspore::sentencepiece mindspore::sentencepiece_train ${ICU_LIB}) | |||
| if (ENABLE_GPUQUE) | |||
| target_link_libraries(_c_dataengine PRIVATE gpu_queue | |||
| @@ -61,11 +61,19 @@ namespace api { | |||
| } while (false) | |||
| // Function to create the iterator, which will build and launch the execution tree. | |||
| std::shared_ptr<Iterator> Dataset::CreateIterator() { | |||
| std::shared_ptr<Iterator> Dataset::CreateIterator(std::vector<std::string> columns) { | |||
| std::shared_ptr<Iterator> iter; | |||
| try { | |||
| auto ds = shared_from_this(); | |||
| // The specified columns will be selected from the dataset and passed down the pipeline | |||
| // in the order specified; other columns will be discarded. | |||
| if (!columns.empty()) { | |||
| ds = ds->Project(columns); | |||
| } | |||
| iter = std::make_shared<Iterator>(); | |||
| Status rc = iter->BuildAndLaunchTree(shared_from_this()); | |||
| Status rc = iter->BuildAndLaunchTree(ds); | |||
| if (rc.IsError()) { | |||
| MS_LOG(ERROR) << "CreateIterator failed." << rc; | |||
| return nullptr; | |||
| @@ -629,13 +637,13 @@ bool VOCDataset::ValidateParams() { | |||
| } | |||
| Path imagesets_file = dir / "ImageSets" / "Segmentation" / mode_ + ".txt"; | |||
| if (!imagesets_file.Exists()) { | |||
| MS_LOG(ERROR) << "[Segmentation] imagesets_file is invalid or not exist"; | |||
| MS_LOG(ERROR) << "Invalid mode: " << mode_ << ", file \"" << imagesets_file << "\" is not exists!"; | |||
| return false; | |||
| } | |||
| } else if (task_ == "Detection") { | |||
| Path imagesets_file = dir / "ImageSets" / "Main" / mode_ + ".txt"; | |||
| if (!imagesets_file.Exists()) { | |||
| MS_LOG(ERROR) << "[Detection] imagesets_file is invalid or not exist."; | |||
| MS_LOG(ERROR) << "Invalid mode: " << mode_ << ", file \"" << imagesets_file << "\" is not exists!"; | |||
| return false; | |||
| } | |||
| } else { | |||
| @@ -655,18 +663,33 @@ std::vector<std::shared_ptr<DatasetOp>> VOCDataset::Build() { | |||
| sampler_ = CreateDefaultSampler(); | |||
| } | |||
| std::shared_ptr<VOCOp::Builder> builder = std::make_shared<VOCOp::Builder>(); | |||
| (void)builder->SetDir(dataset_dir_); | |||
| (void)builder->SetTask(task_); | |||
| (void)builder->SetMode(mode_); | |||
| (void)builder->SetNumWorkers(num_workers_); | |||
| (void)builder->SetSampler(std::move(sampler_->Build())); | |||
| (void)builder->SetDecode(decode_); | |||
| (void)builder->SetClassIndex(class_index_); | |||
| auto schema = std::make_unique<DataSchema>(); | |||
| VOCOp::TaskType task_type_; | |||
| std::shared_ptr<VOCOp> op; | |||
| RETURN_EMPTY_IF_ERROR(builder->Build(&op)); | |||
| node_ops.push_back(op); | |||
| if (task_ == "Segmentation") { | |||
| task_type_ = VOCOp::TaskType::Segmentation; | |||
| RETURN_EMPTY_IF_ERROR(schema->AddColumn( | |||
| ColDescriptor(std::string(kColumnImage), DataType(DataType::DE_UINT8), TensorImpl::kFlexible, 1))); | |||
| RETURN_EMPTY_IF_ERROR(schema->AddColumn( | |||
| ColDescriptor(std::string(kColumnTarget), DataType(DataType::DE_UINT8), TensorImpl::kFlexible, 1))); | |||
| } else if (task_ == "Detection") { | |||
| task_type_ = VOCOp::TaskType::Detection; | |||
| RETURN_EMPTY_IF_ERROR(schema->AddColumn( | |||
| ColDescriptor(std::string(kColumnImage), DataType(DataType::DE_UINT8), TensorImpl::kFlexible, 1))); | |||
| RETURN_EMPTY_IF_ERROR(schema->AddColumn( | |||
| ColDescriptor(std::string(kColumnBbox), DataType(DataType::DE_FLOAT32), TensorImpl::kFlexible, 1))); | |||
| RETURN_EMPTY_IF_ERROR(schema->AddColumn( | |||
| ColDescriptor(std::string(kColumnLabel), DataType(DataType::DE_UINT32), TensorImpl::kFlexible, 1))); | |||
| RETURN_EMPTY_IF_ERROR(schema->AddColumn( | |||
| ColDescriptor(std::string(kColumnDifficult), DataType(DataType::DE_UINT32), TensorImpl::kFlexible, 1))); | |||
| RETURN_EMPTY_IF_ERROR(schema->AddColumn( | |||
| ColDescriptor(std::string(kColumnTruncate), DataType(DataType::DE_UINT32), TensorImpl::kFlexible, 1))); | |||
| } | |||
| std::shared_ptr<VOCOp> voc_op; | |||
| voc_op = std::make_shared<VOCOp>(task_type_, mode_, dataset_dir_, class_index_, num_workers_, rows_per_buffer_, | |||
| connector_que_size_, decode_, std::move(schema), std::move(sampler_->Build())); | |||
| node_ops.push_back(voc_op); | |||
| return node_ops; | |||
| } | |||
| @@ -30,6 +30,19 @@ void Iterator::GetNextRow(TensorMap *row) { | |||
| } | |||
| } | |||
| // Get the next row from the data pipeline. | |||
| void Iterator::GetNextRow(TensorVec *row) { | |||
| TensorRow tensor_row; | |||
| Status rc = iterator_->FetchNextTensorRow(&tensor_row); | |||
| if (rc.IsError()) { | |||
| MS_LOG(ERROR) << "GetNextRow: Failed to get next row."; | |||
| row->clear(); | |||
| } | |||
| // Generate a vector to return | |||
| row->clear(); | |||
| std::copy(tensor_row.begin(), tensor_row.end(), std::back_inserter(*row)); | |||
| } | |||
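| // Usage sketch (not part of the patch; `ds`, the column names, and the Tensor type are assumed | |||
| // from the caller's pipeline): combining the column-selecting CreateIterator overload with the | |||
| // TensorVec overload of GetNextRow. | |||
| //   std::shared_ptr<Iterator> iter = ds->CreateIterator({"image", "label"}); | |||
| //   std::vector<std::shared_ptr<Tensor>> row;  // TensorVec | |||
| //   iter->GetNextRow(&row); | |||
| //   while (!row.empty()) { | |||
| //     // row[0] is "image", row[1] is "label", in the order selected above | |||
| //     iter->GetNextRow(&row); | |||
| //   } | |||
| //   iter->Stop(); | |||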
| // Shut down the data pipeline. | |||
| void Iterator::Stop() { | |||
| // Releasing the iterator_ unique_ptr. This should trigger the destructor of iterator_. | |||
| @@ -61,13 +74,20 @@ Status Iterator::BuildAndLaunchTree(std::shared_ptr<Dataset> ds) { | |||
| RETURN_STATUS_UNEXPECTED("Node operation returned nothing"); | |||
| } | |||
| auto root_op = root_ops.front(); | |||
| RETURN_UNEXPECTED_IF_NULL(root_op); | |||
| RETURN_IF_NOT_OK(tree_->AssociateNode(root_op)); | |||
| q.push(std::make_pair(ds, root_op)); | |||
| // Iterate through all the DatasetOps returned by Dataset's Build(), associate them | |||
| // with the execution tree and add the child and parent relationship between the nodes | |||
| // Note that some Dataset objects might return more than one DatasetOps | |||
| // e.g. MapDataset will return [ProjectOp, MapOp] if project_columns is set for MapDataset | |||
| std::shared_ptr<DatasetOp> prev_op = nullptr; | |||
| for (auto op : root_ops) { | |||
| RETURN_IF_NOT_OK(tree_->AssociateNode(op)); | |||
| if (prev_op != nullptr) { | |||
| RETURN_IF_NOT_OK(prev_op->AddChild(op)); | |||
| } | |||
| prev_op = op; | |||
| } | |||
| // Add the last DatasetOp to the queue for BFS traversal. | |||
| q.push(std::make_pair(ds, root_ops.back())); | |||
| // Traverse down to the children and convert them to the corresponding DatasetOps (i.e. execution tree nodes) | |||
| while (!q.empty()) { | |||
| @@ -94,7 +114,7 @@ Status Iterator::BuildAndLaunchTree(std::shared_ptr<Dataset> ds) { | |||
| q.push(std::make_pair(child, child_ops.back())); | |||
| } | |||
| } | |||
| RETURN_IF_NOT_OK(tree_->AssignRoot(root_op)); | |||
| RETURN_IF_NOT_OK(tree_->AssignRoot(root_ops.front())); | |||
| } | |||
| // Launch the execution tree. | |||
| @@ -28,8 +28,10 @@ | |||
| #include "minddata/dataset/kernels/image/hwc_to_chw_op.h" | |||
| #include "minddata/dataset/kernels/image/image_utils.h" | |||
| #include "minddata/dataset/kernels/image/invert_op.h" | |||
| #include "minddata/dataset/kernels/image/mixup_batch_op.h" | |||
| #include "minddata/dataset/kernels/image/normalize_op.h" | |||
| #include "minddata/dataset/kernels/image/pad_op.h" | |||
| #include "minddata/dataset/kernels/image/random_affine_op.h" | |||
| #include "minddata/dataset/kernels/image/random_color_adjust_op.h" | |||
| #include "minddata/dataset/kernels/image/random_crop_and_resize_op.h" | |||
| #include "minddata/dataset/kernels/image/random_crop_and_resize_with_bbox_op.h" | |||
| @@ -48,6 +50,8 @@ | |||
| #include "minddata/dataset/kernels/image/resize_bilinear_op.h" | |||
| #include "minddata/dataset/kernels/image/resize_op.h" | |||
| #include "minddata/dataset/kernels/image/resize_with_bbox_op.h" | |||
| #include "minddata/dataset/kernels/image/soft_dvpp/soft_dvpp_decode_random_crop_resize_jpeg_op.h" | |||
| #include "minddata/dataset/kernels/image/soft_dvpp/soft_dvpp_decode_resize_jpeg_op.h" | |||
| #include "minddata/dataset/kernels/image/uniform_aug_op.h" | |||
| namespace mindspore { | |||
| @@ -92,6 +96,12 @@ PYBIND_REGISTER(CenterCropOp, 1, ([](const py::module *m) { | |||
| .def(py::init<int32_t, int32_t>(), py::arg("height"), py::arg("width") = CenterCropOp::kDefWidth); | |||
| })); | |||
| PYBIND_REGISTER(MixUpBatchOp, 1, ([](const py::module *m) { | |||
| (void)py::class_<MixUpBatchOp, TensorOp, std::shared_ptr<MixUpBatchOp>>( | |||
| *m, "MixUpBatchOp", "Tensor operation to mixup a batch of images") | |||
| .def(py::init<float>(), py::arg("alpha")); | |||
| })); | |||
| PYBIND_REGISTER(ResizeOp, 1, ([](const py::module *m) { | |||
| (void)py::class_<ResizeOp, TensorOp, std::shared_ptr<ResizeOp>>( | |||
| *m, "ResizeOp", "Tensor operation to resize an image. Takes height, width and mode") | |||
| @@ -108,6 +118,19 @@ PYBIND_REGISTER(ResizeWithBBoxOp, 1, ([](const py::module *m) { | |||
| py::arg("interpolation") = ResizeWithBBoxOp::kDefInterpolation); | |||
| })); | |||
| PYBIND_REGISTER(RandomAffineOp, 1, ([](const py::module *m) { | |||
| (void)py::class_<RandomAffineOp, TensorOp, std::shared_ptr<RandomAffineOp>>( | |||
| *m, "RandomAffineOp", "Tensor operation to apply random affine transformations on an image.") | |||
| .def(py::init<std::vector<float_t>, std::vector<float_t>, std::vector<float_t>, | |||
| std::vector<float_t>, InterpolationMode, std::vector<uint8_t>>(), | |||
| py::arg("degrees") = RandomAffineOp::kDegreesRange, | |||
| py::arg("translate_range") = RandomAffineOp::kTranslationPercentages, | |||
| py::arg("scale_range") = RandomAffineOp::kScaleRange, | |||
| py::arg("shear_ranges") = RandomAffineOp::kShearRanges, | |||
| py::arg("interpolation") = RandomAffineOp::kDefInterpolation, | |||
| py::arg("fill_value") = RandomAffineOp::kFillValue); | |||
| })); | |||
| PYBIND_REGISTER( | |||
| RandomResizeWithBBoxOp, 1, ([](const py::module *m) { | |||
| (void)py::class_<RandomResizeWithBBoxOp, TensorOp, std::shared_ptr<RandomResizeWithBBoxOp>>( | |||
| @@ -341,6 +364,24 @@ PYBIND_REGISTER(RandomSelectSubpolicyOp, 1, ([](const py::module *m) { | |||
| return std::make_shared<RandomSelectSubpolicyOp>(cpp_policy); | |||
| })); | |||
| })); | |||
| PYBIND_REGISTER(SoftDvppDecodeResizeJpegOp, 1, ([](const py::module *m) { | |||
| (void)py::class_<SoftDvppDecodeResizeJpegOp, TensorOp, std::shared_ptr<SoftDvppDecodeResizeJpegOp>>( | |||
| *m, "SoftDvppDecodeResizeJpegOp", "TensorOp to use soft dvpp decode and resize jpeg image.") | |||
| .def(py::init<int32_t, int32_t>(), py::arg("targetHeight"), py::arg("targetWidth")); | |||
| })); | |||
| PYBIND_REGISTER( | |||
| SoftDvppDecodeRandomCropResizeJpegOp, 1, ([](const py::module *m) { | |||
| (void) | |||
| py::class_<SoftDvppDecodeRandomCropResizeJpegOp, TensorOp, std::shared_ptr<SoftDvppDecodeRandomCropResizeJpegOp>>( | |||
| *m, "SoftDvppDecodeRandomCropResizeJpegOp", | |||
| "TensorOp to use soft dvpp decode, random crop and resize jepg image.") | |||
| .def(py::init<int32_t, int32_t, float, float, float, float, int32_t>(), py::arg("targetHeight"), | |||
| py::arg("targetWidth"), py::arg("scaleLb") = RandomCropDecodeResizeOp::kDefScaleLb, | |||
| py::arg("scaleUb") = RandomCropDecodeResizeOp::kDefScaleUb, | |||
| py::arg("aspectLb") = RandomCropDecodeResizeOp::kDefAspectLb, | |||
| py::arg("aspectUb") = RandomCropDecodeResizeOp::kDefAspectUb, | |||
| py::arg("maxIter") = RandomCropDecodeResizeOp::kDefMaxIter); | |||
| })); | |||
| } // namespace dataset | |||
| } // namespace mindspore | |||
| @@ -48,12 +48,12 @@ PYBIND_REGISTER( | |||
| ShardPkSample, 1, ([](const py::module *m) { | |||
| (void)py::class_<mindrecord::ShardPkSample, mindrecord::ShardOperator, std::shared_ptr<mindrecord::ShardPkSample>>( | |||
| *m, "MindrecordPkSampler") | |||
| .def(py::init([](int64_t kVal, std::string kColumn, bool shuffle) { | |||
| .def(py::init([](int64_t kVal, std::string kColumn, bool shuffle, int64_t num_samples) { | |||
| if (shuffle == true) { | |||
| return std::make_shared<mindrecord::ShardPkSample>(kColumn, kVal, std::numeric_limits<int64_t>::max(), | |||
| GetSeed()); | |||
| GetSeed(), num_samples); | |||
| } else { | |||
| return std::make_shared<mindrecord::ShardPkSample>(kColumn, kVal); | |||
| return std::make_shared<mindrecord::ShardPkSample>(kColumn, kVal, num_samples); | |||
| } | |||
| })); | |||
| })); | |||
| @@ -21,8 +21,12 @@ | |||
| #include "minddata/dataset/kernels/image/crop_op.h" | |||
| #include "minddata/dataset/kernels/image/cut_out_op.h" | |||
| #include "minddata/dataset/kernels/image/decode_op.h" | |||
| #include "minddata/dataset/kernels/image/hwc_to_chw_op.h" | |||
| #include "minddata/dataset/kernels/image/mixup_batch_op.h" | |||
| #include "minddata/dataset/kernels/image/normalize_op.h" | |||
| #include "minddata/dataset/kernels/data/one_hot_op.h" | |||
| #include "minddata/dataset/kernels/image/pad_op.h" | |||
| #include "minddata/dataset/kernels/image/random_affine_op.h" | |||
| #include "minddata/dataset/kernels/image/random_color_adjust_op.h" | |||
| #include "minddata/dataset/kernels/image/random_crop_op.h" | |||
| #include "minddata/dataset/kernels/image/random_horizontal_flip_op.h" | |||
| @@ -81,6 +85,26 @@ std::shared_ptr<DecodeOperation> Decode(bool rgb) { | |||
| return op; | |||
| } | |||
| // Function to create HwcToChwOperation. | |||
| std::shared_ptr<HwcToChwOperation> HWC2CHW() { | |||
| auto op = std::make_shared<HwcToChwOperation>(); | |||
| // Input validation | |||
| if (!op->ValidateParams()) { | |||
| return nullptr; | |||
| } | |||
| return op; | |||
| } | |||
| // Function to create MixUpBatchOperation. | |||
| std::shared_ptr<MixUpBatchOperation> MixUpBatch(float alpha) { | |||
| auto op = std::make_shared<MixUpBatchOperation>(alpha); | |||
| // Input validation | |||
| if (!op->ValidateParams()) { | |||
| return nullptr; | |||
| } | |||
| return op; | |||
| } | |||
| // Function to create NormalizeOperation. | |||
| std::shared_ptr<NormalizeOperation> Normalize(std::vector<float> mean, std::vector<float> std) { | |||
| auto op = std::make_shared<NormalizeOperation>(mean, std); | |||
| @@ -91,6 +115,16 @@ std::shared_ptr<NormalizeOperation> Normalize(std::vector<float> mean, std::vect | |||
| return op; | |||
| } | |||
| // Function to create OneHotOperation. | |||
| std::shared_ptr<OneHotOperation> OneHot(int32_t num_classes) { | |||
| auto op = std::make_shared<OneHotOperation>(num_classes); | |||
| // Input validation | |||
| if (!op->ValidateParams()) { | |||
| return nullptr; | |||
| } | |||
| return op; | |||
| } | |||
| // Function to create PadOperation. | |||
| std::shared_ptr<PadOperation> Pad(std::vector<int32_t> padding, std::vector<uint8_t> fill_value, | |||
| BorderType padding_mode) { | |||
| @@ -114,10 +148,27 @@ std::shared_ptr<RandomColorAdjustOperation> RandomColorAdjust(std::vector<float> | |||
| return op; | |||
| } | |||
| // Function to create RandomAffineOperation. | |||
| std::shared_ptr<RandomAffineOperation> RandomAffine(const std::vector<float_t> °rees, | |||
| const std::vector<float_t> &translate_range, | |||
| const std::vector<float_t> &scale_range, | |||
| const std::vector<float_t> &shear_ranges, | |||
| InterpolationMode interpolation, | |||
| const std::vector<uint8_t> &fill_value) { | |||
| auto op = std::make_shared<RandomAffineOperation>(degrees, translate_range, scale_range, shear_ranges, interpolation, | |||
| fill_value); | |||
| // Input validation | |||
| if (!op->ValidateParams()) { | |||
| return nullptr; | |||
| } | |||
| return op; | |||
| } | |||
| // Function to create RandomCropOperation. | |||
| std::shared_ptr<RandomCropOperation> RandomCrop(std::vector<int32_t> size, std::vector<int32_t> padding, | |||
| bool pad_if_needed, std::vector<uint8_t> fill_value) { | |||
| auto op = std::make_shared<RandomCropOperation>(size, padding, pad_if_needed, fill_value); | |||
| bool pad_if_needed, std::vector<uint8_t> fill_value, | |||
| BorderType padding_mode) { | |||
| auto op = std::make_shared<RandomCropOperation>(size, padding, pad_if_needed, fill_value, padding_mode); | |||
| // Input validation | |||
| if (!op->ValidateParams()) { | |||
| return nullptr; | |||
| @@ -271,6 +322,25 @@ bool DecodeOperation::ValidateParams() { return true; } | |||
| std::shared_ptr<TensorOp> DecodeOperation::Build() { return std::make_shared<DecodeOp>(rgb_); } | |||
| // HwcToChwOperation | |||
| bool HwcToChwOperation::ValidateParams() { return true; } | |||
| std::shared_ptr<TensorOp> HwcToChwOperation::Build() { return std::make_shared<HwcToChwOp>(); } | |||
| // MixUpBatchOperation | |||
| MixUpBatchOperation::MixUpBatchOperation(float alpha) : alpha_(alpha) {} | |||
| bool MixUpBatchOperation::ValidateParams() { | |||
| if (alpha_ < 0) { | |||
| MS_LOG(ERROR) << "MixUpBatch: alpha must be a positive floating point value, but it is: " << alpha_; | |||
| return false; | |||
| } | |||
| return true; | |||
| } | |||
| std::shared_ptr<TensorOp> MixUpBatchOperation::Build() { return std::make_shared<MixUpBatchOp>(alpha_); } | |||
| // NormalizeOperation | |||
| NormalizeOperation::NormalizeOperation(std::vector<float> mean, std::vector<float> std) : mean_(mean), std_(std) {} | |||
| @@ -292,6 +362,20 @@ std::shared_ptr<TensorOp> NormalizeOperation::Build() { | |||
| return std::make_shared<NormalizeOp>(mean_[0], mean_[1], mean_[2], std_[0], std_[1], std_[2]); | |||
| } | |||
| // OneHotOperation | |||
| OneHotOperation::OneHotOperation(int32_t num_classes) : num_classes_(num_classes) {} | |||
| bool OneHotOperation::ValidateParams() { | |||
| if (num_classes_ < 0) { | |||
| MS_LOG(ERROR) << "OneHot: Number of classes cannot be negative. Number of classes: " << num_classes_; | |||
| return false; | |||
| } | |||
| return true; | |||
| } | |||
| std::shared_ptr<TensorOp> OneHotOperation::Build() { return std::make_shared<OneHotOp>(num_classes_); } | |||
| // PadOperation | |||
| PadOperation::PadOperation(std::vector<int32_t> padding, std::vector<uint8_t> fill_value, BorderType padding_mode) | |||
| : padding_(padding), fill_value_(fill_value), padding_mode_(padding_mode) {} | |||
| @@ -401,10 +485,90 @@ std::shared_ptr<TensorOp> RandomColorAdjustOperation::Build() { | |||
| return tensor_op; | |||
| } | |||
| // RandomAffineOperation | |||
| RandomAffineOperation::RandomAffineOperation(const std::vector<float_t> °rees, | |||
| const std::vector<float_t> &translate_range, | |||
| const std::vector<float_t> &scale_range, | |||
| const std::vector<float_t> &shear_ranges, InterpolationMode interpolation, | |||
| const std::vector<uint8_t> &fill_value) | |||
| : degrees_(degrees), | |||
| translate_range_(translate_range), | |||
| scale_range_(scale_range), | |||
| shear_ranges_(shear_ranges), | |||
| interpolation_(interpolation), | |||
| fill_value_(fill_value) {} | |||
| bool RandomAffineOperation::ValidateParams() { | |||
| // Degrees | |||
| if (degrees_.size() != 2) { | |||
| MS_LOG(ERROR) << "RandomAffine: degrees vector has incorrect size: degrees.size() = " << degrees_.size(); | |||
| return false; | |||
| } | |||
| if (degrees_[0] > degrees_[1]) { | |||
| MS_LOG(ERROR) << "RandomAffine: minimum of degrees range is greater than maximum: min = " << degrees_[0] | |||
| << ", max = " << degrees_[1]; | |||
| return false; | |||
| } | |||
| // Translate | |||
| if (translate_range_.size() != 2) { | |||
| MS_LOG(ERROR) << "RandomAffine: translate_range vector has incorrect size: translate_range.size() = " | |||
| << translate_range_.size(); | |||
| return false; | |||
| } | |||
| if (translate_range_[0] > translate_range_[1]) { | |||
| MS_LOG(ERROR) << "RandomAffine: minimum of translate range is greater than maximum: min = " << translate_range_[0] | |||
| << ", max = " << translate_range_[1]; | |||
| return false; | |||
| } | |||
| // Scale | |||
| if (scale_range_.size() != 2) { | |||
| MS_LOG(ERROR) << "RandomAffine: scale_range vector has incorrect size: scale_range.size() = " | |||
| << scale_range_.size(); | |||
| return false; | |||
| } | |||
| if (scale_range_[0] > scale_range_[1]) { | |||
| MS_LOG(ERROR) << "RandomAffine: minimum of scale range is greater than maximum: min = " << scale_range_[0] | |||
| << ", max = " << scale_range_[1]; | |||
| return false; | |||
| } | |||
| // Shear | |||
| if (shear_ranges_.size() != 4) { | |||
| MS_LOG(ERROR) << "RandomAffine: shear_ranges vector has incorrect size: shear_ranges.size() = " | |||
| << shear_ranges_.size(); | |||
| return false; | |||
| } | |||
| if (shear_ranges_[0] > shear_ranges_[1]) { | |||
| MS_LOG(ERROR) << "RandomAffine: minimum of horizontal shear range is greater than maximum: min = " | |||
| << shear_ranges_[0] << ", max = " << shear_ranges_[1]; | |||
| return false; | |||
| } | |||
| if (shear_ranges_[2] > shear_ranges_[3]) { | |||
| MS_LOG(ERROR) << "RandomAffine: minimum of vertical shear range is greater than maximum: min = " << shear_ranges_[2] | |||
| << ", max = " << shear_ranges_[3]; | |||
| return false; | |||
| } | |||
| // Fill Value | |||
| if (fill_value_.size() != 3) { | |||
| MS_LOG(ERROR) << "RandomAffine: fill_value vector has incorrect size: fill_value.size() = " << fill_value_.size(); | |||
| return false; | |||
| } | |||
| return true; | |||
| } | |||
| std::shared_ptr<TensorOp> RandomAffineOperation::Build() { | |||
| auto tensor_op = std::make_shared<RandomAffineOp>(degrees_, translate_range_, scale_range_, shear_ranges_, | |||
| interpolation_, fill_value_); | |||
| return tensor_op; | |||
| } | |||
| // RandomCropOperation | |||
| RandomCropOperation::RandomCropOperation(std::vector<int32_t> size, std::vector<int32_t> padding, bool pad_if_needed, | |||
| std::vector<uint8_t> fill_value) | |||
| : size_(size), padding_(padding), pad_if_needed_(pad_if_needed), fill_value_(fill_value) {} | |||
| std::vector<uint8_t> fill_value, BorderType padding_mode) | |||
| : size_(size), | |||
| padding_(padding), | |||
| pad_if_needed_(pad_if_needed), | |||
| fill_value_(fill_value), | |||
| padding_mode_(padding_mode) {} | |||
| bool RandomCropOperation::ValidateParams() { | |||
| if (size_.empty() || size_.size() > 2) { | |||
| @@ -443,7 +607,7 @@ std::shared_ptr<TensorOp> RandomCropOperation::Build() { | |||
| } | |||
| auto tensor_op = std::make_shared<RandomCropOp>(crop_height, crop_width, pad_top, pad_bottom, pad_left, pad_right, | |||
| BorderType::kConstant, pad_if_needed_, fill_r, fill_g, fill_b); | |||
| padding_mode_, pad_if_needed_, fill_r, fill_g, fill_b); | |||
| return tensor_op; | |||
| } | |||
| @@ -20,6 +20,7 @@ | |||
| #include "minddata/dataset/engine/datasetops/cache_op.h" | |||
| #include "minddata/dataset/engine/datasetops/cache_lookup_op.h" | |||
| #include "minddata/dataset/engine/datasetops/cache_merge_op.h" | |||
| #include "minddata/dataset/engine/datasetops/device_queue_op.h" | |||
| #include "minddata/dataset/engine/datasetops/epoch_ctrl_op.h" | |||
| namespace mindspore { | |||
| @@ -258,6 +259,13 @@ Status RepeatPass::RunOnNode(std::shared_ptr<CacheLookupOp> node, bool *modified | |||
| return Status::OK(); | |||
| } | |||
| Status RepeatPass::RunOnNode(std::shared_ptr<DeviceQueueOp> node, bool *modified) { | |||
| // Set total repeats and total epochs for the DeviceQueueOp | |||
| node->set_total_repeats(num_epochs_); | |||
| node->set_num_repeats_per_epoch(1); | |||
| return Status::OK(); | |||
| } | |||
| // Adds an operator to the eoe operator stack save area | |||
| void RepeatPass::AddToEOEOpStack(std::shared_ptr<DatasetOp> dataset_op) { | |||
| op_stack *current_stack = eoe_op_stacks_.top().get(); | |||
| @@ -92,6 +92,12 @@ class RepeatPass : public NodePass { | |||
| /// \return Status The error code return | |||
| Status RunOnNode(std::shared_ptr<CacheLookupOp> node, bool *modified) override; | |||
| /// \brief Set the epoch count for DeviceQueue | |||
| /// \param[in] node The node being visited | |||
| /// \param[inout] modified Indicator if the node was changed at all | |||
| /// \return Status The error code return | |||
| Status RunOnNode(std::shared_ptr<DeviceQueueOp> node, bool *modified) override; | |||
| /// \brief All operators have a flag that might be set related to the repeat and any leaf nodes need to be set up | |||
| /// for use with a controlling repeat above it. | |||
| /// \param[in] node The node being visited | |||
| @@ -196,8 +196,9 @@ class Dataset : public std::enable_shared_from_this<Dataset> { | |||
| } | |||
| /// \brief Function to create an Iterator over the Dataset pipeline | |||
| /// \param[in] columns List of columns to be used to specify the order of columns | |||
| /// \return Shared pointer to the Iterator | |||
| std::shared_ptr<Iterator> CreateIterator(); | |||
| std::shared_ptr<Iterator> CreateIterator(std::vector<std::string> columns = {}); | |||
| /// \brief Function to create a BatchDataset | |||
| /// \notes Combines batch_size number of consecutive rows into batches | |||
| @@ -452,6 +453,12 @@ class VOCDataset : public Dataset { | |||
| bool ValidateParams() override; | |||
| private: | |||
| const std::string kColumnImage = "image"; | |||
| const std::string kColumnTarget = "target"; | |||
| const std::string kColumnBbox = "bbox"; | |||
| const std::string kColumnLabel = "label"; | |||
| const std::string kColumnDifficult = "difficult"; | |||
| const std::string kColumnTruncate = "truncate"; | |||
| std::string dataset_dir_; | |||
| std::string task_; | |||
| std::string mode_; | |||
| @@ -37,6 +37,7 @@ namespace api { | |||
| class Dataset; | |||
| using TensorMap = std::unordered_map<std::string, std::shared_ptr<Tensor>>; | |||
| using TensorVec = std::vector<std::shared_ptr<Tensor>>; | |||
| // Abstract class for iterating over the dataset. | |||
| class Iterator { | |||
| @@ -53,9 +54,15 @@ class Iterator { | |||
| Status BuildAndLaunchTree(std::shared_ptr<Dataset> ds); | |||
| /// \brief Function to get the next row from the data pipeline. | |||
| /// \note Type of return data is a map(with column name). | |||
| /// \param[out] row - the output tensor row. | |||
| void GetNextRow(TensorMap *row); | |||
| /// \brief Function to get the next row from the data pipeline. | |||
| /// \note Type of return data is a vector(without column name). | |||
| /// \param[out] row - the output tensor row. | |||
| void GetNextRow(TensorVec *row); | |||
| /// \brief Function to shut down the data pipeline. | |||
| void Stop(); | |||
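A rough usage sketch of the two GetNextRow overloads; the dataset `ds` and its column names are assumed here, not part of this change, and namespaces are omitted:

```cpp
// Minimal sketch, assuming `ds` is an already-built std::shared_ptr<Dataset>.
// The column list maps to the new optional argument of Dataset::CreateIterator above.
std::shared_ptr<Iterator> iter = ds->CreateIterator({"image", "label"});

TensorMap row_by_name;
iter->GetNextRow(&row_by_name);       // map form: column name -> tensor

TensorVec row_by_position;
iter->GetNextRow(&row_by_position);   // vector form: tensors in the requested column order

iter->Stop();                         // shut down the pipeline when done
```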
| @@ -51,8 +51,12 @@ class CenterCropOperation; | |||
| class CropOperation; | |||
| class CutOutOperation; | |||
| class DecodeOperation; | |||
| class HwcToChwOperation; | |||
| class MixUpBatchOperation; | |||
| class NormalizeOperation; | |||
| class OneHotOperation; | |||
| class PadOperation; | |||
| class RandomAffineOperation; | |||
| class RandomColorAdjustOperation; | |||
| class RandomCropOperation; | |||
| class RandomHorizontalFlipOperation; | |||
| @@ -90,6 +94,18 @@ std::shared_ptr<CutOutOperation> CutOut(int32_t length, int32_t num_patches = 1) | |||
| /// \return Shared pointer to the current TensorOperation. | |||
| std::shared_ptr<DecodeOperation> Decode(bool rgb = true); | |||
| /// \brief Function to create a HwcToChw TensorOperation. | |||
| /// \notes Transpose the input image; shape (H, W, C) to shape (C, H, W). | |||
| /// \return Shared pointer to the current TensorOperation. | |||
| std::shared_ptr<HwcToChwOperation> HWC2CHW(); | |||
| /// \brief Function to create a MixUpBatch TensorOperation. | |||
| /// \notes Apply MixUp transformation on an input batch of images and labels. The labels must be in one-hot format and | |||
| /// Batch must be called before calling this function. | |||
| /// \param[in] alpha hyperparameter of beta distribution (default = 1.0) | |||
| /// \return Shared pointer to the current TensorOperation. | |||
| std::shared_ptr<MixUpBatchOperation> MixUpBatch(float alpha = 1); | |||
| /// \brief Function to create a Normalize TensorOperation. | |||
| /// \notes Normalize the input image with respect to mean and standard deviation. | |||
| /// \param[in] mean - a vector of mean values for each channel, w.r.t channel order. | |||
| @@ -97,6 +113,12 @@ std::shared_ptr<DecodeOperation> Decode(bool rgb = true); | |||
| /// \return Shared pointer to the current TensorOperation. | |||
| std::shared_ptr<NormalizeOperation> Normalize(std::vector<float> mean, std::vector<float> std); | |||
| /// \brief Function to create a OneHot TensorOperation. | |||
| /// \notes Convert the labels into OneHot format. | |||
| /// \param[in] num_classes number of classes. | |||
| /// \return Shared pointer to the current TensorOperation. | |||
| std::shared_ptr<OneHotOperation> OneHot(int32_t num_classes); | |||
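Both factories above return nullptr when ValidateParams() rejects the arguments, so callers are expected to check the result. A minimal sketch with illustrative parameter values (namespaces omitted):

```cpp
// Hypothetical values: 10 classes for OneHot, alpha = 0.5 for MixUpBatch.
std::shared_ptr<OneHotOperation> one_hot = OneHot(10);
std::shared_ptr<MixUpBatchOperation> mixup = MixUpBatch(0.5);
if (one_hot == nullptr || mixup == nullptr) {
  // Parameter validation failed (e.g. a negative num_classes or alpha).
}
```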
| /// \brief Function to create a Pad TensorOp | |||
| /// \notes Pads the image according to padding parameters | |||
| /// \param[in] padding A vector representing the number of pixels to pad the image | |||
| @@ -119,6 +141,23 @@ std::shared_ptr<NormalizeOperation> Normalize(std::vector<float> mean, std::vect | |||
| std::shared_ptr<PadOperation> Pad(std::vector<int32_t> padding, std::vector<uint8_t> fill_value = {0}, | |||
| BorderType padding_mode = BorderType::kConstant); | |||
| /// \brief Function to create a RandomAffine TensorOperation. | |||
| /// \notes Applies a Random Affine transformation on input image in RGB or Greyscale mode. | |||
| /// \param[in] degrees A float vector size 2, representing the starting and ending degree | |||
| /// \param[in] translate_range A float vector size 2, representing percentages of translation on x and y axes. | |||
| /// \param[in] scale_range A float vector size 2, representing the starting and ending scales in the range. | |||
| /// \param[in] shear_ranges A float vector size 4, representing the starting and ending shear degrees | |||
| /// horizontally (first two values) and vertically (last two values). | |||
| /// \param[in] interpolation An enum for the mode of interpolation | |||
| /// \param[in] fill_value A uint8_t vector size 3, representing the pixel intensity of the borders, it is used to | |||
| /// fill R, G, B channels respectively. | |||
| /// \return Shared pointer to the current TensorOperation. | |||
| std::shared_ptr<RandomAffineOperation> RandomAffine( | |||
| const std::vector<float_t> °rees, const std::vector<float_t> &translate_range = {0.0, 0.0}, | |||
| const std::vector<float_t> &scale_range = {1.0, 1.0}, const std::vector<float_t> &shear_ranges = {0.0, 0.0, 0.0, 0.0}, | |||
| InterpolationMode interpolation = InterpolationMode::kNearestNeighbour, | |||
| const std::vector<uint8_t> &fill_value = {0, 0, 0}); | |||
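A brief sketch of the new RandomAffine factory, leaving everything except the rotation range at the declared defaults:

```cpp
// Rotation range of [-15, 15] degrees; translate/scale/shear/interpolation/fill
// keep the default values shown in the declaration above.
auto affine = RandomAffine({-15.0, 15.0});
if (affine == nullptr) {
  // ValidateParams() rejected the arguments (e.g. min degree greater than max).
}
```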
| /// \brief Randomly adjust the brightness, contrast, saturation, and hue of the input image | |||
| /// \param[in] brightness Brightness adjustment factor. Must be a vector of one or two values | |||
| /// if it's a vector of two values it needs to be in the form of [min, max]. Default value is {1, 1} | |||
| @@ -148,8 +187,8 @@ std::shared_ptr<RandomColorAdjustOperation> RandomColorAdjust(std::vector<float> | |||
| /// fill R, G, B channels respectively. | |||
| /// \return Shared pointer to the current TensorOperation. | |||
| std::shared_ptr<RandomCropOperation> RandomCrop(std::vector<int32_t> size, std::vector<int32_t> padding = {0, 0, 0, 0}, | |||
| bool pad_if_needed = false, | |||
| std::vector<uint8_t> fill_value = {0, 0, 0}); | |||
| bool pad_if_needed = false, std::vector<uint8_t> fill_value = {0, 0, 0}, | |||
| BorderType padding_mode = BorderType::kConstant); | |||
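And a sketch of RandomCrop using the newly added padding_mode argument; BorderType::kEdge is assumed here purely for illustration, any available border mode other than the default kConstant would do:

```cpp
// Crop to 224x224 after padding 4 pixels on each side; padding_mode is the new parameter.
auto crop = RandomCrop({224, 224}, {4, 4, 4, 4}, false, {0, 0, 0}, BorderType::kEdge);
if (crop == nullptr) {
  // Invalid size/padding/fill_value combination.
}
```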
| /// \brief Function to create a RandomHorizontalFlip TensorOperation. | |||
| /// \notes Tensor operation to perform random horizontal flip. | |||
| @@ -258,6 +297,29 @@ class DecodeOperation : public TensorOperation { | |||
| bool rgb_; | |||
| }; | |||
| class HwcToChwOperation : public TensorOperation { | |||
| public: | |||
| ~HwcToChwOperation() = default; | |||
| std::shared_ptr<TensorOp> Build() override; | |||
| bool ValidateParams() override; | |||
| }; | |||
| class MixUpBatchOperation : public TensorOperation { | |||
| public: | |||
| explicit MixUpBatchOperation(float alpha = 1); | |||
| ~MixUpBatchOperation() = default; | |||
| std::shared_ptr<TensorOp> Build() override; | |||
| bool ValidateParams() override; | |||
| private: | |||
| float alpha_; | |||
| }; | |||
| class NormalizeOperation : public TensorOperation { | |||
| public: | |||
| NormalizeOperation(std::vector<float> mean, std::vector<float> std); | |||
| @@ -273,6 +335,20 @@ class NormalizeOperation : public TensorOperation { | |||
| std::vector<float> std_; | |||
| }; | |||
| class OneHotOperation : public TensorOperation { | |||
| public: | |||
| explicit OneHotOperation(int32_t num_classes); | |||
| ~OneHotOperation() = default; | |||
| std::shared_ptr<TensorOp> Build() override; | |||
| bool ValidateParams() override; | |||
| private: | |||
| int32_t num_classes_; | |||
| }; | |||
| class PadOperation : public TensorOperation { | |||
| public: | |||
| PadOperation(std::vector<int32_t> padding, std::vector<uint8_t> fill_value = {0}, | |||
| @@ -290,6 +366,29 @@ class PadOperation : public TensorOperation { | |||
| BorderType padding_mode_; | |||
| }; | |||
| class RandomAffineOperation : public TensorOperation { | |||
| public: | |||
| RandomAffineOperation(const std::vector<float_t> °rees, const std::vector<float_t> &translate_range = {0.0, 0.0}, | |||
| const std::vector<float_t> &scale_range = {1.0, 1.0}, | |||
| const std::vector<float_t> &shear_ranges = {0.0, 0.0, 0.0, 0.0}, | |||
| InterpolationMode interpolation = InterpolationMode::kNearestNeighbour, | |||
| const std::vector<uint8_t> &fill_value = {0, 0, 0}); | |||
| ~RandomAffineOperation() = default; | |||
| std::shared_ptr<TensorOp> Build() override; | |||
| bool ValidateParams() override; | |||
| private: | |||
| std::vector<float_t> degrees_; // min_degree, max_degree | |||
| std::vector<float_t> translate_range_; // maximum x translation percentage, maximum y translation percentage | |||
| std::vector<float_t> scale_range_; // min_scale, max_scale | |||
| std::vector<float_t> shear_ranges_; // min_x_shear, max_x_shear, min_y_shear, max_y_shear | |||
| InterpolationMode interpolation_; | |||
| std::vector<uint8_t> fill_value_; | |||
| }; | |||
| class RandomColorAdjustOperation : public TensorOperation { | |||
| public: | |||
| RandomColorAdjustOperation(std::vector<float> brightness = {1.0, 1.0}, std::vector<float> contrast = {1.0, 1.0}, | |||
| @@ -311,7 +410,8 @@ class RandomColorAdjustOperation : public TensorOperation { | |||
| class RandomCropOperation : public TensorOperation { | |||
| public: | |||
| RandomCropOperation(std::vector<int32_t> size, std::vector<int32_t> padding = {0, 0, 0, 0}, | |||
| bool pad_if_needed = false, std::vector<uint8_t> fill_value = {0, 0, 0}); | |||
| bool pad_if_needed = false, std::vector<uint8_t> fill_value = {0, 0, 0}, | |||
| BorderType padding_mode = BorderType::kConstant); | |||
| ~RandomCropOperation() = default; | |||
| @@ -324,6 +424,7 @@ class RandomCropOperation : public TensorOperation { | |||
| std::vector<int32_t> padding_; | |||
| bool pad_if_needed_; | |||
| std::vector<uint8_t> fill_value_; | |||
| BorderType padding_mode_; | |||
| }; | |||
| class RandomHorizontalFlipOperation : public TensorOperation { | |||
| @@ -20,6 +20,7 @@ | |||
| #include <limits> | |||
| #include <string> | |||
| #include <vector> | |||
| #include <utility> | |||
| #include "minddata/dataset/core/constants.h" | |||
| #include "minddata/dataset/core/data_type.h" | |||
| @@ -648,5 +649,30 @@ Status Concatenate(const TensorRow &input, TensorRow *output, int8_t axis, std:: | |||
| return Status::OK(); | |||
| } | |||
| Status BatchTensorToCVTensorVector(const std::shared_ptr<Tensor> &input, | |||
| std::vector<std::shared_ptr<CVTensor>> *output) { | |||
| std::vector<int64_t> tensor_shape = input->shape().AsVector(); | |||
| TensorShape remaining({-1}); | |||
| std::vector<int64_t> index(tensor_shape.size(), 0); | |||
| if (tensor_shape.size() <= 1) { | |||
| RETURN_STATUS_UNEXPECTED("Tensor must be at least 2-D in order to unpack"); | |||
| } | |||
| TensorShape element_shape(std::vector<int64_t>(tensor_shape.begin() + 1, tensor_shape.end())); | |||
| for (; index[0] < tensor_shape[0]; index[0]++) { | |||
| uchar *start_addr_of_index = nullptr; | |||
| std::shared_ptr<Tensor> out; | |||
| RETURN_IF_NOT_OK(input->StartAddrOfIndex(index, &start_addr_of_index, &remaining)); | |||
| RETURN_IF_NOT_OK(input->CreateFromMemory(element_shape, input->type(), start_addr_of_index, &out)); | |||
| std::shared_ptr<CVTensor> cv_out = CVTensor::AsCVTensor(std::move(out)); | |||
| if (!cv_out->mat().data) { | |||
| RETURN_STATUS_UNEXPECTED("Could not convert to CV Tensor"); | |||
| } | |||
| output->push_back(cv_out); | |||
| } | |||
| return Status::OK(); | |||
| } | |||
| } // namespace dataset | |||
| } // namespace mindspore | |||
| @@ -152,6 +152,17 @@ Status Mask(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *outpu | |||
| Status Concatenate(const TensorRow &input, TensorRow *output, int8_t axis, std::shared_ptr<Tensor> prepend, | |||
| std::shared_ptr<Tensor> append); | |||
| // helper for concat, always append to the input, and pass that to the output | |||
| Status ConcatenateHelper(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int8_t axis, | |||
| std::shared_ptr<Tensor> append); | |||
| /// Convert an n-dimensional Tensor to a vector of (n-1)-dimensional CVTensors | |||
| /// @param input[in] input tensor | |||
| /// @param output[out] output tensor | |||
| /// @return Status ok/error | |||
| Status BatchTensorToCVTensorVector(const std::shared_ptr<Tensor> &input, | |||
| std::vector<std::shared_ptr<CVTensor>> *output); | |||
| } // namespace dataset | |||
| } // namespace mindspore | |||
| @@ -1,6 +1,8 @@ | |||
| file(GLOB_RECURSE _CURRENT_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc") | |||
| set_property(SOURCE ${_CURRENT_SRC_FILES} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_MD) | |||
| add_subdirectory(soft_dvpp) | |||
| add_library(kernels-image OBJECT | |||
| affine_op.cc | |||
| auto_contrast_op.cc | |||
| center_crop_op.cc | |||
| crop_op.cc | |||
| @@ -10,8 +12,11 @@ add_library(kernels-image OBJECT | |||
| hwc_to_chw_op.cc | |||
| image_utils.cc | |||
| invert_op.cc | |||
| math_utils.cc | |||
| mixup_batch_op.cc | |||
| normalize_op.cc | |||
| pad_op.cc | |||
| random_affine_op.cc | |||
| random_color_adjust_op.cc | |||
| random_crop_decode_resize_op.cc | |||
| random_crop_and_resize_with_bbox_op.cc | |||
| @@ -34,3 +39,4 @@ add_library(kernels-image OBJECT | |||
| resize_with_bbox_op.cc | |||
| random_resize_with_bbox_op.cc | |||
| ) | |||
| add_dependencies(kernels-image kernels-soft-dvpp-image) | |||
| @@ -0,0 +1,99 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include <algorithm> | |||
| #include <random> | |||
| #include <utility> | |||
| #include <vector> | |||
| #include "minddata/dataset/kernels/image/affine_op.h" | |||
| #include "minddata/dataset/kernels/image/image_utils.h" | |||
| #include "minddata/dataset/kernels/image/math_utils.h" | |||
| #include "minddata/dataset/util/random.h" | |||
| namespace mindspore { | |||
| namespace dataset { | |||
| const InterpolationMode AffineOp::kDefInterpolation = InterpolationMode::kNearestNeighbour; | |||
| const float_t AffineOp::kDegrees = 0.0; | |||
| const std::vector<float_t> AffineOp::kTranslation = {0.0, 0.0}; | |||
| const float_t AffineOp::kScale = 1.0; | |||
| const std::vector<float_t> AffineOp::kShear = {0.0, 0.0}; | |||
| const std::vector<uint8_t> AffineOp::kFillValue = {0, 0, 0}; | |||
| AffineOp::AffineOp(float_t degrees, const std::vector<float_t> &translation, float_t scale, | |||
| const std::vector<float_t> &shear, InterpolationMode interpolation, | |||
| const std::vector<uint8_t> &fill_value) | |||
| : degrees_(degrees), | |||
| translation_(translation), | |||
| scale_(scale), | |||
| shear_(shear), | |||
| interpolation_(interpolation), | |||
| fill_value_(fill_value) {} | |||
| Status AffineOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) { | |||
| IO_CHECK(input, output); | |||
| float_t translation_x = translation_[0]; | |||
| float_t translation_y = translation_[1]; | |||
| float_t degrees = 0.0; | |||
| DegreesToRadians(degrees_, °rees); | |||
| float_t shear_x = shear_[0]; | |||
| float_t shear_y = shear_[1]; | |||
| DegreesToRadians(shear_x, &shear_x); | |||
| DegreesToRadians(-1 * shear_y, &shear_y); | |||
| std::shared_ptr<CVTensor> input_cv = CVTensor::AsCVTensor(input); | |||
| // Apply Affine Transformation | |||
| // T is translation matrix: [1, 0, tx | 0, 1, ty | 0, 0, 1] | |||
| // C is translation matrix to keep center: [1, 0, cx | 0, 1, cy | 0, 0, 1] | |||
| // RSS is rotation with scale and shear matrix | |||
| // RSS(a, s, (sx, sy)) = | |||
| // = R(a) * S(s) * SHy(sy) * SHx(sx) | |||
| //    = [ s*cos(a - sy)/cos(sy), s*(-cos(a - sy)*tan(sx)/cos(sy) - sin(a)), 0 ] | |||
| //      [ s*sin(a - sy)/cos(sy), s*(-sin(a - sy)*tan(sx)/cos(sy) + cos(a)), 0 ] | |||
| // [ 0 , 0 , 1 ] | |||
| // | |||
| // where R is a rotation matrix, S is a scaling matrix, and SHx and SHy are the shears: | |||
| // SHx(s) = [1, -tan(s)] and SHy(s) = [1 , 0] | |||
| // [0, 1 ] [-tan(s), 1] | |||
| // | |||
| // Thus, the affine matrix is M = T * C * RSS * C^-1 | |||
| float_t cx = ((input_cv->mat().cols - 1) / 2.0); | |||
| float_t cy = ((input_cv->mat().rows - 1) / 2.0); | |||
| // Calculate RSS | |||
| std::vector<float_t> matrix{scale_ * cos(degrees + shear_y) / cos(shear_y), | |||
| scale_ * (-1 * cos(degrees + shear_y) * tan(shear_x) / cos(shear_y) - sin(degrees)), | |||
| 0, | |||
| scale_ * sin(degrees + shear_y) / cos(shear_y), | |||
| scale_ * (-1 * sin(degrees + shear_y) * tan(shear_x) / cos(shear_y) + cos(degrees)), | |||
| 0}; | |||
| // Compute T * C * RSS * C^-1 | |||
| matrix[2] = (1 - matrix[0]) * cx - matrix[1] * cy + translation_x; | |||
| matrix[5] = (1 - matrix[4]) * cy - matrix[3] * cx + translation_y; | |||
| cv::Mat affine_mat(matrix); | |||
| affine_mat = affine_mat.reshape(1, {2, 3}); | |||
| std::shared_ptr<CVTensor> output_cv; | |||
| RETURN_IF_NOT_OK(CVTensor::CreateEmpty(input_cv->shape(), input_cv->type(), &output_cv)); | |||
| RETURN_UNEXPECTED_IF_NULL(output_cv); | |||
| cv::warpAffine(input_cv->mat(), output_cv->mat(), affine_mat, input_cv->mat().size(), | |||
| GetCVInterpolationMode(interpolation_), cv::BORDER_CONSTANT, | |||
| cv::Scalar(fill_value_[0], fill_value_[1], fill_value_[2])); | |||
| (*output) = std::static_pointer_cast<Tensor>(output_cv); | |||
| return Status::OK(); | |||
| } | |||
| } // namespace dataset | |||
| } // namespace mindspore | |||
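For readability, the rotation/scale/shear composition sketched in the comment block of AffineOp::Compute can be restated in one place; the symbols mirror the variables in the code (a = rotation angle, s = scale, s_x, s_y = shears, (c_x, c_y) = image center, (t_x, t_y) = translation):

$$
RSS(a, s, s_x, s_y) =
\begin{bmatrix}
 \dfrac{s\cos(a - s_y)}{\cos(s_y)} & s\!\left(-\dfrac{\cos(a - s_y)\tan(s_x)}{\cos(s_y)} - \sin(a)\right) & 0 \\
 \dfrac{s\sin(a - s_y)}{\cos(s_y)} & s\!\left(-\dfrac{\sin(a - s_y)\tan(s_x)}{\cos(s_y)} + \cos(a)\right) & 0 \\
 0 & 0 & 1
\end{bmatrix},
\qquad
M = T(t_x, t_y)\, C(c_x, c_y)\, RSS\, C(c_x, c_y)^{-1}
$$

The translation entries of the resulting 2x3 matrix are $m_{02} = (1 - m_{00}) c_x - m_{01} c_y + t_x$ and $m_{12} = (1 - m_{11}) c_y - m_{10} c_x + t_y$, which is exactly what the assignments to matrix[2] and matrix[5] compute.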
| @@ -0,0 +1,68 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_KERNELS_IMAGE_AFFINE_OP_H_ | |||
| #define MINDSPORE_CCSRC_MINDDATA_DATASET_KERNELS_IMAGE_AFFINE_OP_H_ | |||
| #include <memory> | |||
| #include <string> | |||
| #include <vector> | |||
| #include "minddata/dataset/core/cv_tensor.h" | |||
| #include "minddata/dataset/core/tensor.h" | |||
| #include "minddata/dataset/kernels/tensor_op.h" | |||
| #include "minddata/dataset/util/status.h" | |||
| namespace mindspore { | |||
| namespace dataset { | |||
| class AffineOp : public TensorOp { | |||
| public: | |||
| /// Default values | |||
| static const float_t kDegrees; | |||
| static const std::vector<float_t> kTranslation; | |||
| static const float_t kScale; | |||
| static const std::vector<float_t> kShear; | |||
| static const InterpolationMode kDefInterpolation; | |||
| static const std::vector<uint8_t> kFillValue; | |||
| /// Constructor | |||
| public: | |||
| explicit AffineOp(float_t degrees, const std::vector<float_t> &translation = kTranslation, float_t scale = kScale, | |||
| const std::vector<float_t> &shear = kShear, InterpolationMode interpolation = kDefInterpolation, | |||
| const std::vector<uint8_t> &fill_value = kFillValue); | |||
| ~AffineOp() override = default; | |||
| std::string Name() const override { return kAffineOp; } | |||
| Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override; | |||
| /// Member variables | |||
| private: | |||
| std::string kAffineOp = "AffineOp"; | |||
| protected: | |||
| float_t degrees_; | |||
| std::vector<float_t> translation_; // translation_x and translation_y | |||
| float_t scale_; | |||
| std::vector<float_t> shear_; // shear_x and shear_y | |||
| InterpolationMode interpolation_; | |||
| std::vector<uint8_t> fill_value_; | |||
| }; | |||
| } // namespace dataset | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_MINDDATA_DATASET_KERNELS_IMAGE_AFFINE_OP_H_ | |||
| @@ -21,6 +21,7 @@ | |||
| #include <utility> | |||
| #include <opencv2/imgcodecs.hpp> | |||
| #include "utils/ms_utils.h" | |||
| #include "minddata/dataset/kernels/image/math_utils.h" | |||
| #include "minddata/dataset/core/constants.h" | |||
| #include "minddata/dataset/core/cv_tensor.h" | |||
| #include "minddata/dataset/core/tensor.h" | |||
| @@ -631,36 +632,9 @@ Status AutoContrast(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor | |||
| hist.col(0).copyTo(hist_vec); | |||
| // Ignore values in ignore | |||
| for (const auto &item : ignore) hist_vec[item] = 0; | |||
| int32_t n = std::accumulate(hist_vec.begin(), hist_vec.end(), 0); | |||
| // Find pixel values that are in the low cutoff and high cutoff. | |||
| int32_t cut = static_cast<int32_t>((cutoff / 100.0) * n); | |||
| if (cut != 0) { | |||
| for (int32_t lo = 0; lo < 256 && cut > 0; lo++) { | |||
| if (cut > hist_vec[lo]) { | |||
| cut -= hist_vec[lo]; | |||
| hist_vec[lo] = 0; | |||
| } else { | |||
| hist_vec[lo] -= cut; | |||
| cut = 0; | |||
| } | |||
| } | |||
| cut = static_cast<int32_t>((cutoff / 100.0) * n); | |||
| for (int32_t hi = 255; hi >= 0 && cut > 0; hi--) { | |||
| if (cut > hist_vec[hi]) { | |||
| cut -= hist_vec[hi]; | |||
| hist_vec[hi] = 0; | |||
| } else { | |||
| hist_vec[hi] -= cut; | |||
| cut = 0; | |||
| } | |||
| } | |||
| } | |||
| int32_t lo = 0; | |||
| int32_t hi = 255; | |||
| for (; lo < 256 && !hist_vec[lo]; lo++) { | |||
| } | |||
| for (; hi >= 0 && !hist_vec[hi]; hi--) { | |||
| } | |||
| int32_t lo = 0; | |||
| RETURN_IF_NOT_OK(ComputeUpperAndLowerPercentiles(&hist_vec, cutoff, cutoff, &hi, &lo)); | |||
| if (hi <= lo) { | |||
| for (int32_t i = 0; i < 256; i++) { | |||
| table.push_back(i); | |||
| @@ -685,7 +659,6 @@ Status AutoContrast(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor | |||
| std::shared_ptr<CVTensor> output_cv; | |||
| RETURN_IF_NOT_OK(CVTensor::CreateFromMat(result, &output_cv)); | |||
| (*output) = std::static_pointer_cast<Tensor>(output_cv); | |||
| (*output) = std::static_pointer_cast<Tensor>(output_cv); | |||
| (*output)->Reshape(input->shape()); | |||
| } catch (const cv::Exception &e) { | |||
| RETURN_STATUS_UNEXPECTED("Error in auto contrast"); | |||
| @@ -983,5 +956,24 @@ Status UpdateBBoxesForResize(const std::shared_ptr<Tensor> &bboxList, const size | |||
| return Status::OK(); | |||
| } | |||
| Status GetJpegImageInfo(const std::shared_ptr<Tensor> &input, int *img_width, int *img_height) { | |||
| struct jpeg_decompress_struct cinfo {}; | |||
| struct JpegErrorManagerCustom jerr {}; | |||
| cinfo.err = jpeg_std_error(&jerr.pub); | |||
| jerr.pub.error_exit = JpegErrorExitCustom; | |||
| try { | |||
| jpeg_create_decompress(&cinfo); | |||
| JpegSetSource(&cinfo, input->GetBuffer(), input->SizeInBytes()); | |||
| (void)jpeg_read_header(&cinfo, TRUE); | |||
| jpeg_calc_output_dimensions(&cinfo); | |||
| } catch (std::runtime_error &e) { | |||
| jpeg_destroy_decompress(&cinfo); | |||
| RETURN_STATUS_UNEXPECTED(e.what()); | |||
| } | |||
| *img_height = cinfo.output_height; | |||
| *img_width = cinfo.output_width; | |||
| jpeg_destroy_decompress(&cinfo); | |||
| return Status::OK(); | |||
| } | |||
| } // namespace dataset | |||
| } // namespace mindspore | |||
| @@ -268,6 +268,12 @@ Status PadBBoxes(const std::shared_ptr<Tensor> *bboxList, const size_t &bboxCoun | |||
| Status UpdateBBoxesForResize(const std::shared_ptr<Tensor> &bboxList, const size_t &bboxCount, int32_t target_width_, | |||
| int32_t target_height_, int orig_width, int orig_height); | |||
| // Get jpeg image width and height | |||
| // @param input: Tensor containing the undecoded JPEG image as 1-D bytes | |||
| // @param img_width: the jpeg image width | |||
| // @param img_height: the jpeg image height | |||
| Status GetJpegImageInfo(const std::shared_ptr<Tensor> &input, int *img_width, int *img_height); | |||
| } // namespace dataset | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_MINDDATA_DATASET_KERNELS_IMAGE_IMAGE_UTILS_H_ | |||
| @@ -0,0 +1,84 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "minddata/dataset/kernels/image/math_utils.h" | |||
| #include <opencv2/imgproc/types_c.h> | |||
| #include <algorithm> | |||
| #include <numeric> | |||
| #include <string> | |||
| namespace mindspore { | |||
| namespace dataset { | |||
| Status ComputeUpperAndLowerPercentiles(std::vector<int32_t> *hist, int32_t hi_p, int32_t low_p, int32_t *hi, | |||
| int32_t *lo) { | |||
| try { | |||
| int32_t n = std::accumulate(hist->begin(), hist->end(), 0); | |||
| int32_t cut = static_cast<int32_t>((low_p / 100.0) * n); | |||
| for (int32_t lb = 0; lb < hist->size() && cut > 0; lb++) { | |||
| if (cut > (*hist)[lb]) { | |||
| cut -= (*hist)[lb]; | |||
| (*hist)[lb] = 0; | |||
| } else { | |||
| (*hist)[lb] -= cut; | |||
| cut = 0; | |||
| } | |||
| } | |||
| cut = static_cast<int32_t>((hi_p / 100.0) * n); | |||
| for (int32_t ub = hist->size() - 1; ub >= 0 && cut > 0; ub--) { | |||
| if (cut > (*hist)[ub]) { | |||
| cut -= (*hist)[ub]; | |||
| (*hist)[ub] = 0; | |||
| } else { | |||
| (*hist)[ub] -= cut; | |||
| cut = 0; | |||
| } | |||
| } | |||
| *lo = 0; | |||
| *hi = hist->size() - 1; | |||
| for (; (*lo) < (*hi) && !(*hist)[*lo]; (*lo)++) { | |||
| } | |||
| for (; (*hi) >= 0 && !(*hist)[*hi]; (*hi)--) { | |||
| } | |||
| } catch (const std::exception &e) { | |||
| const char *err_msg = e.what(); | |||
| std::string err_message = "Error in ComputeUpperAndLowerPercentiles: "; | |||
| err_message += err_msg; | |||
| RETURN_STATUS_UNEXPECTED(err_message); | |||
| } | |||
| return Status::OK(); | |||
| } | |||
| Status DegreesToRadians(float_t degrees, float_t *radians_target) { | |||
| *radians_target = CV_PI * degrees / 180.0; | |||
| return Status::OK(); | |||
| } | |||
| Status GenerateRealNumber(float_t a, float_t b, std::mt19937 *rnd, float_t *result) { | |||
| try { | |||
| std::uniform_real_distribution<float_t> distribution{a, b}; | |||
| *result = distribution(*rnd); | |||
| } catch (const std::exception &e) { | |||
| const char *err_msg = e.what(); | |||
| std::string err_message = "Error in GenerateRealNumber: "; | |||
| err_message += err_msg; | |||
| RETURN_STATUS_UNEXPECTED(err_message); | |||
| } | |||
| return Status::OK(); | |||
| } | |||
| } // namespace dataset | |||
| } // namespace mindspore | |||
| @@ -0,0 +1,50 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_KERNELS_IMAGE_MATH_UTILS_H_ | |||
| #define MINDSPORE_CCSRC_MINDDATA_DATASET_KERNELS_IMAGE_MATH_UTILS_H_ | |||
| #include <memory> | |||
| #include <random> | |||
| #include <vector> | |||
| #include "minddata/dataset/util/status.h" | |||
| namespace mindspore { | |||
| namespace dataset { | |||
| /// \brief Returns lower and upper pth percentiles of the input histogram. | |||
| /// \param[in] hist: Input histogram (mutates the histogram for computation purposes) | |||
| /// \param[in] hi_p: Right side percentile | |||
| /// \param[in] low_p: Left side percentile | |||
| /// \param[out] hi: Value at high end percentile | |||
| /// \param[out] lo: Value at low end percentile | |||
| Status ComputeUpperAndLowerPercentiles(std::vector<int32_t> *hist, int32_t hi_p, int32_t low_p, int32_t *hi, | |||
| int32_t *lo); | |||
| /// \brief Converts degrees input to radians. | |||
| /// \param[in] degrees: Input degrees | |||
| /// \param[out] radians_target: Radians output | |||
| Status DegreesToRadians(float_t degrees, float_t *radians_target); | |||
| /// \brief Generates a random real number in [a,b). | |||
| /// \param[in] a: Start of range | |||
| /// \param[in] b: End of range | |||
| /// \param[in] rnd: Random device | |||
| /// \param[out] result: Random number in range [a,b) | |||
| Status GenerateRealNumber(float_t a, float_t b, std::mt19937 *rnd, float_t *result); | |||
| } // namespace dataset | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_MINDDATA_DATASET_KERNELS_IMAGE_MATH_UTILS_H_ | |||
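A short usage sketch of the helpers declared above; DrawRandomRotation is a hypothetical caller, shown only to illustrate the Status-based signatures:

```cpp
// Hypothetical helper: sample a rotation uniformly from [-30, 30) degrees and
// convert it to radians, mirroring what RandomAffineOp does before delegating to AffineOp.
Status DrawRandomRotation(std::mt19937 *rnd, float_t *radians) {
  float_t degrees = 0.0;
  RETURN_IF_NOT_OK(GenerateRealNumber(-30.0, 30.0, rnd, &degrees));
  RETURN_IF_NOT_OK(DegreesToRadians(degrees, radians));
  return Status::OK();
}
```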
| @@ -0,0 +1,108 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include <string> | |||
| #include <utility> | |||
| #include "minddata/dataset/core/cv_tensor.h" | |||
| #include "minddata/dataset/kernels/image/mixup_batch_op.h" | |||
| #include "minddata/dataset/kernels/data/data_utils.h" | |||
| #include "minddata/dataset/util/random.h" | |||
| #include "minddata/dataset/util/status.h" | |||
| namespace mindspore { | |||
| namespace dataset { | |||
| MixUpBatchOp::MixUpBatchOp(float alpha) : alpha_(alpha) { rnd_.seed(GetSeed()); } | |||
| Status MixUpBatchOp::Compute(const TensorRow &input, TensorRow *output) { | |||
| if (input.size() < 2) { | |||
| RETURN_STATUS_UNEXPECTED("Both image and label columns are required for this operation"); | |||
| } | |||
| std::vector<std::shared_ptr<CVTensor>> images; | |||
| std::vector<int64_t> image_shape = input.at(0)->shape().AsVector(); | |||
| std::vector<int64_t> label_shape = input.at(1)->shape().AsVector(); | |||
| // Check inputs | |||
| if (label_shape.size() != 2 || image_shape.size() != 4 || image_shape[0] != label_shape[0]) { | |||
| RETURN_STATUS_UNEXPECTED("You must batch before calling MixUpBatch"); | |||
| } | |||
| if ((image_shape[1] != 1 && image_shape[1] != 3) && (image_shape[3] != 1 && image_shape[3] != 3)) { | |||
| RETURN_STATUS_UNEXPECTED("MixUpBatch: Images must be in the shape of HWC or CHW"); | |||
| } | |||
| // Move images into a vector of CVTensors | |||
| RETURN_IF_NOT_OK(BatchTensorToCVTensorVector(input.at(0), &images)); | |||
| // Calculating lambda | |||
| // If x1 is a random variable from Gamma(a1, 1) and x2 is a random variable from Gamma(a2, 1) | |||
| // then x = x1 / (x1+x2) is a random variable from Beta(a1, a2) | |||
| std::gamma_distribution<float> distribution(alpha_, 1); | |||
| float x1 = distribution(rnd_); | |||
| float x2 = distribution(rnd_); | |||
| float lam = x1 / (x1 + x2); | |||
| // Calculate random labels | |||
| std::vector<int64_t> rand_indx; | |||
| for (int64_t i = 0; i < images.size(); i++) rand_indx.push_back(i); | |||
| std::shuffle(rand_indx.begin(), rand_indx.end(), rnd_); | |||
| // Compute labels | |||
| std::shared_ptr<Tensor> out_labels; | |||
| RETURN_IF_NOT_OK(TypeCast(std::move(input.at(1)), &out_labels, DataType("float32"))); | |||
| for (int64_t i = 0; i < label_shape[0]; i++) { | |||
| for (int64_t j = 0; j < label_shape[1]; j++) { | |||
| uint64_t first_value, second_value; | |||
| RETURN_IF_NOT_OK(input.at(1)->GetItemAt(&first_value, {i, j})); | |||
| RETURN_IF_NOT_OK(input.at(1)->GetItemAt(&second_value, {rand_indx[i], j})); | |||
| RETURN_IF_NOT_OK(out_labels->SetItemAt({i, j}, lam * first_value + (1 - lam) * second_value)); | |||
| } | |||
| } | |||
| // Compute images | |||
| for (int64_t i = 0; i < images.size(); i++) { | |||
| TensorShape remaining({-1}); | |||
| uchar *start_addr_of_index = nullptr; | |||
| std::shared_ptr<Tensor> out; | |||
| RETURN_IF_NOT_OK(input.at(0)->StartAddrOfIndex({rand_indx[i], 0, 0, 0}, &start_addr_of_index, &remaining)); | |||
| RETURN_IF_NOT_OK(input.at(0)->CreateFromMemory(TensorShape({image_shape[1], image_shape[2], image_shape[3]}), | |||
| input.at(0)->type(), start_addr_of_index, &out)); | |||
| std::shared_ptr<CVTensor> rand_image = CVTensor::AsCVTensor(std::move(out)); | |||
| if (!rand_image->mat().data) { | |||
| RETURN_STATUS_UNEXPECTED("Could not convert to CV Tensor"); | |||
| } | |||
| images[i]->mat() = lam * images[i]->mat() + (1 - lam) * rand_image->mat(); | |||
| } | |||
| // Move the output into a TensorRow | |||
| std::shared_ptr<Tensor> output_image; | |||
| RETURN_IF_NOT_OK(Tensor::CreateEmpty(input.at(0)->shape(), input.at(0)->type(), &output_image)); | |||
| for (int64_t i = 0; i < images.size(); i++) { | |||
| RETURN_IF_NOT_OK(output_image->InsertTensor({i}, images[i])); | |||
| } | |||
| output->push_back(output_image); | |||
| output->push_back(out_labels); | |||
| return Status::OK(); | |||
| } | |||
| void MixUpBatchOp::Print(std::ostream &out) const { | |||
| out << "MixUpBatchOp: " | |||
| << "alpha: " << alpha_ << "\n"; | |||
| } | |||
| } // namespace dataset | |||
| } // namespace mindspore | |||
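The lambda computation above relies on a standard identity: if x1 and x2 are independent Gamma(alpha, 1) samples, then x1 / (x1 + x2) is a Beta(alpha, alpha) sample. A self-contained sketch of just that step:

```cpp
#include <random>

// Draw the MixUp mixing coefficient lam ~ Beta(alpha, alpha) from two Gamma(alpha, 1)
// draws, as MixUpBatchOp::Compute does above.
float SampleMixUpLambda(float alpha, std::mt19937 *rnd) {
  std::gamma_distribution<float> distribution(alpha, 1);
  float x1 = distribution(*rnd);
  float x2 = distribution(*rnd);
  return x1 / (x1 + x2);
}
```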
| @@ -0,0 +1,51 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_KERNELS_IMAGE_MIXUPBATCH_OP_H_ | |||
| #define MINDSPORE_CCSRC_MINDDATA_DATASET_KERNELS_IMAGE_MIXUPBATCH_OP_H_ | |||
| #include <memory> | |||
| #include <vector> | |||
| #include <random> | |||
| #include <string> | |||
| #include "minddata/dataset/core/tensor.h" | |||
| #include "minddata/dataset/kernels/tensor_op.h" | |||
| #include "minddata/dataset/util/status.h" | |||
| namespace mindspore { | |||
| namespace dataset { | |||
| class MixUpBatchOp : public TensorOp { | |||
| public: | |||
| // Default values, also used by python_bindings.cc | |||
| explicit MixUpBatchOp(float alpha); | |||
| ~MixUpBatchOp() override = default; | |||
| void Print(std::ostream &out) const override; | |||
| Status Compute(const TensorRow &input, TensorRow *output) override; | |||
| std::string Name() const override { return kMixUpBatchOp; } | |||
| private: | |||
| float alpha_; | |||
| std::mt19937 rnd_; | |||
| }; | |||
| } // namespace dataset | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_MINDDATA_DATASET_KERNELS_IMAGE_MIXUPBATCH_OP_H_ | |||
| @@ -0,0 +1,77 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include <algorithm> | |||
| #include <random> | |||
| #include <utility> | |||
| #include <vector> | |||
| #include "minddata/dataset/kernels/image/random_affine_op.h" | |||
| #include "minddata/dataset/kernels/image/image_utils.h" | |||
| #include "minddata/dataset/kernels/image/math_utils.h" | |||
| #include "minddata/dataset/util/random.h" | |||
| namespace mindspore { | |||
| namespace dataset { | |||
| const std::vector<float_t> RandomAffineOp::kDegreesRange = {0.0, 0.0}; | |||
| const std::vector<float_t> RandomAffineOp::kTranslationPercentages = {0.0, 0.0}; | |||
| const std::vector<float_t> RandomAffineOp::kScaleRange = {1.0, 1.0}; | |||
| const std::vector<float_t> RandomAffineOp::kShearRanges = {0.0, 0.0, 0.0, 0.0}; | |||
| const InterpolationMode RandomAffineOp::kDefInterpolation = InterpolationMode::kNearestNeighbour; | |||
| const std::vector<uint8_t> RandomAffineOp::kFillValue = {0, 0, 0}; | |||
| RandomAffineOp::RandomAffineOp(std::vector<float_t> degrees, std::vector<float_t> translate_range, | |||
| std::vector<float_t> scale_range, std::vector<float_t> shear_ranges, | |||
| InterpolationMode interpolation, std::vector<uint8_t> fill_value) | |||
| : AffineOp(0.0), | |||
| degrees_range_(degrees), | |||
| translate_range_(translate_range), | |||
| scale_range_(scale_range), | |||
| shear_ranges_(shear_ranges) { | |||
| interpolation_ = interpolation; | |||
| fill_value_ = fill_value; | |||
| rnd_.seed(GetSeed()); | |||
| } | |||
| Status RandomAffineOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) { | |||
| IO_CHECK(input, output); | |||
| dsize_t height = input->shape()[0]; | |||
| dsize_t width = input->shape()[1]; | |||
| float_t max_dx = translate_range_[0] * height; | |||
| float_t max_dy = translate_range_[1] * width; | |||
| float_t degrees = 0.0; | |||
| RETURN_IF_NOT_OK(GenerateRealNumber(degrees_range_[0], degrees_range_[1], &rnd_, °rees)); | |||
| float_t translation_x = 0.0; | |||
| RETURN_IF_NOT_OK(GenerateRealNumber(-1 * max_dx, max_dx, &rnd_, &translation_x)); | |||
| float_t translation_y = 0.0; | |||
| RETURN_IF_NOT_OK(GenerateRealNumber(-1 * max_dy, max_dy, &rnd_, &translation_y)); | |||
| float_t scale = 1.0; | |||
| RETURN_IF_NOT_OK(GenerateRealNumber(scale_range_[0], scale_range_[1], &rnd_, &scale)); | |||
| float_t shear_x = 0.0; | |||
| RETURN_IF_NOT_OK(GenerateRealNumber(shear_ranges_[0], shear_ranges_[1], &rnd_, &shear_x)); | |||
| float_t shear_y = 0.0; | |||
| RETURN_IF_NOT_OK(GenerateRealNumber(shear_ranges_[2], shear_ranges_[3], &rnd_, &shear_y)); | |||
| // assign to base class variables | |||
| degrees_ = degrees; | |||
| scale_ = scale; | |||
| translation_[0] = translation_x; | |||
| translation_[1] = translation_y; | |||
| shear_[0] = shear_x; | |||
| shear_[1] = shear_y; | |||
| return AffineOp::Compute(input, output); | |||
| } | |||
| } // namespace dataset | |||
| } // namespace mindspore | |||
| @@ -0,0 +1,64 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_KERNELS_IMAGE_RANDOM_AFFINE_OP_H_ | |||
| #define MINDSPORE_CCSRC_MINDDATA_DATASET_KERNELS_IMAGE_RANDOM_AFFINE_OP_H_ | |||
| #include <memory> | |||
| #include <string> | |||
| #include <vector> | |||
| #include "minddata/dataset/core/cv_tensor.h" | |||
| #include "minddata/dataset/core/tensor.h" | |||
| #include "minddata/dataset/kernels/image/affine_op.h" | |||
| #include "minddata/dataset/util/status.h" | |||
| namespace mindspore { | |||
| namespace dataset { | |||
| class RandomAffineOp : public AffineOp { | |||
| public: | |||
| /// Default values, also used by python_bindings.cc | |||
| static const std::vector<float_t> kDegreesRange; | |||
| static const std::vector<float_t> kTranslationPercentages; | |||
| static const std::vector<float_t> kScaleRange; | |||
| static const std::vector<float_t> kShearRanges; | |||
| static const InterpolationMode kDefInterpolation; | |||
| static const std::vector<uint8_t> kFillValue; | |||
| explicit RandomAffineOp(std::vector<float_t> degrees, std::vector<float_t> translate_range = kTranslationPercentages, | |||
| std::vector<float_t> scale_range = kScaleRange, | |||
| std::vector<float_t> shear_ranges = kShearRanges, | |||
| InterpolationMode interpolation = kDefInterpolation, | |||
| std::vector<uint8_t> fill_value = kFillValue); | |||
| ~RandomAffineOp() override = default; | |||
| std::string Name() const override { return kRandomAffineOp; } | |||
| Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override; | |||
| private: | |||
| std::string kRandomAffineOp = "RandomAffineOp"; | |||
| std::vector<float_t> degrees_range_; // min_degree, max_degree | |||
| std::vector<float_t> translate_range_; // maximum x translation percentage, maximum y translation percentage | |||
| std::vector<float_t> scale_range_; // min_scale, max_scale | |||
| std::vector<float_t> shear_ranges_; // min_x_shear, max_x_shear, min_y_shear, max_y_shear | |||
| std::mt19937 rnd_; // random device | |||
| }; | |||
| } // namespace dataset | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_MINDDATA_DATASET_KERNELS_IMAGE_RANDOM_AFFINE_OP_H_ | |||
| @@ -37,22 +37,9 @@ Status RandomCropDecodeResizeOp::Compute(const std::shared_ptr<Tensor> &input, s | |||
| RETURN_IF_NOT_OK(op.Compute(input, &decoded)); | |||
| return RandomCropAndResizeOp::Compute(decoded, output); | |||
| } else { | |||
| struct jpeg_decompress_struct cinfo {}; | |||
| struct JpegErrorManagerCustom jerr {}; | |||
| cinfo.err = jpeg_std_error(&jerr.pub); | |||
| jerr.pub.error_exit = JpegErrorExitCustom; | |||
| try { | |||
| jpeg_create_decompress(&cinfo); | |||
| JpegSetSource(&cinfo, input->GetBuffer(), input->SizeInBytes()); | |||
| (void)jpeg_read_header(&cinfo, TRUE); | |||
| jpeg_calc_output_dimensions(&cinfo); | |||
| } catch (std::runtime_error &e) { | |||
| jpeg_destroy_decompress(&cinfo); | |||
| RETURN_STATUS_UNEXPECTED(e.what()); | |||
| } | |||
| int h_in = cinfo.output_height; | |||
| int w_in = cinfo.output_width; | |||
| jpeg_destroy_decompress(&cinfo); | |||
| int h_in = 0; | |||
| int w_in = 0; | |||
| RETURN_IF_NOT_OK(GetJpegImageInfo(input, &w_in, &h_in)); | |||
| int x = 0; | |||
| int y = 0; | |||
| @@ -0,0 +1,6 @@ | |||
| file(GLOB_RECURSE _CURRENT_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc") | |||
| set_property(SOURCE ${_CURRENT_SRC_FILES} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_MD) | |||
| add_subdirectory(utils) | |||
| add_library(kernels-soft-dvpp-image OBJECT | |||
| soft_dvpp_decode_resize_jpeg_op.cc | |||
| soft_dvpp_decode_random_crop_resize_jpeg_op.cc) | |||
| @@ -0,0 +1,84 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "minddata/dataset/kernels/image/soft_dvpp/soft_dvpp_decode_random_crop_resize_jpeg_op.h" | |||
| #include <string> | |||
| #include "opencv2/opencv.hpp" | |||
| #include "minddata/dataset/core/cv_tensor.h" | |||
| #include "minddata/dataset/kernels/image/image_utils.h" | |||
| #include "minddata/dataset/util/random.h" | |||
| namespace mindspore { | |||
| namespace dataset { | |||
| SoftDvppDecodeRandomCropResizeJpegOp::SoftDvppDecodeRandomCropResizeJpegOp(int32_t target_height, int32_t target_width, | |||
| float scale_lb, float scale_ub, | |||
| float aspect_lb, float aspect_ub, | |||
| int32_t max_iter) | |||
| : RandomCropAndResizeOp(target_height, target_width, scale_lb, scale_ub, aspect_lb, aspect_ub, | |||
| InterpolationMode::kLinear, max_iter) {} | |||
| Status SoftDvppDecodeRandomCropResizeJpegOp::GetCropInfo(const std::shared_ptr<Tensor> &input, | |||
| SoftDpCropInfo *crop_info) { | |||
| int img_width = 0; | |||
| int img_height = 0; | |||
| RETURN_IF_NOT_OK(GetJpegImageInfo(input, &img_width, &img_height)); | |||
| int x = 0; | |||
| int y = 0; | |||
| int crop_height = 0; | |||
| int crop_width = 0; | |||
| RETURN_IF_NOT_OK(GetCropBox(img_height, img_width, &x, &y, &crop_height, &crop_width)); | |||
| crop_info->left = x; | |||
| crop_info->up = y; | |||
| crop_info->right = crop_info->left + crop_width; | |||
| crop_info->down = crop_info->up + crop_height; | |||
| return Status::OK(); | |||
| } | |||
| Status SoftDvppDecodeRandomCropResizeJpegOp::Compute(const std::shared_ptr<Tensor> &input, | |||
| std::shared_ptr<Tensor> *output) { | |||
| IO_CHECK(input, output); | |||
| if (!IsNonEmptyJPEG(input)) { | |||
| RETURN_STATUS_UNEXPECTED("SoftDvppDecodeRandomCropResizeJpeg only supports processing JPEG images."); | |||
| } | |||
| SoftDpCropInfo crop_info; | |||
| RETURN_IF_NOT_OK(GetCropInfo(input, &crop_info)); | |||
| try { | |||
| unsigned char *buffer = const_cast<unsigned char *>(input->GetBuffer()); | |||
| CHECK_FAIL_RETURN_UNEXPECTED(buffer != nullptr, "The input image buffer is empty."); | |||
| SoftDpProcsessInfo info; | |||
| info.input_buffer = static_cast<uint8_t *>(buffer); | |||
| info.input_buffer_size = input->SizeInBytes(); | |||
| info.output_width = target_width_; | |||
| info.output_height = target_height_; | |||
| cv::Mat out_rgb_img(target_height_, target_width_, CV_8UC3); | |||
| info.output_buffer = out_rgb_img.data; | |||
| info.output_buffer_size = target_width_ * target_height_ * 3; | |||
| info.is_v_before_u = true; | |||
| int ret = DecodeAndCropAndResizeJpeg(&info, crop_info); | |||
| std::string error_info("Soft dvpp DecodeAndCropAndResizeJpeg failed with return code: "); | |||
| error_info += std::to_string(ret); | |||
| CHECK_FAIL_RETURN_UNEXPECTED(ret == 0, error_info); | |||
| std::shared_ptr<CVTensor> cv_tensor = nullptr; | |||
| RETURN_IF_NOT_OK(CVTensor::CreateFromMat(out_rgb_img, &cv_tensor)); | |||
| *output = std::static_pointer_cast<Tensor>(cv_tensor); | |||
| } catch (const cv::Exception &e) { | |||
| RETURN_STATUS_UNEXPECTED("Error in soft dvpp image decode and resize."); | |||
| } | |||
| return Status::OK(); | |||
| } | |||
| } // namespace dataset | |||
| } // namespace mindspore | |||