diff --git a/CMakeLists.txt b/CMakeLists.txt index 6dd5e1e1..021e7798 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -16,8 +16,11 @@ endif() if(DEFINED ENV{D_PKG_SERVER}) set(GE_PB_PKG $ENV{D_PKG_SERVER}) - message("Download packages from PKG server") -endif() + message("Download packages from DPKG server") +elseif(DEFINED ENV{MSLIBS_SERVER}) + set(GE_PB_PKG "http://$ENV{MSLIBS_SERVER}:8081") + message("Download packages from MSPKG server") +endif () set(ASCEND_DRIVER_DIR ${ASCEND_DIR}/driver/lib64) set(ASCEND_DRIVER_COMMON_DIR ${ASCEND_DIR}/driver/lib64/common) @@ -37,7 +40,7 @@ set(ATLAS_MS_RUNTIME_PATH ${ATLAS_RUNTIME_DIR} ${ATLAS_ACL_DIR} ${ATLAS_ATC_DIR} option(ENABLE_OPEN_SRC "Enable graphengine compile in opensource." FALSE) if (ENABLE_OPEN_SRC) - set(HI_PYTHON python3.7) + set(HI_PYTHON python3) include(cmake/external_libs/protobuf_shared.cmake) include(cmake/external_libs/protobuf_static.cmake) @@ -71,7 +74,7 @@ if (ENABLE_OPEN_SRC) set(STATIC_ACL_LIB ${GE_LIB_PATH}) find_module(slog libslog.so ${GE_LIB_PATH}) find_module(static_mmpa libmmpa.a ${GE_LIB_PATH}) - find_module(msprof libmsprof.so ${GE_LIB_PATH}) + find_module(msprofiler libmsprofiler.a ${GE_LIB_PATH}) find_module(hccl libhccl.so ${GE_LIB_PATH}) find_module(adump_server libadump_server.a ${GE_LIB_PATH}) find_module(runtime libruntime.so ${GE_LIB_PATH}) @@ -80,20 +83,19 @@ if (ENABLE_OPEN_SRC) find_module(error_manager liberror_manager.so ${GE_LIB_PATH}) find_module(ascend_hal_stub libascend_hal.so ${GE_LIB_PATH}) find_module(error_manager_static liberror_manager.a ${GE_LIB_PATH}) - find_module(msprofiler libmsprofiler.a ${GE_LIB_PATH}) + find_module(msprofiler_fwk libmsprofiler_fwk.a ${GE_LIB_PATH}) #find_module(ascendcl_static libascendcl.a ${GE_LIB_PATH}) else() find_module(slog libslog.so ${ASCEND_ATC_DIR} ${ASCEND_DRIVER_COMMON_DIR}) find_module(static_mmpa libmmpa.a ${ASCEND_ATC_DIR} ${ASCEND_RUNTIME_DIR}) find_module(error_manager liberror_manager.so ${ASCEND_ATC_DIR} 
${ASCEND_RUNTIME_DIR}) if(PLATFORM STREQUAL "train") - find_module(msprof libmsprof.so ${ASCEND_DRIVER_COMMON_DIR}) find_module(hccl libhccl.so ${ASCEND_RUNTIME_DIR}) find_module(adump_server libadump_server.a ${ASCEND_RUNTIME_DIR}) find_module(runtime libruntime.so ${ASCEND_RUNTIME_DIR}) find_module(resource libresource.so ${ASCEND_RUNTIME_DIR}) find_module(error_manager liberror_manager.so ${ASCEND_RUNTIME_DIR}) - find_module(msprofiler libmsprofiler.a ${ASCEND_RUNTIME_DIR}) + find_module(msprofiler_fwk libmsprofiler_fwk.a ${ASCEND_RUNTIME_DIR}) find_module(ascend_hal_stub libascend_hal.so ${ASCEND_DRIVER_DIR}/driver) if(PRODUCT STREQUAL "flr3") message(FATAL_ERROR "This platform is not supported in train mode, build terminated") @@ -106,20 +108,17 @@ if (ENABLE_OPEN_SRC) find_module(error_manager liberror_manager.so ${ASCEND_ATC_DIR}) find_module(error_manager_static liberror_manager.a ${ASCEND_ACL_DIR}) find_module(msprofiler libmsprofiler.a ${ASCEND_ACL_DIR}) - #find_module(ascendcl_static libascendcl.a ${ASCEND_ACL_DIR}) + #find_module(ascendcl_static libascendcl.a ${ASCEND_ACL_DIR}) if(PRODUCT STREQUAL "flr3") - find_module(msprof libmsprof.so ${ASCEND_DRIVER_SHARE_DIR}) elseif(PRODUCT STREQUAL "flr1") find_module(ascend_hal_stub libascend_hal.so ${ASCEND_DRIVER_DIR}/driver) - find_module(msprof libmsprof.so ${ASCEND_DRIVER_COMMON_DIR}) elseif(PRODUCT STREQUAL "flr2") # flr2 ascend_hal_stub limsprof ? 
else() find_module(ascend_hal_stub libascend_hal.so ${ASCEND_DRIVER_DIR}) - find_module(msprof libmsprof.so ${ASCEND_DRIVER_DIR}) endif() elseif(PLATFORM STREQUAL "all") - find_module(msprof libmsprof.so ${ASCEND_DRIVER_COMMON_DIR}) + find_module(msprofiler libmsprofiler.a ${ASCEND_DRIVER_COMMON_DIR}) find_module(hccl libhccl.so ${ASCEND_RUNTIME_DIR}) find_module(adump_server libadump_server.a ${ASCEND_ACL_DIR}) find_module(runtime libruntime.so ${ASCEND_ACL_DIR}) @@ -127,14 +126,14 @@ if (ENABLE_OPEN_SRC) find_module(resource libresource.so ${ASCEND_ATC_DIR}) find_module(error_manager liberror_manager.so ${ASCEND_ATC_DIR}) find_module(error_manager_static liberror_manager.a ${ASCEND_ACL_DIR}) - find_module(msprofiler libmsprofiler.a ${ASCEND_ACL_DIR}) + find_module(msprofiler_fwk libmsprofiler_fwk.a ${ASCEND_ACL_DIR}) find_module(ascend_hal_stub libascend_hal.so ${ASCEND_DRIVER_DIR}/driver) #find_module(ascendcl_static libascendcl.a ${ASCEND_ACL_DIR}) else() - message(STATUS "PLATFORM param is invalid, should be train or inference, you choose nothing!") + message(STATUS "PLATFORM param is invalid, should be train or inference, you choose nothing!") endif() - if (ENABLE_GE_COV OR ENABLE_GE_UT) + if (ENABLE_GE_COV OR ENABLE_GE_UT) add_subdirectory(tests) endif() diff --git a/cmake/external_libs/gflags.cmake b/cmake/external_libs/gflags.cmake index f3f0f0ef..50cfb2bc 100755 --- a/cmake/external_libs/gflags.cmake +++ b/cmake/external_libs/gflags.cmake @@ -23,6 +23,7 @@ ExternalProject_Add(gflags_build URL ${REQ_URL} #URL /home/txd/workspace/linux_cmake/pkg/protobuf-3.8.0.tar.gz #SOURCE_DIR ${GE_CODE_DIR}/../../third_party/gflags/src/gflags-2.2.2 + TLS_VERIFY OFF CONFIGURE_COMMAND ${CMAKE_COMMAND} -DCMAKE_CXX_FLAGS=${gflags_CXXFLAGS} -DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX}/gflags BUILD_COMMAND $(MAKE) INSTALL_COMMAND $(MAKE) install diff --git a/cmake/external_libs/gtest.cmake b/cmake/external_libs/gtest.cmake index 96ea84b4..c5edcd72 100755 --- 
a/cmake/external_libs/gtest.cmake +++ b/cmake/external_libs/gtest.cmake @@ -10,7 +10,10 @@ if ((${CMAKE_INSTALL_PREFIX} STREQUAL /usr/local) OR message(STATUS "No install prefix selected, default to ${CMAKE_INSTALL_PREFIX}.") endif() -if (ENABLE_GITEE) +if (GE_PB_PKG) + set(REQ_URL "${GE_PB_PKG}/libs/gtest/release-1.8.0.tar.gz") + set(MD5 "") +elseif (ENABLE_GITEE) set(REQ_URL "https://gitee.com/mirrors/googletest/repository/archive/release-1.8.0.tar.gz") set(MD5 "") else() @@ -22,8 +25,9 @@ set (gtest_CXXFLAGS "-D_GLIBCXX_USE_CXX11_ABI=0 -D_FORTIFY_SOURCE=2 -O2 -fstack- set (gtest_CFLAGS "-D_GLIBCXX_USE_CXX11_ABI=0 -D_FORTIFY_SOURCE=2 -O2 -fstack-protector-all -Wl,-z,relro,-z,now,-z,noexecstack") ExternalProject_Add(gtest_build URL ${REQ_URL} + TLS_VERIFY OFF CONFIGURE_COMMAND ${CMAKE_COMMAND} -DCMAKE_CXX_FLAGS=${gtest_CXXFLAGS} -DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX}/gtest - -DBUILD_TESTING=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_MACOSX_RPATH=TRUE -Dgtest_disable_pthreads=ON + -DBUILD_TESTING=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_MACOSX_RPATH=TRUE -Dgtest_disable_pthreads=ON BUILD_COMMAND $(MAKE) INSTALL_COMMAND $(MAKE) install EXCLUDE_FROM_ALL TRUE diff --git a/cmake/external_libs/json.cmake b/cmake/external_libs/json.cmake index ce473d4b..3c1cd012 100755 --- a/cmake/external_libs/json.cmake +++ b/cmake/external_libs/json.cmake @@ -5,10 +5,14 @@ endif() include(ExternalProject) set(JSON_SRC_DIR ${CMAKE_BINARY_DIR}/opensrc/json/include) -if (ENABLE_GITEE) - set(REQ_URL "https://gitee.com/mirrors/JSON-for-Modern-CPP/repository/archive/v3.6.1.zip") - set(MD5 "5bda78ce308e6cfcf614dcf1d5ff27a7") - set(JSON_INCLUDE_DIR "${JSON_SRC_DIR}/include") +if (GE_PB_PKG) + set(REQ_URL "${GE_PB_PKG}/libs/ge_nlohmann_json/include.zip") + set(MD5 "0dc903888211db3a0f170304cd9f3a89") + set(JSON_INCLUDE_DIR ${JSON_SRC_DIR}) +#elseif (ENABLE_GITEE) +# set(REQ_URL 
"https://gitee.com/mirrors/JSON-for-Modern-CPP/repository/archive/v3.6.1.zip") +# set(MD5 "5bda78ce308e6cfcf614dcf1d5ff27a7") +#set(JSON_INCLUDE_DIR "${JSON_SRC_DIR}/include") else() set(REQ_URL "https://github.com/nlohmann/json/releases/download/v3.6.1/include.zip") set(MD5 "0dc903888211db3a0f170304cd9f3a89") @@ -18,6 +22,7 @@ ExternalProject_Add(json_build URL ${REQ_URL} #URL /home/txd/workspace/cloud_code/pkg/include.zip SOURCE_DIR ${JSON_SRC_DIR} + TLS_VERIFY OFF CONFIGURE_COMMAND "" BUILD_COMMAND "" INSTALL_COMMAND "" diff --git a/cmake/external_libs/onnx.cmake b/cmake/external_libs/onnx.cmake index 9dadb544..1ee80d2d 100755 --- a/cmake/external_libs/onnx.cmake +++ b/cmake/external_libs/onnx.cmake @@ -6,7 +6,10 @@ set(ONNX_PROTO_DIR ${CMAKE_BINARY_DIR}/onnx) set(ONNX_PROTO_FILE ${ONNX_PROTO_DIR}/onnx.proto) file(MAKE_DIRECTORY ${ONNX_PROTO_DIR}) -if (ENABLE_GITEE) +if (GE_PB_PKG) + set(REQ_URL "${GE_PB_PKG}/libs/onnx/onnx-1.6.0.tar.gz") + set(MD5 "512f2779d6215d4a36f366b6b9acdf1e") +elseif (ENABLE_GITEE) set(REQ_URL "https://gitee.com/mirrors/ONNX/repository/archive/v1.6.0.tar.gz") set(MD5 "1bdbcecdd68ea8392630467646776e02") else() @@ -19,6 +22,7 @@ ExternalProject_Add(onnx #URL /home/txd/workspace/cloud_code/pkg/onnx-1.6.0.tar.gz #URL_HASH SHA256=3b88c3fe521151651a0403c4d131cb2e0311bd28b753ef692020a432a81ce345 #SOURCE_DIR ${ONNX_SRC_DIR} + TLS_VERIFY OFF CONFIGURE_COMMAND "" BUILD_COMMAND "" #INSTALL_COMMAND "" diff --git a/cmake/external_libs/protobuf_shared.cmake b/cmake/external_libs/protobuf_shared.cmake index c9c6b7d9..6334c8a3 100755 --- a/cmake/external_libs/protobuf_shared.cmake +++ b/cmake/external_libs/protobuf_shared.cmake @@ -26,6 +26,7 @@ set(protobuf_CXXFLAGS "-Wno-maybe-uninitialized -Wno-unused-parameter -fPIC -fst set(protobuf_LDFLAGS "-Wl,-z,relro,-z,now,-z,noexecstack") ExternalProject_Add(protobuf_build URL ${REQ_URL} + TLS_VERIFY OFF CONFIGURE_COMMAND ${CMAKE_COMMAND} -Dprotobuf_WITH_ZLIB=OFF -DCMAKE_INSTALL_LIBDIR=${CMAKE_INSTALL_LIBDIR} 
diff --git a/cmake/external_libs/protobuf_static.cmake b/cmake/external_libs/protobuf_static.cmake index 6f3e1f53..e4bbb9a0 100755 --- a/cmake/external_libs/protobuf_static.cmake +++ b/cmake/external_libs/protobuf_static.cmake @@ -27,6 +27,7 @@ ExternalProject_Add(protobuf_static_build URL ${REQ_URL} #URL /home/txd/workspace/linux_cmake/pkg/protobuf-3.8.0.tar.gz #SOURCE_DIR ${METADEF_DIR}/../../third_party/protobuf/src/protobuf-3.8.0 + TLS_VERIFY OFF CONFIGURE_COMMAND ${CMAKE_COMMAND} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} diff --git a/cmake/external_libs/protoc.cmake b/cmake/external_libs/protoc.cmake index 0d162c0d..58321f04 100755 --- a/cmake/external_libs/protoc.cmake +++ b/cmake/external_libs/protoc.cmake @@ -1,115 +1,116 @@ -if (HAVE_PROTOC) - return() -endif() - -include(ExternalProject) -include(GNUInstallDirs) -#set(CMAKE_INSTALL_PREFIX ${GE_CODE_DIR}/output) - -if ((${CMAKE_INSTALL_PREFIX} STREQUAL /usr/local) OR - (${CMAKE_INSTALL_PREFIX} STREQUAL "C:/Program Files (x86)/ascend")) - set(CMAKE_INSTALL_PREFIX ${GE_CODE_DIR}/output CACHE STRING "path for install()" FORCE) - message(STATUS "No install prefix selected, default to ${CMAKE_INSTALL_PREFIX}.") -endif() - -if(GE_PB_PKG) - set(REQ_URL "${GE_PB_PKG}/libs/protobuf/v3.8.0.tar.gz") -else() - if (ENABLE_GITEE) - set(REQ_URL "https://gitee.com/mirrors/protobuf_source/repository/archive/v3.8.0.tar.gz") - set(MD5 "eba86ae9f07ba5cfbaf8af3bc4e84236") - else() - set(REQ_URL "https://github.com/protocolbuffers/protobuf/archive/v3.8.0.tar.gz") - set(MD5 "3d9e32700639618a4d2d342c99d4507a") - endif () -endif() - -set(protobuf_CXXFLAGS "-Wno-maybe-uninitialized -Wno-unused-parameter -fPIC -fstack-protector-all -D_FORTIFY_SOURCE=2 -D_GLIBCXX_USE_CXX11_ABI=0 -O2") -set(protobuf_LDFLAGS "-Wl,-z,relro,-z,now,-z,noexecstack") -ExternalProject_Add(protoc_build - URL ${REQ_URL} - #URL /home/txd/workspace/linux_cmake/pkg/protobuf-3.8.0.tar.gz - #SOURCE_DIR 
${GE_CODE_DIR}/../third_party/protobuf/src/protobuf-3.8.0 - CONFIGURE_COMMAND ${CMAKE_COMMAND} -Dprotobuf_WITH_ZLIB=OFF -Dprotobuf_BUILD_TESTS=OFF -DBUILD_SHARED_LIBS=OFF -DCMAKE_CXX_FLAGS=${protobuf_CXXFLAGS} -DCMAKE_CXX_LDFLAGS=${protobuf_LDFLAGS} -DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX}/protoc /cmake - BUILD_COMMAND $(MAKE) - INSTALL_COMMAND $(MAKE) install - EXCLUDE_FROM_ALL TRUE -) - -set(PROTOC_PKG_DIR ${CMAKE_INSTALL_PREFIX}/protoc) - -set(protoc_EXECUTABLE ${PROTOC_PKG_DIR}/${CMAKE_INSTALL_BINDIR}/protoc) - -function(protobuf_generate comp c_var h_var) - if(NOT ARGN) - message(SEND_ERROR "Error: protobuf_generate() called without any proto files") - return() - endif() - set(${c_var}) - set(${h_var}) - - foreach(file ${ARGN}) - get_filename_component(abs_file ${file} ABSOLUTE) - get_filename_component(file_name ${file} NAME_WE) - get_filename_component(file_dir ${abs_file} PATH) - get_filename_component(parent_subdir ${file_dir} NAME) - - if("${parent_subdir}" STREQUAL "proto") - set(proto_output_path ${CMAKE_BINARY_DIR}/proto/${comp}/proto) - else() - set(proto_output_path ${CMAKE_BINARY_DIR}/proto/${comp}/proto/${parent_subdir}) - endif() - list(APPEND ${c_var} "${proto_output_path}/${file_name}.pb.cc") - list(APPEND ${h_var} "${proto_output_path}/${file_name}.pb.h") - - add_custom_command( - OUTPUT "${proto_output_path}/${file_name}.pb.cc" "${proto_output_path}/${file_name}.pb.h" - WORKING_DIRECTORY ${PROJECT_SOURCE_DIR} - COMMAND ${CMAKE_COMMAND} -E make_directory "${proto_output_path}" - COMMAND ${protoc_EXECUTABLE} -I${file_dir} --cpp_out=${proto_output_path} ${abs_file} - DEPENDS protoc_build ${abs_file} - COMMENT "Running C++ protocol buffer compiler on ${file}" VERBATIM ) - endforeach() - - set_source_files_properties(${${c_var}} ${${h_var}} PROPERTIES GENERATED TRUE) - set(${c_var} ${${c_var}} PARENT_SCOPE) - set(${h_var} ${${h_var}} PARENT_SCOPE) - -endfunction() - -function(protobuf_generate_py comp py_var) - if(NOT ARGN) - 
message(SEND_ERROR "Error: protobuf_generate_py() called without any proto files") - return() - endif() - set(${py_var}) - - foreach(file ${ARGN}) - get_filename_component(abs_file ${file} ABSOLUTE) - get_filename_component(file_name ${file} NAME_WE) - get_filename_component(file_dir ${abs_file} PATH) - get_filename_component(parent_subdir ${file_dir} NAME) - - if("${parent_subdir}" STREQUAL "proto") - set(proto_output_path ${CMAKE_BINARY_DIR}/proto/${comp}/proto) - else() - set(proto_output_path ${CMAKE_BINARY_DIR}/proto/${comp}/proto/${parent_subdir}) - endif() - list(APPEND ${py_var} "${proto_output_path}/${file_name}_pb2.py") - - add_custom_command( - OUTPUT "${proto_output_path}/${file_name}_pb2.py" - WORKING_DIRECTORY ${PROJECT_SOURCE_DIR} - COMMAND ${CMAKE_COMMAND} -E make_directory "${proto_output_path}" - COMMAND ${protoc_EXECUTABLE} -I${file_dir} --python_out=${proto_output_path} ${abs_file} - DEPENDS protoc_build ${abs_file} - COMMENT "Running PYTHON protocol buffer compiler on ${file}" VERBATIM ) - endforeach() - - set_source_files_properties(${${py_var}} PROPERTIES GENERATED TRUE) - set(${py_var} ${${py_var}} PARENT_SCOPE) - -endfunction() - -#set(HAVE_PROTOC TRUE CACHE BOOL "protoc build add") -set(HAVE_PROTOC TRUE) +if (HAVE_PROTOC) + return() +endif() + +include(ExternalProject) +include(GNUInstallDirs) +#set(CMAKE_INSTALL_PREFIX ${GE_CODE_DIR}/output) + +if ((${CMAKE_INSTALL_PREFIX} STREQUAL /usr/local) OR + (${CMAKE_INSTALL_PREFIX} STREQUAL "C:/Program Files (x86)/ascend")) + set(CMAKE_INSTALL_PREFIX ${GE_CODE_DIR}/output CACHE STRING "path for install()" FORCE) + message(STATUS "No install prefix selected, default to ${CMAKE_INSTALL_PREFIX}.") +endif() + +if(GE_PB_PKG) + set(REQ_URL "${GE_PB_PKG}/libs/protobuf/v3.8.0.tar.gz") +else() + if (ENABLE_GITEE) + set(REQ_URL "https://gitee.com/mirrors/protobuf_source/repository/archive/v3.8.0.tar.gz") + set(MD5 "eba86ae9f07ba5cfbaf8af3bc4e84236") + else() + set(REQ_URL 
"https://github.com/protocolbuffers/protobuf/archive/v3.8.0.tar.gz") + set(MD5 "3d9e32700639618a4d2d342c99d4507a") + endif () +endif() + +set(protobuf_CXXFLAGS "-Wno-maybe-uninitialized -Wno-unused-parameter -fPIC -fstack-protector-all -D_FORTIFY_SOURCE=2 -D_GLIBCXX_USE_CXX11_ABI=0 -O2") +set(protobuf_LDFLAGS "-Wl,-z,relro,-z,now,-z,noexecstack") +ExternalProject_Add(protoc_build + URL ${REQ_URL} + #URL /home/txd/workspace/linux_cmake/pkg/protobuf-3.8.0.tar.gz + #SOURCE_DIR ${GE_CODE_DIR}/../third_party/protobuf/src/protobuf-3.8.0 + TLS_VERIFY OFF + CONFIGURE_COMMAND ${CMAKE_COMMAND} -Dprotobuf_WITH_ZLIB=OFF -Dprotobuf_BUILD_TESTS=OFF -DBUILD_SHARED_LIBS=OFF -DCMAKE_CXX_FLAGS=${protobuf_CXXFLAGS} -DCMAKE_CXX_LDFLAGS=${protobuf_LDFLAGS} -DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX}/protoc /cmake + BUILD_COMMAND $(MAKE) + INSTALL_COMMAND $(MAKE) install + EXCLUDE_FROM_ALL TRUE +) + +set(PROTOC_PKG_DIR ${CMAKE_INSTALL_PREFIX}/protoc) + +set(protoc_EXECUTABLE ${PROTOC_PKG_DIR}/${CMAKE_INSTALL_BINDIR}/protoc) + +function(protobuf_generate comp c_var h_var) + if(NOT ARGN) + message(SEND_ERROR "Error: protobuf_generate() called without any proto files") + return() + endif() + set(${c_var}) + set(${h_var}) + + foreach(file ${ARGN}) + get_filename_component(abs_file ${file} ABSOLUTE) + get_filename_component(file_name ${file} NAME_WE) + get_filename_component(file_dir ${abs_file} PATH) + get_filename_component(parent_subdir ${file_dir} NAME) + + if("${parent_subdir}" STREQUAL "proto") + set(proto_output_path ${CMAKE_BINARY_DIR}/proto/${comp}/proto) + else() + set(proto_output_path ${CMAKE_BINARY_DIR}/proto/${comp}/proto/${parent_subdir}) + endif() + list(APPEND ${c_var} "${proto_output_path}/${file_name}.pb.cc") + list(APPEND ${h_var} "${proto_output_path}/${file_name}.pb.h") + + add_custom_command( + OUTPUT "${proto_output_path}/${file_name}.pb.cc" "${proto_output_path}/${file_name}.pb.h" + WORKING_DIRECTORY ${PROJECT_SOURCE_DIR} + COMMAND ${CMAKE_COMMAND} -E 
make_directory "${proto_output_path}" + COMMAND ${protoc_EXECUTABLE} -I${file_dir} --cpp_out=${proto_output_path} ${abs_file} + DEPENDS protoc_build ${abs_file} + COMMENT "Running C++ protocol buffer compiler on ${file}" VERBATIM ) + endforeach() + + set_source_files_properties(${${c_var}} ${${h_var}} PROPERTIES GENERATED TRUE) + set(${c_var} ${${c_var}} PARENT_SCOPE) + set(${h_var} ${${h_var}} PARENT_SCOPE) + +endfunction() + +function(protobuf_generate_py comp py_var) + if(NOT ARGN) + message(SEND_ERROR "Error: protobuf_generate_py() called without any proto files") + return() + endif() + set(${py_var}) + + foreach(file ${ARGN}) + get_filename_component(abs_file ${file} ABSOLUTE) + get_filename_component(file_name ${file} NAME_WE) + get_filename_component(file_dir ${abs_file} PATH) + get_filename_component(parent_subdir ${file_dir} NAME) + + if("${parent_subdir}" STREQUAL "proto") + set(proto_output_path ${CMAKE_BINARY_DIR}/proto/${comp}/proto) + else() + set(proto_output_path ${CMAKE_BINARY_DIR}/proto/${comp}/proto/${parent_subdir}) + endif() + list(APPEND ${py_var} "${proto_output_path}/${file_name}_pb2.py") + + add_custom_command( + OUTPUT "${proto_output_path}/${file_name}_pb2.py" + WORKING_DIRECTORY ${PROJECT_SOURCE_DIR} + COMMAND ${CMAKE_COMMAND} -E make_directory "${proto_output_path}" + COMMAND ${protoc_EXECUTABLE} -I${file_dir} --python_out=${proto_output_path} ${abs_file} + DEPENDS protoc_build ${abs_file} + COMMENT "Running PYTHON protocol buffer compiler on ${file}" VERBATIM ) + endforeach() + + set_source_files_properties(${${py_var}} PROPERTIES GENERATED TRUE) + set(${py_var} ${${py_var}} PARENT_SCOPE) + +endfunction() + +#set(HAVE_PROTOC TRUE CACHE BOOL "protoc build add") +set(HAVE_PROTOC TRUE) diff --git a/cmake/external_libs/securec.cmake b/cmake/external_libs/securec.cmake index 0bd62ab2..0f8b6d3a 100755 --- a/cmake/external_libs/securec.cmake +++ b/cmake/external_libs/securec.cmake @@ -10,11 +10,20 @@ if ((${CMAKE_INSTALL_PREFIX} STREQUAL 
/usr/local) OR message(STATUS "No install prefix selected, default to ${CMAKE_INSTALL_PREFIX}.") endif() +if (GE_PB_PKG) + set(REQ_URL "${GE_PB_PKG}/libs/securec/v1.1.10.tar.gz") + set(MD5 "") +else() + set(REQ_URL "https://gitee.com/openeuler/libboundscheck/repository/archive/v1.1.10.tar.gz") + set(MD5 "") +endif () + ExternalProject_Add(c_sec_build - URL https://gitee.com/openeuler/libboundscheck/repository/archive/v1.1.10.tar.gz - #URL /home/txd/workspace/linux_cmake/pkg/protobuf-3.8.0.tar.gz + URL ${REQ_URL} + #URL https://gitee.com/openeuler/libboundscheck/repository/archive/v1.1.10.tar.gz #SOURCE_DIR ${GE_CODE_DIR}/../libc_sec PATCH_COMMAND patch -p1 < ${GE_CODE_DIR}/metadef/third_party/patch/securec/0001-add-securec-cmake-script.patch + TLS_VERIFY OFF CONFIGURE_COMMAND ${CMAKE_COMMAND} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} diff --git a/ge/CMakeLists.txt b/ge/CMakeLists.txt index 88a5c52f..59b804d8 100755 --- a/ge/CMakeLists.txt +++ b/ge/CMakeLists.txt @@ -60,6 +60,8 @@ set(TRAIN_SRC_LIST "common/dump/dump_manager.cc" "common/dump/dump_properties.cc" "common/dump/dump_op.cc" + "common/profiling/ge_profiling.cc" + "common/profiling/ge_runner_profiling.cc" "engine_manager/dnnengine_manager.cc" "ge_local_engine/engine/host_cpu_engine.cc" "generator/ge_generator.cc" @@ -201,6 +203,7 @@ set(TRAIN_SRC_LIST "host_kernels/sub_kernel.cc" "host_kernels/transdata_kernel.cc" "host_kernels/unpack_kernel.cc" + "host_kernels/reformat_kernel.cc" "graph/passes/folding_pass.cc" "graph/passes/get_original_format_pass.cc" "graph/passes/guarantee_const_pass.cc" @@ -331,7 +334,6 @@ set(TRAIN_SRC_LIST "hybrid/hybrid_davinci_model.cc" "executor/ge_executor.cc" "client/ge_api.cc" - "client/ge_prof.cc" "analyzer/analyzer.cc" "ir_build/ge_ir_build.cc" "ir_build/atc_ir_common.cc" @@ -487,6 +489,7 @@ set(INFER_SRC_LIST "host_kernels/slice_d_kernel.cc" "host_kernels/dynamic_stitch_kernel.cc" "host_kernels/identity_kernel.cc" + 
"host_kernels/reformat_kernel.cc" "graph/passes/stop_gradient_pass.cc" "graph/passes/prevent_gradient_pass.cc" "graph/passes/identity_pass.cc" @@ -602,7 +605,7 @@ set(INFER_SRC_LIST if (NOT ENABLE_D AND NOT ENABLE_ACL AND NOT ENABLE_MS_TESTCASES) ############ libge_runner.so ############ -add_library(ge_runner SHARED ${TRAIN_SRC_LIST} ${PROTO_SRCS} ${PROTO_CLIENT_SRCS}) +add_library(ge_runner SHARED ${TRAIN_SRC_LIST} ${PROTO_SRCS} ${PROTO_CLIENT_SRCS} $<TARGET_OBJECTS:msprofiler_fwk_obj>) target_compile_definitions(ge_runner PRIVATE PROTOBUF_INLINE_NOT_IN_HEADERS=0 @@ -647,7 +650,6 @@ target_link_libraries(ge_runner $<BUILD_INTERFACE:intf_pub> ge_memory adump_server - msprofiler static_mmpa -Wl,--no-as-needed graph @@ -656,7 +658,6 @@ target_link_libraries(ge_runner register c_sec slog - msprof runtime resource error_manager @@ -781,7 +782,6 @@ target_link_libraries(opensrc_ascendcl PRIVATE c_sec runtime slog - msprof ascend_hal_stub -Wl,--as-needed -lrt @@ -797,12 +797,10 @@ set_target_properties(opensrc_ascendcl PROPERTIES add_custom_command( OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/stub_ge_ir_build.cc ${CMAKE_CURRENT_BINARY_DIR}/stub_ge_api.cc - ${CMAKE_CURRENT_BINARY_DIR}/stub_ge_prof.cc COMMAND echo "Generating stub files." && ${HI_PYTHON} ${CMAKE_CURRENT_LIST_DIR}/stub/gen_stubapi.py ${GE_CODE_DIR}/inc/external ${CMAKE_CURRENT_BINARY_DIR} && mv ge_ir_build.cc stub_ge_ir_build.cc && mv ge_api.cc stub_ge_api.cc - && mv ge_prof.cc stub_ge_prof.cc && echo "Generating stub files end." 
#WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} #DEPENDS stub/gen_stubapi.py ${TOP_DIR}/inc/external ${CMAKE_CURRENT_BINARY_DIR} @@ -811,7 +809,6 @@ add_custom_command( add_custom_target(ge_stub DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/stub_ge_ir_build.cc ${CMAKE_CURRENT_BINARY_DIR}/stub_ge_api.cc - ${CMAKE_CURRENT_BINARY_DIR}/stub_ge_prof.cc ) ################################################################## @@ -853,7 +850,6 @@ target_include_directories(atc_stub_ge_compiler PRIVATE ############ stub/libge_runner.so ############ add_library(fwk_stub_ge_runner SHARED stub_ge_api.cc - stub_ge_prof.cc stub_ge_ir_build.cc ) diff --git a/ge/client/ge_api.cc b/ge/client/ge_api.cc index 9ecc3016..66958310 100644 --- a/ge/client/ge_api.cc +++ b/ge/client/ge_api.cc @@ -134,7 +134,7 @@ Status GEInitialize(const std::map &options) { Status GEInitialize(const std::map &options) { std::map str_options; - for (auto & option : options) { + for (auto &option : options) { if (option.first.GetString() == nullptr || option.second.GetString() == nullptr) { GELOGE(FAILED, "GEInitialize options is nullptr."); return FAILED; diff --git a/ge/client/ge_prof.cc b/ge/client/ge_prof.cc deleted file mode 100644 index ede38430..00000000 --- a/ge/client/ge_prof.cc +++ /dev/null @@ -1,369 +0,0 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "ge/ge_prof.h" -#include "ge/ge_api.h" -#include "init/gelib.h" -#include "common/debug/log.h" -#include "framework/common/debug/ge_log.h" -#include "common/profiling/profiling_manager.h" -#include "graph/load/graph_loader.h" -#include "toolchain/prof_acl_api.h" - -using std::map; -using std::string; -using std::vector; - -namespace { -const uint32_t kMaxDeviceNum = 64; -const uint32_t kDeviceListIndex = 3; -const std::string kProfilingInit = "prof_init"; -const std::string kProfilingFinalize = "prof_finalize"; -const std::string kProfilingStart = "prof_start"; -const std::string kProfilingStop = "prof_stop"; -const std::string kDeviceNums = "devNums"; -const std::string kDeviceIdList = "devIdList"; -const std::string kAicoreMetrics = "aicoreMetrics"; - -const std::map kProfAicoreMetricsToString = { - {ge::kAicoreArithmaticThroughput, "AICORE_ARITHMATIC_THROUGHPUT"}, - {ge::kAicorePipeline, "AICORE_PIPELINE"}, - {ge::kAicoreSynchronization, "AICORE_SYNCHRONIZATION"}, - {ge::kAicoreMemory, "AICORE_MEMORY"}, - {ge::kAicoreInternalMemory, "AICORE_INTERNAL_MEMORY"}, - {ge::kAicoreStall, "AICORE_STALL"}}; -} // namespace - -static bool g_graph_prof_init_ = false; -static std::mutex g_prof_mutex_; - -namespace ge { -struct aclgrphProfConfig { - ProfConfig config; -}; - -Status aclgrphProfInit(const char *profiler_path, uint32_t length) { - GELOGT(TRACE_INIT, "Graph prof init start"); - - std::shared_ptr instance_ptr = ge::GELib::GetInstance(); - if (instance_ptr == nullptr || !instance_ptr->InitFlag()) { - GELOGE(GE_CLI_GE_NOT_INITIALIZED, "Ge client is not initialized."); - return FAILED; - } - - std::lock_guard lock(g_prof_mutex_); - if (g_graph_prof_init_) { - GELOGW("Multi graph profiling initializations."); - return GE_PROF_MULTI_INIT; - } - - Status ret = CheckPath(profiler_path, length); - if (ret != SUCCESS) { - GELOGE(ret, "Profiling config path is invalid."); - return ret; - } - // if command mode is set, just return - if 
(ProfilingManager::Instance().ProfilingOn()) { - GELOGW("Graph prof init failed, cause profiling command pattern is running."); - return GE_PROF_MODE_CONFLICT; - } - - ret = ProfInit(profiler_path); - if (ret != SUCCESS) { - GELOGE(ret, "ProfInit init fail"); - return ret; - } - - GraphLoader graph_loader; - Command command; - command.cmd_params.clear(); - command.cmd_type = kProfilingInit; - command.module_index = PROF_MODEL_LOAD; - ret = graph_loader.CommandHandle(command); - if (ret != SUCCESS) { - GELOGE(ret, "Handle profiling command %s failed, config = %s", kProfilingInit.c_str(), profiler_path); - return ret; - } - if (!g_graph_prof_init_) { - g_graph_prof_init_ = true; - GELOGI("Profiling init successfully."); - } - - GELOGI("Successfully execute GraphProfInit."); - return SUCCESS; -} - -Status aclgrphProfFinalize() { - std::shared_ptr instance_ptr = ge::GELib::GetInstance(); - if (instance_ptr == nullptr || !instance_ptr->InitFlag()) { - GELOGE(GE_CLI_GE_NOT_INITIALIZED, "Ge client is not initialized."); - return FAILED; - } - std::lock_guard lock(g_prof_mutex_); - // if command mode is set, just return - if (ProfilingManager::Instance().ProfilingOn()) { - GELOGW("Graph prof finalize failed, cause profiling command pattern is running."); - return GE_PROF_MODE_CONFLICT; - } - - if (!g_graph_prof_init_) { - GELOGE(GE_PROF_NOT_INIT, "Graph not profiling initialize."); - return GE_PROF_NOT_INIT; - } - GraphLoader graph_loader; - Command command; - command.cmd_params.clear(); - command.cmd_type = kProfilingFinalize; - Status ret = graph_loader.CommandHandle(command); - if (ret != SUCCESS) { - GELOGE(ret, "Handle profiling command %s failed.", kProfilingFinalize.c_str()); - return ret; - } - - ret = ProfFinalize(); - if (ret != SUCCESS) { - GELOGE(ret, "Finalize profiling failed, result = %d", ret); - } - - if (ret == SUCCESS) { - g_graph_prof_init_ = false; - GELOGI("Successfully execute GraphProfFinalize."); - } - return ret; -} - -bool 
TransProfConfigToParam(const aclgrphProfConfig *profiler_config, vector &prof_config_params) { - prof_config_params.clear(); - prof_config_params.emplace_back(kDeviceNums); - prof_config_params.emplace_back(std::to_string(profiler_config->config.devNums)); - prof_config_params.emplace_back(kDeviceIdList); - std::string devID = ""; - if (profiler_config->config.devNums == 0) { - GELOGW("The device num is invalid."); - return false; - } - for (uint32_t i = 0; i < profiler_config->config.devNums; i++) { - devID.append(std::to_string(profiler_config->config.devIdList[i])); - if (i != profiler_config->config.devNums - 1) { - devID.append(","); - } - } - - prof_config_params.push_back(devID); - prof_config_params.push_back(kAicoreMetrics); - auto iter = - kProfAicoreMetricsToString.find(static_cast(profiler_config->config.aicoreMetrics)); - if (iter == kProfAicoreMetricsToString.end()) { - GELOGW("The prof aicore metrics is invalid."); - return false; - } - prof_config_params.push_back(iter->second); - return true; -} - -bool isProfConfigValid(const uint32_t *deviceid_list, uint32_t device_nums) { - if (deviceid_list == nullptr) { - GELOGE(PARAM_INVALID, "deviceIdList is nullptr"); - return false; - } - if (device_nums == 0 || device_nums > kMaxDeviceNum) { - GELOGE(PARAM_INVALID, "The device nums is invalid."); - return false; - } - - // real device num - int32_t dev_count = 0; - rtError_t rt_err = rtGetDeviceCount(&dev_count); - if (rt_err != RT_ERROR_NONE) { - GELOGE(INTERNAL_ERROR, "Get the Device count fail."); - return false; - } - - if (device_nums > static_cast(dev_count)) { - GELOGE(PARAM_INVALID, "Device num(%u) is not in range 1 ~ %d.", device_nums, dev_count); - return false; - } - - std::unordered_set record; - for (size_t i = 0; i < device_nums; ++i) { - uint32_t dev_id = deviceid_list[i]; - if (dev_id >= static_cast(dev_count)) { - GELOGE(PARAM_INVALID, "Device id %u is not in range 0 ~ %d(exclude %d)", dev_id, dev_count, dev_count); - return false; - } - 
if (record.count(dev_id) > 0) { - GELOGE(PARAM_INVALID, "Device id %u is duplicatedly set", dev_id); - return false; - } - record.insert(dev_id); - } - return true; -} - -aclgrphProfConfig *aclgrphProfCreateConfig(uint32_t *deviceid_list, uint32_t device_nums, - ProfilingAicoreMetrics aicore_metrics, ProfAicoreEvents *aicore_events, - uint64_t data_type_config) { - if (!isProfConfigValid(deviceid_list, device_nums)) { - return nullptr; - } - aclgrphProfConfig *config = new (std::nothrow) aclgrphProfConfig(); - if (config == nullptr) { - GELOGE(INTERNAL_ERROR, "new aclgrphProfConfig fail"); - return nullptr; - } - config->config.devNums = device_nums; - if (memcpy_s(config->config.devIdList, sizeof(config->config.devIdList), deviceid_list, - device_nums * sizeof(uint32_t)) != EOK) { - GELOGE(INTERNAL_ERROR, "copy devID failed. size = %u", device_nums); - delete config; - return nullptr; - } - - config->config.aicoreMetrics = static_cast(aicore_metrics); - config->config.dataTypeConfig = data_type_config; - GELOGI("Successfully create prof config."); - return config; -} - -Status aclgrphProfDestroyConfig(aclgrphProfConfig *profiler_config) { - if (profiler_config == nullptr) { - GELOGE(PARAM_INVALID, "destroy profilerConfig failed, profilerConfig must not be nullptr"); - return PARAM_INVALID; - } - - delete profiler_config; - GELOGI("Successfully destroy prof config."); - return SUCCESS; -} - -Status aclgrphProfStart(aclgrphProfConfig *profiler_config) { - if (profiler_config == nullptr) { - GELOGE(PARAM_INVALID, "aclgrphProfConfig is invalid."); - return FAILED; - } - std::shared_ptr instance_ptr = ge::GELib::GetInstance(); - if (instance_ptr == nullptr || !instance_ptr->InitFlag()) { - GELOGE(GE_CLI_GE_NOT_INITIALIZED, "Ge client is not initialized."); - return FAILED; - } - - std::lock_guard lock(g_prof_mutex_); - // if command mode is set, just return - if (ProfilingManager::Instance().ProfilingOn()) { - GELOGW("Graph prof finalize failed, cause profiling command 
pattern is running."); - return GE_PROF_MODE_CONFLICT; - } - if (!g_graph_prof_init_) { - GELOGE(GE_PROF_NOT_INIT, "Graph not profiling initialize."); - return GE_PROF_NOT_INIT; - } - - Status ret = ProfStartProfiling(&profiler_config->config); - if (ret != SUCCESS) { - GELOGE(ret, "Start profiling failed, prof result = %d", ret); - return FAILED; - } - - std::vector prof_params; - if (!TransProfConfigToParam(profiler_config, prof_params)) { - GELOGE(PARAM_INVALID, "Transfer profilerConfig to string vector failed"); - return PARAM_INVALID; - } - - GraphLoader graph_loader; - Command command; - command.cmd_params.clear(); - command.cmd_type = kProfilingStart; - command.cmd_params = prof_params; - command.module_index = profiler_config->config.dataTypeConfig; - GELOGI("Profiling will start, device nums:%s , deviceID:[%s], data type config: 0x%llx", prof_params[0].c_str(), - prof_params[kDeviceListIndex].c_str(), command.module_index); - ret = graph_loader.CommandHandle(command); - if (ret != SUCCESS) { - GELOGE(ret, "Handle profiling command failed"); - return FAILED; - } - - GELOGI("Successfully execute GraphProfStartProfiling."); - - return SUCCESS; -} - -Status aclgrphProfStop(aclgrphProfConfig *profiler_config) { - if (profiler_config == nullptr) { - GELOGE(PARAM_INVALID, "aclgrphProfConfig is invalid."); - return FAILED; - } - std::shared_ptr instance_ptr = ge::GELib::GetInstance(); - if (instance_ptr == nullptr || !instance_ptr->InitFlag()) { - GELOGE(GE_CLI_GE_NOT_INITIALIZED, "Ge client is not initialized."); - return FAILED; - } - - std::lock_guard lock(g_prof_mutex_); - // if command mode is set, just return - if (ProfilingManager::Instance().ProfilingOn()) { - GELOGW("Graph prof finalize failed, cause profiling command pattern is running."); - return GE_PROF_MODE_CONFLICT; - } - if (!g_graph_prof_init_) { - GELOGE(GE_PROF_NOT_INIT, "Graph not profiling initialize."); - return GE_PROF_NOT_INIT; - } - - for (uint32_t i = 0; i < 
profiler_config->config.devNums; i++) { - uint64_t data_type_config; - Status status = ProfGetDataTypeConfig(profiler_config->config.devIdList[i], data_type_config); - if (status != SUCCESS) { - GELOGE(status, "Prof get data type config failed, prof result = %d", status); - return status; - } - if (data_type_config != profiler_config->config.dataTypeConfig) { - GELOGE(FAILED, "data type config verify failed"); - return FAILED; - } - } - - std::vector prof_params; - if (!TransProfConfigToParam(profiler_config, prof_params)) { - GELOGE(PARAM_INVALID, "Transfer profilerConfig to string vector failed"); - return PARAM_INVALID; - } - - GraphLoader graph_loader; - Command command; - command.cmd_params.clear(); - command.cmd_type = kProfilingStop; - command.cmd_params = prof_params; - command.module_index = profiler_config->config.dataTypeConfig; - GELOGI("Profiling will stop, device nums:%s , deviceID:[%s], data type config: 0x%llx", prof_params[0].c_str(), - prof_params[kDeviceListIndex].c_str(), command.module_index); - Status ret = graph_loader.CommandHandle(command); - if (ret != SUCCESS) { - GELOGE(ret, "Handle profiling command failed"); - return FAILED; - } - - ret = ProfStopProfiling(&profiler_config->config); - if (ret != SUCCESS) { - GELOGE(ret, "Stop profiling failed, prof result = %d", ret); - return ret; - } - - GELOGI("Successfully execute GraphProfStopProfiling."); - return SUCCESS; -} -} // namespace ge diff --git a/ge/client/module.mk b/ge/client/module.mk index 6ac69d31..e9d35418 100644 --- a/ge/client/module.mk +++ b/ge/client/module.mk @@ -4,7 +4,6 @@ LOCAL_PATH := $(call my-dir) COMMON_LOCAL_SRC_FILES := \ proto/ge_api.proto \ ge_api.cc \ - ge_prof.cc \ COMMON_LOCAL_C_INCLUDES := \ @@ -69,9 +68,9 @@ LOCAL_SHARED_LIBRARIES := \ libgraph \ libregister \ libge_compiler \ - libge_common \ - libmsprof + libge_common +LOCAL_STATIC_LIBRARIES += libmsprofiler_fwk \ LOCAL_LDFLAGS := -lrt -ldl @@ -104,8 +103,10 @@ LOCAL_SHARED_LIBRARIES := \ libregister \ 
libruntime \ libge_compiler \ - libge_common \ - libmsprof + libge_common + + +LOCAL_STATIC_LIBRARIES += libmsprofiler_fwk \ LOCAL_LDFLAGS := -lrt -ldl diff --git a/ge/common/CMakeLists.txt b/ge/common/CMakeLists.txt index aa546c0d..d196995c 100755 --- a/ge/common/CMakeLists.txt +++ b/ge/common/CMakeLists.txt @@ -24,6 +24,7 @@ set(SRC_LIST "helper/om_file_helper.cc" "helper/model_helper.cc" "../model/ge_model.cc" + "../model/ge_root_model.cc" "auth/file_saver.cc" "fp16_t.cc" "math/fp16_math.cc" diff --git a/ge/common/auth/file_saver.cc b/ge/common/auth/file_saver.cc index 7b41397a..e708653a 100755 --- a/ge/common/auth/file_saver.cc +++ b/ge/common/auth/file_saver.cc @@ -54,8 +54,8 @@ Status FileSaver::OpenFile(int32_t &fd, const std::string &file_path) { Status FileSaver::WriteData(const void *data, uint32_t size, int32_t fd) { GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(size == 0 || data == nullptr, return PARAM_INVALID); mmSsize_t write_count; - uint32_t size_2g = ((uint32_t) 0x1 << 31); - uint32_t size_1g = ((uint32_t) 0x1 << 30); + uint32_t size_2g = 2147483648; // 0x1 << 31 + uint32_t size_1g = 1073741824; // 0x1 << 30 // Write data if (size > size_2g) { auto seek = reinterpret_cast(const_cast(data)); @@ -258,6 +258,65 @@ FileSaver::SaveToFile(const string &file_path, ModelFileHeader &file_header, Mod return SUCCESS; } +FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status +FileSaver::SaveToFile(const string &file_path, ModelFileHeader &file_header, + vector &model_partition_tables, + const vector> &all_partition_datas) { + file_header.is_encrypt = ModelEncryptType::UNENCRYPTED; + + const Status ret = SaveWithFileHeader(file_path, file_header, model_partition_tables, all_partition_datas); + GE_CHK_BOOL_RET_STATUS(ret == SUCCESS, FAILED, "save file failed, file_path:%s, file header len:%u.", + file_path.c_str(), file_header.length); + return SUCCESS; +} + +Status FileSaver::SaveWithFileHeader(const std::string &file_path, const ModelFileHeader &file_header, + vector 
&model_partition_tables, + const vector> &all_partition_datas) { + + GE_CHK_BOOL_EXEC(model_partition_tables.size() == all_partition_datas.size(), + return PARAM_INVALID, + "model table size %zu does not match partition size %zu", + model_partition_tables.size(), all_partition_datas.size()) + for (size_t index = 0; index < model_partition_tables.size(); ++index) { + auto &cur_partiton_data = all_partition_datas[index]; + auto &cur_model_partition_table = *model_partition_tables[index]; + GE_CHK_BOOL_RET_STATUS(!cur_partiton_data.empty() && cur_model_partition_table.num != 0 + && cur_model_partition_table.num == cur_partiton_data.size(), FAILED, + "Invalid param:partition data size is (%u), model_partition_table.num is (%zu).", + cur_model_partition_table.num, cur_partiton_data.size()); + } + + // Open file + int32_t fd = 0; + GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(OpenFile(fd, file_path) != SUCCESS, return FAILED); + Status ret = SUCCESS; + do { + // Write file header + GE_CHK_BOOL_TRUE_EXEC_WITH_LOG( + WriteData(static_cast(&file_header), sizeof(ModelFileHeader), fd) != SUCCESS, ret = FAILED; + break); + for (size_t index = 0; index < model_partition_tables.size(); ++index) { + // Write model partition table + auto &cur_tabel = *model_partition_tables[index]; + uint32_t table_size = static_cast(SIZE_OF_MODEL_PARTITION_TABLE(cur_tabel)); + GE_CHK_BOOL_TRUE_EXEC_WITH_LOG( + WriteData(static_cast(&cur_tabel), table_size, fd) != SUCCESS, ret = FAILED; break); + // Write partition data + auto &cur_partition_datas = all_partition_datas[index]; + for (const auto &partition_data : cur_partition_datas) { + GELOGI("GC:size[%zu]", partition_data.size); + GE_CHK_BOOL_TRUE_EXEC_WITH_LOG( + WriteData(static_cast(partition_data.data), partition_data.size, fd) != SUCCESS, ret = FAILED; + break); + } + } + } while (0); + // Close file + GE_CHK_BOOL_RET_STATUS(mmClose(fd) == EN_OK, FAILED, "Close file failed."); + return ret; +} + FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status 
FileSaver::SaveToFile(const string &file_path, const void *data, int len) { if (data == nullptr || len <= 0) { diff --git a/ge/common/auth/file_saver.h b/ge/common/auth/file_saver.h index 79e2126e..97fbaae5 100644 --- a/ge/common/auth/file_saver.h +++ b/ge/common/auth/file_saver.h @@ -74,6 +74,10 @@ class FileSaver { ModelPartitionTable &model_partition_table, const std::vector &partition_datas); + static Status SaveToFile(const string &file_path, ModelFileHeader &file_header, + vector &model_partition_tables, + const vector> &all_partition_datas); + static Status SaveToBuffWithFileHeader(const ModelFileHeader &file_header, ModelPartitionTable &model_partition_table, const std::vector &partitionDatas, @@ -108,6 +112,9 @@ class FileSaver { static Status SaveWithFileHeader(const std::string &file_path, const ModelFileHeader &file_header, ModelPartitionTable &model_partition_table, const std::vector &partition_datas); + static Status SaveWithFileHeader(const std::string &file_path, const ModelFileHeader &file_header, + vector &model_partition_tables, + const vector> &all_partition_datas); }; } // namespace ge #endif // GE_COMMON_AUTH_FILE_SAVER_H_ diff --git a/ge/common/base64.h b/ge/common/base64.h index fb6c1870..a537e585 100644 --- a/ge/common/base64.h +++ b/ge/common/base64.h @@ -25,32 +25,38 @@ namespace ge { namespace { -const char* kBase64Chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" - "abcdefghijklmnopqrstuvwxyz" - "0123456789+/"; +const char *kBase64Chars = + "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "abcdefghijklmnopqrstuvwxyz" + "0123456789+/"; const char kEqualSymbol = '='; const size_t kBase64CharsNum = 64; const size_t kThreeByteOneGroup = 3; const size_t kFourByteOneGroup = 4; -} +const size_t kThreeByteOneGroupIndex0 = 0; +const size_t kThreeByteOneGroupIndex1 = 1; +const size_t kThreeByteOneGroupIndex2 = 2; +const size_t kFourByteOneGroupIndex0 = 0; +const size_t kFourByteOneGroupIndex1 = 1; +const size_t kFourByteOneGroupIndex2 = 2; +const size_t kFourByteOneGroupIndex3 
= 3; +} // namespace namespace base64 { -static inline bool IsBase64Char(const char &c) { - return (isalnum(c) || (c == '+') || (c == '/')); -} +static inline bool IsBase64Char(const char &c) { return (isalnum(c) || (c == '+') || (c == '/')); } static std::string EncodeToBase64(const std::string &raw_data) { size_t encode_length = raw_data.size() / kThreeByteOneGroup * kFourByteOneGroup; encode_length += raw_data.size() % kThreeByteOneGroup == 0 ? 0 : kFourByteOneGroup; - size_t raw_data_index = 0 ; + size_t raw_data_index = 0; size_t encode_data_index = 0; std::string encode_data; encode_data.resize(encode_length); for (; raw_data_index + kThreeByteOneGroup <= raw_data.size(); raw_data_index += kThreeByteOneGroup) { auto char_1 = static_cast(raw_data[raw_data_index]); - auto char_2 = static_cast(raw_data[raw_data_index + 1]); - auto char_3 = static_cast(raw_data[raw_data_index + 2]); + auto char_2 = static_cast(raw_data[raw_data_index + kThreeByteOneGroupIndex1]); + auto char_3 = static_cast(raw_data[raw_data_index + kThreeByteOneGroupIndex2]); encode_data[encode_data_index++] = kBase64Chars[char_1 >> 2u]; encode_data[encode_data_index++] = kBase64Chars[((char_1 << 4u) & 0x30) | (char_2 >> 4u)]; encode_data[encode_data_index++] = kBase64Chars[((char_2 << 2u) & 0x3c) | (char_3 >> 6u)]; @@ -80,8 +86,7 @@ static std::string EncodeToBase64(const std::string &raw_data) { #pragma GCC diagnostic ignored "-Wunused-function" static Status DecodeFromBase64(const std::string &base64_data, std::string &decode_data) { if (base64_data.size() % kFourByteOneGroup != 0) { - GELOGE(PARAM_INVALID, "base64 data size must can be divided by 4, but given data size is %zu", - base64_data.size()); + GELOGE(PARAM_INVALID, "base64 data size must can be divided by 4, but given data size is %zu", base64_data.size()); return PARAM_INVALID; } decode_data.clear(); @@ -92,10 +97,10 @@ static Status DecodeFromBase64(const std::string &base64_data, std::string &deco return 
static_cast(std::distance(kBase64Chars, char_pos)) & 0xff; }; - for (std::size_t input_data_index = 0; input_data_index < base64_data_len; input_data_index += 4) { + for (std::size_t input_data_index = 0; input_data_index < base64_data_len; input_data_index += kFourByteOneGroup) { for (size_t i = 0; i < kFourByteOneGroup; ++i) { if (base64_data[input_data_index + i] == kEqualSymbol && - input_data_index >= base64_data_len - 4 && i > 1) { + input_data_index >= base64_data_len - kFourByteOneGroup && i > 1) { byte_4[i] = kBase64CharsNum; } else if (IsBase64Char(base64_data[input_data_index + i])) { byte_4[i] = FindCharInBase64Chars(base64_data[input_data_index + i]); @@ -104,19 +109,23 @@ static Status DecodeFromBase64(const std::string &base64_data, std::string &deco return PARAM_INVALID; } } - decode_data += static_cast((byte_4[0] << 2u) + ((byte_4[1] & 0x30) >> 4u)); - if (byte_4[2] >= kBase64CharsNum){ + decode_data += + static_cast((byte_4[kFourByteOneGroupIndex0] << 2u) + ((byte_4[kFourByteOneGroupIndex1] & 0x30) >> 4u)); + if (byte_4[kFourByteOneGroupIndex2] >= kBase64CharsNum) { break; - } else if (byte_4[3] >= kBase64CharsNum) { - decode_data += static_cast(((byte_4[1] & 0x0f) << 4u) + ((byte_4[2] & 0x3c) >> 2u)); + } else if (byte_4[kFourByteOneGroupIndex3] >= kBase64CharsNum) { + decode_data += static_cast(((byte_4[kFourByteOneGroupIndex1] & 0x0f) << 4u) + + ((byte_4[kFourByteOneGroupIndex2] & 0x3c) >> 2u)); break; } - decode_data += static_cast(((byte_4[1] & 0x0f) << 4u) + ((byte_4[2] & 0x3c) >> 2u)); - decode_data += static_cast(((byte_4[2] & 0x03) << 6u) + byte_4[3]); + decode_data += static_cast(((byte_4[kFourByteOneGroupIndex1] & 0x0f) << 4u) + + ((byte_4[kFourByteOneGroupIndex2] & 0x3c) >> 2u)); + decode_data += + static_cast(((byte_4[kFourByteOneGroupIndex2] & 0x03) << 6u) + byte_4[kFourByteOneGroupIndex3]); } return SUCCESS; } #pragma GCC diagnostic pop -} +} // namespace base64 } // namespace ge #endif // GE_COMMON_BASE64_H_ \ No newline at end of 
file diff --git a/ge/common/debug/memory_dumper.cc b/ge/common/debug/memory_dumper.cc index 872fe1da..527f0bb2 100644 --- a/ge/common/debug/memory_dumper.cc +++ b/ge/common/debug/memory_dumper.cc @@ -139,7 +139,8 @@ int MemoryDumper::OpenFile(const char *filename) { GE_IF_BOOL_EXEC( -1 != path_split_pos, string prefix_path = std::string(filename).substr(0, path_split_pos); string last_path = std::string(filename).substr(path_split_pos, strlen(filename) - 1); - GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(prefix_path.length() >= MMPA_MAX_PATH, return kInvalidFd, "Prefix path is too long!"); + GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(prefix_path.length() >= MMPA_MAX_PATH, + return kInvalidFd, "Prefix path is too long!"); GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(mmRealPath(prefix_path.c_str(), tmp_path, MMPA_MAX_PATH) != EN_OK, return kInvalidFd, "Dir %s does not exit.", prefix_path.c_str()); real_path = std::string(tmp_path) + last_path;) diff --git a/ge/common/formats/format_transfers/format_transfer_fractal_nz.cc b/ge/common/formats/format_transfers/format_transfer_fractal_nz.cc index ed1c6941..cb528453 100755 --- a/ge/common/formats/format_transfers/format_transfer_fractal_nz.cc +++ b/ge/common/formats/format_transfers/format_transfer_fractal_nz.cc @@ -23,12 +23,30 @@ #include "common/formats/utils/formats_trans_utils.h" #include "framework/common/debug/ge_log.h" #include "framework/common/debug/log.h" +#include "framework/common/types.h" #include "graph/utils/type_utils.h" namespace ge { namespace formats { namespace { const int kDimSize4D = 4; + +const size_t kSingleDim = 1; + +const size_t kNdDimIndexN = 0; +const size_t kNdDimIndexH = 1; +const size_t kNdDimIndexW = 2; + +const size_t kDimDValueBNdFNz = 2; // dim d-value between Nd and FractalZz + +const size_t kNdDimCountBackwardsW = 1; +const size_t kNdDimCountBackwardsWH = 2; + +const size_t kFNzDimCountBackwardsW0 = 1; +const size_t kFNzDimCountBackwardsW0H0 = 2; +const size_t kFNzDimCountBackwardsW0H0H1 = 3; +const size_t 
kFNzDimCountBackwardsW0H0H1W1 = 4; + bool IsDataTypeSupport(DataType data_type) { return GetSizeByDataType(data_type) > 0; } using ShapeVector = std::vector; @@ -60,14 +78,14 @@ Status TransShapeToFracNz(const ShapeVector &src_shape, DataType data_type, Shap auto w0 = GetCubeSizeByDataType(data_type); int64_t h0 = kCubeSize; switch (src_shape.size()) { - case 1: - dst_shape.push_back(Ceil(src_shape[0], w0)); - dst_shape.push_back(1); + case kSingleDim: + dst_shape.push_back(Ceil(src_shape[kNdDimIndexN], w0)); + dst_shape.push_back(DIM_DEFAULT_VALUE); dst_shape.push_back(h0); dst_shape.push_back(w0); - hw_shape.push_back(1); - hw_shape.push_back(1); - hw_shape.push_back(src_shape[0]); + hw_shape.push_back(DIM_DEFAULT_VALUE); + hw_shape.push_back(DIM_DEFAULT_VALUE); + hw_shape.push_back(src_shape[kNdDimIndexN]); if (!IsShapeValid(dst_shape)) { GELOGE(PARAM_INVALID, "Failed to check dst shape %s", ShapeToString(dst_shape).c_str()); return PARAM_INVALID; @@ -76,17 +94,17 @@ Status TransShapeToFracNz(const ShapeVector &src_shape, DataType data_type, Shap default: auto size = src_shape.size(); int64_t times = 1; - for (size_t i = 0; i != size - 2; i++) { + for (size_t i = 0; i != size - kDimDValueBNdFNz; i++) { dst_shape.push_back(src_shape[i]); times *= src_shape[i]; } - dst_shape.push_back(Ceil(src_shape[size - 1], w0)); - dst_shape.push_back(Ceil(src_shape[size - 2], h0)); + dst_shape.push_back(Ceil(src_shape[size - kNdDimCountBackwardsW], w0)); + dst_shape.push_back(Ceil(src_shape[size - kNdDimCountBackwardsWH], h0)); dst_shape.push_back(h0); dst_shape.push_back(w0); hw_shape.push_back(times); - hw_shape.push_back(src_shape[size - 2]); - hw_shape.push_back(src_shape[size - 1]); + hw_shape.push_back(src_shape[size - kNdDimCountBackwardsWH]); + hw_shape.push_back(src_shape[size - kNdDimCountBackwardsW]); if (!IsShapeValid(dst_shape)) { GELOGE(PARAM_INVALID, "Failed to check dst shape %s", ShapeToString(dst_shape).c_str()); return PARAM_INVALID; @@ -128,16 +146,16 @@ 
Status TransFormatFromNdToFracNz(const TransArgs &args, TransResult &result, con } // src&dst_shape can be written as times*H*W & times*W1*H1*H0*W0, respectively. dst_shape_size >= kDimNum4D - auto times = hw_shape.at(0); - auto h = hw_shape.at(1); - auto w = hw_shape.at(2); + auto times = hw_shape.at(kNdDimIndexN); + auto h = hw_shape.at(kNdDimIndexH); + auto w = hw_shape.at(kNdDimIndexW); auto hw = h * w; auto shape_size = args.dst_shape.size(); - auto w1 = args.dst_shape[shape_size - 4]; - auto h1 = args.dst_shape[shape_size - 3]; - auto h0 = args.dst_shape[shape_size - 2]; - auto w0 = args.dst_shape[shape_size - 1]; + auto w1 = args.dst_shape[shape_size - kFNzDimCountBackwardsW0H0H1W1]; + auto h1 = args.dst_shape[shape_size - kFNzDimCountBackwardsW0H0H1]; + auto h0 = args.dst_shape[shape_size - kFNzDimCountBackwardsW0H0]; + auto w0 = args.dst_shape[shape_size - kFNzDimCountBackwardsW0]; auto h1h0 = h1 * h0; auto h1h0w0 = h1h0 * w0; auto w1h1h0w0 = w1 * h1h0w0; @@ -198,16 +216,16 @@ Status TransFormatFromFracNzToNd(const TransArgs &args, TransResult &result, con return OUT_OF_MEMORY; } - auto times = dst_hw_shape.at(0); - auto h = dst_hw_shape.at(1); - auto w = dst_hw_shape.at(2); + auto times = dst_hw_shape.at(kNdDimIndexN); + auto h = dst_hw_shape.at(kNdDimIndexH); + auto w = dst_hw_shape.at(kNdDimIndexW); auto hw = h * w; auto shape_size = args.src_shape.size(); - auto w1 = args.src_shape[shape_size - 4]; - auto h1 = args.src_shape[shape_size - 3]; - auto h0 = args.src_shape[shape_size - 2]; - auto w0 = args.src_shape[shape_size - 1]; + auto w1 = args.src_shape[shape_size - kFNzDimCountBackwardsW0H0H1W1]; + auto h1 = args.src_shape[shape_size - kFNzDimCountBackwardsW0H0H1]; + auto h0 = args.src_shape[shape_size - kFNzDimCountBackwardsW0H0]; + auto w0 = args.src_shape[shape_size - kFNzDimCountBackwardsW0]; auto h1h0 = h1 * h0; auto h1h0w0 = h1h0 * w0; auto w1h1h0w0 = w1 * h1h0w0; diff --git a/ge/common/formats/format_transfers/format_transfer_fractal_zz.cc 
b/ge/common/formats/format_transfers/format_transfer_fractal_zz.cc index d890e681..88603d5c 100755 --- a/ge/common/formats/format_transfers/format_transfer_fractal_zz.cc +++ b/ge/common/formats/format_transfers/format_transfer_fractal_zz.cc @@ -23,12 +23,29 @@ #include "common/formats/utils/formats_trans_utils.h" #include "framework/common/debug/ge_log.h" #include "framework/common/debug/log.h" +#include "framework/common/types.h" #include "graph/utils/type_utils.h" namespace ge { namespace formats { namespace { const int kDimSize4D = 4; + +const size_t kSingleDim = 1; + +const size_t kNdDimIndexN = 0; +const size_t kNdDimIndexH = 1; +const size_t kNdDimIndexW = 2; + +const size_t kDimDValueBNdFZz = 2; // dim d-value between Nd and FractalZz + +const size_t kNdDimCountBackwardsW = 1; +const size_t kNdDimCountBackwardsWH = 2; + +const size_t kFZzDimCountBackwardsW0 = 1; +const size_t kFZzDimCountBackwardsW0H0 = 2; +const size_t kFZzDimCountBackwardsW0H0W1 = 3; +const size_t kFZzDimCountBackwardsW0H0W1H1 = 4; bool IsDataTypeSupport(DataType d_type) { return GetSizeByDataType(d_type) > 0; } using ShapeVector = std::vector; @@ -40,8 +57,8 @@ bool CheckShape(Format format, const ShapeVector &shape) { case FORMAT_NHWC: return CheckShapeValid(shape, kDimSize4D); default: - std::string error = "Trans format between " + FmtToStr(TypeUtils::FormatToSerialString(format)) + - " and FORMAT_FRACTAL_ZZ is not supported."; + std::string error = "Trans format between " + FmtToStr(TypeUtils::FormatToSerialString(format)) + + " and FORMAT_FRACTAL_ZZ is not supported."; GE_ERRORLOG_AND_ERRORMSG(PARAM_INVALID, error.c_str()); return false; } @@ -60,14 +77,14 @@ Status TransShapeToFracZz(const ShapeVector &src_shape, DataType data_type, Shap auto w0 = GetCubeSizeByDataType(data_type); auto h0 = GetCubeSizeByDataType(data_type); switch (src_shape.size()) { - case 1: - dst_shape.push_back(1); - dst_shape.push_back(Ceil(src_shape[0], w0)); + case kSingleDim: + 
dst_shape.push_back(DIM_DEFAULT_VALUE); + dst_shape.push_back(Ceil(src_shape[kNdDimIndexN], w0)); dst_shape.push_back(h0); dst_shape.push_back(w0); - hw_shape.push_back(1); - hw_shape.push_back(1); - hw_shape.push_back(src_shape[0]); + hw_shape.push_back(DIM_DEFAULT_VALUE); + hw_shape.push_back(DIM_DEFAULT_VALUE); + hw_shape.push_back(src_shape[kNdDimIndexN]); if (!IsShapeValid(dst_shape)) { GELOGE(PARAM_INVALID, "Failed to check dst shape %s", ShapeToString(dst_shape).c_str()); return PARAM_INVALID; @@ -76,17 +93,17 @@ Status TransShapeToFracZz(const ShapeVector &src_shape, DataType data_type, Shap default: auto size = src_shape.size(); int64_t times = 1; - for (size_t i = 0; i != size - 2; i++) { + for (size_t i = 0; i != size - kDimDValueBNdFZz; i++) { dst_shape.push_back(src_shape[i]); times *= src_shape[i]; } - dst_shape.push_back(Ceil(src_shape[size - 2], h0)); - dst_shape.push_back(Ceil(src_shape[size - 1], w0)); + dst_shape.push_back(Ceil(src_shape[size - kNdDimCountBackwardsWH], h0)); + dst_shape.push_back(Ceil(src_shape[size - kNdDimCountBackwardsW], w0)); dst_shape.push_back(h0); dst_shape.push_back(w0); hw_shape.push_back(times); - hw_shape.push_back(src_shape[size - 2]); - hw_shape.push_back(src_shape[size - 1]); + hw_shape.push_back(src_shape[size - kNdDimCountBackwardsWH]); + hw_shape.push_back(src_shape[size - kNdDimCountBackwardsW]); if (!IsShapeValid(dst_shape)) { GELOGE(PARAM_INVALID, "Failed to check dst shape %s", ShapeToString(dst_shape).c_str()); return PARAM_INVALID; @@ -127,16 +144,16 @@ Status TransFormatFromNdToFracZz(const TransArgs &args, TransResult &result, con return OUT_OF_MEMORY; } // The src&dst_shape can be written as times*H*W & times*H1*W1*H0*W0, respectively. 
dst_shape_size >= kDimNum4D - auto times = hw_shape.at(0); - auto h = hw_shape.at(1); - auto w = hw_shape.at(2); + auto times = hw_shape.at(kNdDimIndexN); + auto h = hw_shape.at(kNdDimIndexH); + auto w = hw_shape.at(kNdDimIndexW); auto hw = h * w; auto shape_size = args.dst_shape.size(); - auto h1 = args.dst_shape[shape_size - 4]; - auto w1 = args.dst_shape[shape_size - 3]; - auto h0 = args.dst_shape[shape_size - 2]; - auto w0 = args.dst_shape[shape_size - 1]; + auto h1 = args.dst_shape[shape_size - kFZzDimCountBackwardsW0H0W1H1]; + auto w1 = args.dst_shape[shape_size - kFZzDimCountBackwardsW0H0W1]; + auto h0 = args.dst_shape[shape_size - kFZzDimCountBackwardsW0H0]; + auto w0 = args.dst_shape[shape_size - kFZzDimCountBackwardsW0]; auto h0w0 = h0 * w0; auto w1h0w0 = w1 * h0w0; auto h1w1h0w0 = h1 * w1h0w0; @@ -155,8 +172,8 @@ Status TransFormatFromNdToFracZz(const TransArgs &args, TransResult &result, con auto src_offset = (src_h_head + w1_idx * w0) * size; auto dst_offset = (h0_head + w1_idx * h0w0) * size; auto protected_size = dst_size - dst_offset < static_cast(SECUREC_MEM_MAX_LEN) - ? dst_size - dst_offset - : static_cast(SECUREC_MEM_MAX_LEN); + ? dst_size - dst_offset + : static_cast(SECUREC_MEM_MAX_LEN); auto ret = memcpy_s(dst.get() + dst_offset, static_cast(protected_size), args.data + src_offset, static_cast(size * w0)); if (ret != EOK) { @@ -171,8 +188,8 @@ Status TransFormatFromNdToFracZz(const TransArgs &args, TransResult &result, con auto src_offset = (src_h_head + src_w_idx) * size; auto dst_offset = (w0_head + w0_idx) * size; auto protected_size = dst_size - dst_offset < static_cast(SECUREC_MEM_MAX_LEN) - ? dst_size - dst_offset - : static_cast(SECUREC_MEM_MAX_LEN); + ? 
dst_size - dst_offset + : static_cast(SECUREC_MEM_MAX_LEN); auto ret = memcpy_s(dst.get() + dst_offset, static_cast(protected_size), args.data + src_offset, static_cast(size)); if (ret != EOK) { @@ -205,16 +222,16 @@ Status TransFormatFromFracZzToNd(const TransArgs &args, TransResult &result, con } // The src&dst_shape can be written as times*H*W & times*H1*W1*H0*W0, respectively. dst_shape_size >= kDimNum4D - auto times = dst_hw_shape.at(0); - auto h = dst_hw_shape.at(1); - auto w = dst_hw_shape.at(2); + auto times = dst_hw_shape.at(kNdDimIndexN); + auto h = dst_hw_shape.at(kNdDimIndexH); + auto w = dst_hw_shape.at(kNdDimIndexW); auto hw = h * w; auto shape_size = args.src_shape.size(); - auto h1 = args.src_shape[shape_size - 4]; - auto w1 = args.src_shape[shape_size - 3]; - auto h0 = args.src_shape[shape_size - 2]; - auto w0 = args.src_shape[shape_size - 1]; + auto h1 = args.src_shape[shape_size - kFZzDimCountBackwardsW0H0W1H1]; + auto w1 = args.src_shape[shape_size - kFZzDimCountBackwardsW0H0W1]; + auto h0 = args.src_shape[shape_size - kFZzDimCountBackwardsW0H0]; + auto w0 = args.src_shape[shape_size - kFZzDimCountBackwardsW0]; auto h0w0 = h0 * w0; auto w1h0w0 = w1 * h0w0; auto h1w1h0w0 = h1 * w1h0w0; @@ -233,8 +250,8 @@ Status TransFormatFromFracZzToNd(const TransArgs &args, TransResult &result, con auto src_offset = (h0_head + w1_idx * h0w0) * size; auto dst_offset = (dst_h_head + w1_idx * w0) * size; auto protected_size = dst_size - dst_offset < static_cast(SECUREC_MEM_MAX_LEN) - ? dst_size - dst_offset - : static_cast(SECUREC_MEM_MAX_LEN); + ? 
dst_size - dst_offset + : static_cast(SECUREC_MEM_MAX_LEN); auto ret = memcpy_s(dst.get() + dst_offset, static_cast(protected_size), args.data + src_offset, static_cast(size * w0)); if (ret != EOK) { @@ -249,8 +266,8 @@ Status TransFormatFromFracZzToNd(const TransArgs &args, TransResult &result, con auto dst_w_idx = w1_head + w0_idx; auto dst_offset = (dst_h_head + dst_w_idx) * size; auto protected_size = dst_size - dst_offset < static_cast(SECUREC_MEM_MAX_LEN) - ? dst_size - dst_offset - : static_cast(SECUREC_MEM_MAX_LEN); + ? dst_size - dst_offset + : static_cast(SECUREC_MEM_MAX_LEN); auto ret = memcpy_s(dst.get() + dst_offset, static_cast(protected_size), args.data + src_offset, static_cast(size)); if (ret != EOK) { diff --git a/ge/common/formats/format_transfers/format_transfer_nchw_fz_c04.cc b/ge/common/formats/format_transfers/format_transfer_nchw_fz_c04.cc index a66aeeb4..49b19f46 100644 --- a/ge/common/formats/format_transfers/format_transfer_nchw_fz_c04.cc +++ b/ge/common/formats/format_transfers/format_transfer_nchw_fz_c04.cc @@ -35,7 +35,6 @@ * Padding to (N, ceil(Z/16)*16) * Last Step: View the (N, ceil(Z/16)*16) as 4D (N/16, 16, C/16, 16) and transpose to (C/16, N/16, 16, 16) */ - namespace ge { namespace formats { namespace { diff --git a/ge/common/formats/format_transfers/format_transfer_transpose.cc b/ge/common/formats/format_transfers/format_transfer_transpose.cc index e623d9e7..9be74b1f 100755 --- a/ge/common/formats/format_transfers/format_transfer_transpose.cc +++ b/ge/common/formats/format_transfers/format_transfer_transpose.cc @@ -19,6 +19,7 @@ #include #include +#include "common/formats/utils/formats_definitions.h" #include "common/formats/utils/formats_trans_utils.h" #include "framework/common/debug/ge_log.h" #include "framework/common/debug/log.h" @@ -29,21 +30,21 @@ namespace formats { namespace { std::map>> perm_args{ {FORMAT_NCHW, - {{FORMAT_NHWC, std::vector({0, 2, 3, 1})}, - {FORMAT_HWCN, std::vector({2, 3, 1, 0})}, - {FORMAT_CHWN, 
std::vector({1, 2, 3, 0})}}}, + {{FORMAT_NHWC, std::vector({kNchwN, kNchwH, kNchwW, kNchwC})}, + {FORMAT_HWCN, std::vector({kNchwH, kNchwW, kNchwC, kNchwN})}, + {FORMAT_CHWN, std::vector({kNchwC, kNchwH, kNchwW, kNchwN})}}}, {FORMAT_NHWC, - {{FORMAT_NCHW, std::vector({0, 3, 1, 2})}, - {FORMAT_CHWN, std::vector({3, 1, 2, 0})}, - {FORMAT_HWCN, std::vector({1, 2, 3, 0})}}}, + {{FORMAT_NCHW, std::vector({kNhwcN, kNhwcC, kNhwcH, kNhwcW})}, + {FORMAT_CHWN, std::vector({kNhwcC, kNhwcH, kNhwcW, kNhwcN})}, + {FORMAT_HWCN, std::vector({kNhwcH, kNhwcW, kNhwcC, kNhwcN})}}}, {FORMAT_HWCN, - {{FORMAT_NCHW, std::vector({3, 2, 0, 1})}, - {FORMAT_NHWC, std::vector({3, 0, 1, 2})}, - {FORMAT_CHWN, std::vector({2, 0, 1, 3})}}}, + {{FORMAT_NCHW, std::vector({kHwcnN, kHwcnC, kHwcnH, kHwcnW})}, + {FORMAT_NHWC, std::vector({kHwcnN, kHwcnH, kHwcnW, kHwcnC})}, + {FORMAT_CHWN, std::vector({kHwcnC, kHwcnH, kHwcnW, kHwcnN})}}}, {FORMAT_CHWN, - {{FORMAT_NCHW, std::vector({3, 0, 1, 2})}, - {FORMAT_NHWC, std::vector({3, 1, 2, 0})}, - {FORMAT_HWCN, std::vector({1, 2, 0, 3})}}}, + {{FORMAT_NCHW, std::vector({kChwnN, kChwnC, kChwnH, kChwnW})}, + {FORMAT_NHWC, std::vector({kChwnN, kChwnH, kChwnW, kChwnC})}, + {FORMAT_HWCN, std::vector({kChwnH, kChwnW, kChwnC, kChwnN})}}}, }; bool IsShapeArgValid(const std::vector &src_shape, const std::vector &perm_arg) { diff --git a/ge/common/formats/utils/formats_definitions.h b/ge/common/formats/utils/formats_definitions.h index 7f873f1b..25f36d6a 100755 --- a/ge/common/formats/utils/formats_definitions.h +++ b/ge/common/formats/utils/formats_definitions.h @@ -23,6 +23,7 @@ static const int kCubeSize = 16; static const int kNiSize = 16; static const int64_t kShapeItemNumMAX = 1024UL * 1024UL * 1024UL * 1024UL; + enum NchwDimIndex { kNchwN, kNchwC, @@ -47,6 +48,14 @@ enum HwcnDimIndex { kHwcnDimsNum }; +enum ChwnDimIndex { + kChwnC, + kChwnH, + kChwnW, + kChwnN, + kChwnDimsNum +}; + enum Nc1hwc0DimIndex { kNc1hwc0N, kNc1hwc0C1, diff --git 
a/ge/common/ge/plugin_manager.cc b/ge/common/ge/plugin_manager.cc index 7bb1310c..75a36d99 100644 --- a/ge/common/ge/plugin_manager.cc +++ b/ge/common/ge/plugin_manager.cc @@ -123,7 +123,10 @@ Status PluginManager::LoadSo(const string &path, const vector &func_chec if (handle == nullptr) { const char *error = mmDlerror(); GE_IF_BOOL_EXEC(error == nullptr, error = ""); - GELOGE(GE_PLGMGR_PATH_INVALID, "Failed to dlopen %s!", error); + ErrorManager::GetInstance().ATCReportErrMessage("E19012", {"function", "reason"}, + {"mmDlopen", "shared library path is " + FmtToStr(file_path_dlopen) + ". Errormessage" + FmtToStr(error)}); + GELOGE(GE_PLGMGR_PATH_INVALID, "Failed to dlopen the shared library path[%s]. Errormessage[%s]!", + file_path_dlopen.c_str(), error); continue; } @@ -132,6 +135,9 @@ Status PluginManager::LoadSo(const string &path, const vector &func_chec for (const auto &func_name : func_check_list) { auto real_fn = (void (*)())mmDlsym(handle, const_cast(func_name.c_str())); if (real_fn == nullptr) { + ErrorManager::GetInstance().ATCReportErrMessage("E19012", {"function", "reason"}, + {"mmDlsym", FmtToStr(func_name) + " is skipped since function" + + FmtToStr(func_name) + " is not existed!"}); GELOGE(GE_PLGMGR_PATH_INVALID, "%s is skipped since function %s is not existed!", func_name.c_str(), func_name.c_str()); is_valid = false; diff --git a/ge/common/ge/tbe_plugin_manager.cc b/ge/common/ge/tbe_plugin_manager.cc index b91f1204..44199c32 100755 --- a/ge/common/ge/tbe_plugin_manager.cc +++ b/ge/common/ge/tbe_plugin_manager.cc @@ -37,6 +37,8 @@ #include "graph/utils/type_utils.h" namespace ge { +const int kBaseInt = 10; + std::map TBEPluginManager::options_ = {}; // Get Singleton Instance @@ -155,7 +157,7 @@ void TBEPluginManager::GetCustomOpPath(std::string &customop_path) { domi::FrameworkType type = domi::TENSORFLOW; auto it = options_.find(FRAMEWORK_TYPE); if (it != options_.end()) { - type = static_cast(std::strtol(it->second.c_str(), nullptr, 10)); + type = 
static_cast(std::strtol(it->second.c_str(), nullptr, kBaseInt)); } fmk_type = ge::TypeUtils::FmkTypeToSerialString(type); GELOGI("Framework type is %s.", fmk_type.c_str()); diff --git a/ge/common/ge_common.mk b/ge/common/ge_common.mk index 3fffd203..e28090ad 100755 --- a/ge/common/ge_common.mk +++ b/ge/common/ge_common.mk @@ -7,6 +7,7 @@ GE_COMMON_LOCAL_SRC_FILES := \ helper/om_file_helper.cc \ helper/model_helper.cc \ ../model/ge_model.cc \ + ../model/ge_root_model.cc \ auth/file_saver.cc \ fp16_t.cc \ math/fp16_math.cc \ diff --git a/ge/common/helper/model_helper.cc b/ge/common/helper/model_helper.cc index 6f201461..aacef88c 100644 --- a/ge/common/helper/model_helper.cc +++ b/ge/common/helper/model_helper.cc @@ -32,6 +32,7 @@ using domi::ModelTaskDef; namespace { const int64_t kOriginalOmPartitionNum = 1; +const uint32_t kStatiOmFileModelNum = 1; } @@ -39,7 +40,7 @@ namespace ge { FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ModelHelper::~ModelHelper() { (void)ReleaseLocalModelData(); } Status ModelHelper::SaveModelPartition(std::shared_ptr &om_file_save_helper, ModelPartitionType type, - const uint8_t *data, size_t size) { + const uint8_t *data, size_t size, size_t model_index) { if (size < 1 || size > UINT32_MAX) { GELOGE(PARAM_INVALID, "Add model partition failed, partition size %zu invalid", size); if (size > UINT32_MAX) { @@ -68,25 +69,16 @@ Status ModelHelper::SaveModelPartition(std::shared_ptr &om_fil partition_model.data = const_cast(data); partition_model.size = static_cast(size); partition_model.type = type; - if (om_file_save_helper->AddPartition(partition_model) != SUCCESS) { + if (om_file_save_helper->AddPartition(partition_model, model_index) != SUCCESS) { GELOGE(PARAM_INVALID, "Add model partition failed, partition size %zu", size); return PARAM_INVALID; } return SUCCESS; } -FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ModelHelper::SaveToOmModel(const GeModelPtr &ge_model, - const SaveParam &save_param, - const std::string 
&output_file, - ModelBufferData& model) { - if (output_file.empty()) { - GELOGE(FAILED, "GraphBuilder SaveModel received invalid file name prefix"); - return FAILED; - } - GE_IF_BOOL_EXEC(ge_model == nullptr, GELOGE(FAILED, "Ge_model is nullptr"); return FAILED); - std::shared_ptr om_file_save_helper = ge::MakeShared(); - GE_CHECK_NOTNULL(om_file_save_helper); +Status ModelHelper::SaveModelDef(std::shared_ptr &om_file_save_helper, + const GeModelPtr &ge_model, ge::Buffer &model_buffer, size_t model_index) { ModelPtr model_tmp = ge::MakeShared(ge_model->GetName(), ge_model->GetPlatformVersion()); if (model_tmp == nullptr) { GELOGE(FAILED, "Create Model %s Ptr failed", ge_model->GetName().c_str()); @@ -96,16 +88,21 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ModelHelper::SaveToOmMod model_tmp->SetVersion(ge_model->GetVersion()); model_tmp->SetAttr(ge_model->MutableAttrMap()); - ge::Buffer model_buffer; + (void)model_tmp->Save(model_buffer); GELOGD("MODEL_DEF size is %zu", model_buffer.GetSize()); if (model_buffer.GetSize() > 0) { if (SaveModelPartition(om_file_save_helper, ModelPartitionType::MODEL_DEF, model_buffer.GetData(), - model_buffer.GetSize()) != SUCCESS) { + model_buffer.GetSize(), model_index) != SUCCESS) { GELOGE(PARAM_INVALID, "Add model graph partition failed"); return PARAM_INVALID; } } + return SUCCESS; +} + +Status ModelHelper::SaveModelWeights(std::shared_ptr &om_file_save_helper, + const GeModelPtr &ge_model, size_t model_index) { auto ge_model_weight = ge_model->GetWeight(); GELOGD("WEIGHTS_DATA size is %zu, %p", ge_model_weight.GetSize(), ge_model_weight.GetData()); // weight is not necessary @@ -113,31 +110,43 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ModelHelper::SaveToOmMod GE_CHK_STATUS_RET(SaveModelPartition(om_file_save_helper, ModelPartitionType::WEIGHTS_DATA, ge_model_weight.GetData(), - ge_model_weight.GetSize()), "Add weight partition failed"); + ge_model_weight.GetSize(), model_index), "Add weight 
partition failed"); } + return SUCCESS; +} +Status ModelHelper::SaveModelTbeKernel(std::shared_ptr &om_file_save_helper, + const GeModelPtr &ge_model, size_t model_index) { TBEKernelStore tbe_kernel_store = ge_model->GetTBEKernelStore(); GELOGD("TBE_KERNELS size is %zu", tbe_kernel_store.DataSize()); if (tbe_kernel_store.DataSize() > 0) { - GE_CHK_STATUS_RET(SaveModelPartition(om_file_save_helper, - ModelPartitionType::TBE_KERNELS, - tbe_kernel_store.Data(), - tbe_kernel_store.DataSize()), "Add tbe kernel partition failed"); + GE_CHK_STATUS_RET( + SaveModelPartition(om_file_save_helper, ModelPartitionType::TBE_KERNELS, + ge_model->GetTBEKernelStore().Data(), ge_model->GetTBEKernelStore().DataSize(), + model_index), "Add tbe kernel partition failed"); } - // no need to check value, DATA->NetOutput (void)tbe_kernel_store.Load(tbe_kernel_store.Data(), tbe_kernel_store.DataSize()); + return SUCCESS; +} + +Status ModelHelper::SaveModelCustAICPU(std::shared_ptr &om_file_save_helper, + const GeModelPtr &ge_model, size_t model_index) { CustAICPUKernelStore cust_aicpu_kernel_store = ge_model->GetCustAICPUKernelStore(); GELOGD("cust aicpu kernels size is %zu", cust_aicpu_kernel_store.DataSize()); if (cust_aicpu_kernel_store.DataSize() > 0) { GE_CHK_STATUS_RET(SaveModelPartition(om_file_save_helper, ModelPartitionType::CUST_AICPU_KERNELS, - cust_aicpu_kernel_store.Data(), - cust_aicpu_kernel_store.DataSize()), + ge_model->GetCustAICPUKernelStore().Data(), + cust_aicpu_kernel_store.DataSize(), model_index), "Add cust aicpu kernel partition failed"); } + return SUCCESS; +} +Status ModelHelper::SaveModelTaskDef(std::shared_ptr &om_file_save_helper, + const GeModelPtr &ge_model, ge::Buffer &task_buffer, size_t model_index) { std::shared_ptr model_task_def = ge_model->GetModelTaskDefPtr(); if (model_task_def == nullptr) { GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "Create model task def ptr failed"); @@ -146,9 +155,9 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status 
ModelHelper::SaveToOmMod size_t partition_task_size = model_task_def->ByteSizeLong(); GE_IF_BOOL_EXEC(partition_task_size == 0 || partition_task_size > INT_MAX, GELOGE(FAILED, "Model_def's byte size (%zu) is invalid!", partition_task_size); - return FAILED); + return FAILED); - ge::Buffer task_buffer(partition_task_size); + task_buffer = ge::Buffer(partition_task_size); if (task_buffer.GetSize() == 0) { GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "Alloc model task def buffer failed"); return ACL_ERROR_GE_MEMORY_ALLOCATION; @@ -159,21 +168,28 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ModelHelper::SaveToOmMod GELOGD("TASK_INFO size is %zu", partition_task_size); if (SaveModelPartition(om_file_save_helper, ModelPartitionType::TASK_INFO, task_buffer.GetData(), - partition_task_size) != SUCCESS) { + partition_task_size, model_index) != SUCCESS) { GELOGE(PARAM_INVALID, "Add model task def partition failed"); return PARAM_INVALID; } + return SUCCESS; +} + +Status ModelHelper::SaveModelHeader(std::shared_ptr &om_file_save_helper, + const GeModelPtr &ge_model, size_t model_num) { // Save target/version to model_header ModelFileHeader &model_header = om_file_save_helper->GetModelFileHeader(); model_header.platform_type = ge_model->GetPlatformType(); model_header.om_ir_version = ge_model->GetVersion(); + model_header.model_num = model_num; std::string platform_version = ge_model->GetPlatformVersion(); errno_t err; err = memcpy_s(model_header.platform_version, PLATFORM_VERSION_LEN, platform_version.c_str(), platform_version.size() + 1); if (err != EOK) { - GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "ModelHelper SaveModel failed while allocating memory for platform_version."); + GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, + "ModelHelper SaveModel failed while allocating memory for platform_version."); return ACL_ERROR_GE_MEMORY_ALLOCATION; } string version = reinterpret_cast(model_header.platform_version); @@ -188,8 +204,142 @@ FMK_FUNC_HOST_VISIBILITY 
FMK_FUNC_DEV_VISIBILITY Status ModelHelper::SaveToOmMod } string model_name = reinterpret_cast(model_header.name); GELOGD("Model name save:%s", model_name.c_str()); + return SUCCESS; +} + +Status ModelHelper::SaveAllModelPartiton(std::shared_ptr& om_file_save_helper, + const GeModelPtr &ge_model, ge::Buffer &model_buffer, + ge::Buffer &task_buffer, size_t model_index) { + if (SaveModelDef(om_file_save_helper, ge_model, model_buffer, model_index) != SUCCESS) { + GELOGE(FAILED, "save model def failed"); + return FAILED; + } + + if (SaveModelWeights(om_file_save_helper, ge_model, model_index) != SUCCESS) { + GELOGE(FAILED, "save model weights failed"); + return FAILED; + } + + if (SaveModelTbeKernel(om_file_save_helper, ge_model, model_index) != SUCCESS) { + GELOGE(FAILED, "save model tbe kernel failed"); + return FAILED; + } + + if (SaveModelCustAICPU(om_file_save_helper, ge_model, model_index) != SUCCESS) { + GELOGE(FAILED, "save model cust ai cpu failed"); + return FAILED; + } + + + if (SaveModelTaskDef(om_file_save_helper, ge_model, task_buffer, model_index) != SUCCESS) { + GELOGE(FAILED, "save task def failed"); + return FAILED; + } + return SUCCESS; +} + +FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ModelHelper::SaveToOmModel(const GeModelPtr &ge_model, + const SaveParam &save_param, + const std::string &output_file, + ModelBufferData& model) { + if (output_file.empty()) { + GELOGE(FAILED, "GraphBuilder SaveModel received invalid file name prefix"); + return FAILED; + } - Status ret = om_file_save_helper->SaveModel(save_param, output_file.c_str(), model, is_offline_); + GE_IF_BOOL_EXEC(ge_model == nullptr, GELOGE(FAILED, "Ge_model is nullptr"); return FAILED); + std::shared_ptr om_file_save_helper = ge::MakeShared(); + GE_CHECK_NOTNULL(om_file_save_helper); + ge::Buffer model_buffer; + ge::Buffer task_buffer; + + auto ret = SaveAllModelPartiton(om_file_save_helper, ge_model, model_buffer, task_buffer); + if (ret != SUCCESS) { + GELOGE(ret, "save all 
model partition failed"); + return ret; + } + + ret = SaveModelHeader(om_file_save_helper, ge_model); + if (ret != SUCCESS) { + GELOGE(ret, "save model header failed"); + return ret; + } + + ret = om_file_save_helper->SaveModel(save_param, output_file.c_str(), model, is_offline_); + if (ret != SUCCESS) { + GELOGE(FAILED, "OmFileSaveHelper SaveModel return fail."); + return ret; + } + return SUCCESS; +} + +FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ModelHelper::SaveToOmRootModel( + const GeRootModelPtr &ge_root_model, + const SaveParam &save_param, + const std::string &output_file, + ModelBufferData& model, + bool is_unknown_shape) { + + GE_CHECK_NOTNULL(ge_root_model); + GE_IF_BOOL_EXEC(ge_root_model == nullptr, GELOGE(FAILED, "Ge_root_model is nullptr"); return FAILED); + + auto &name_to_ge_model = ge_root_model->GetSubgraphInstanceNameToModel(); + GE_IF_BOOL_EXEC(name_to_ge_model.empty(), GELOGE(FAILED, "Ge_root_model has no sub model"); return FAILED); + GE_IF_BOOL_EXEC(output_file.empty(), + GELOGE(FAILED, "GraphBuilder SaveModel received invalid file name prefix"); + return FAILED); + + if (!is_unknown_shape) { + auto &model_root = name_to_ge_model.begin()->second; + return SaveToOmModel(model_root, save_param, output_file, model); + } + + std::shared_ptr om_file_save_helper = ge::MakeShared(); + GE_CHECK_NOTNULL(om_file_save_helper); + + auto &first_ge_model = name_to_ge_model.at(ge_root_model->GetRootGraph()->GetName()); + + // ge root model must be the first to be loaded + vector model_names{ge_root_model->GetRootGraph()->GetName()}; + for (auto &item : name_to_ge_model) { + if (item.first != model_names.front()) { + model_names.emplace_back(item.first); + } + } + + vector model_buffers(model_names.size()); + vector task_buffers(model_names.size()); + + size_t cur_index = 0; + + if (model_names.size() > 1) { + GELOGD("only save first model MODEL_DEF"); + if (SaveModelDef(om_file_save_helper, first_ge_model, model_buffers[cur_index], cur_index) 
!= SUCCESS) { + GELOGE(FAILED, "save model def failed"); + return FAILED; + } + ++cur_index; + } + + for (; cur_index < model_names.size(); ++cur_index) { + auto model_name = model_names[cur_index]; + GELOGD("cur model %s index is %zu", model_name.c_str(), cur_index); + const GeModelPtr &ge_model = name_to_ge_model.at(model_name); + auto ret = SaveAllModelPartiton(om_file_save_helper, ge_model, model_buffers[cur_index], + task_buffers[cur_index], cur_index); + if (ret != SUCCESS) { + GELOGE(INTERNAL_ERROR, "Save model %s failed", model_name.c_str()); + return INTERNAL_ERROR; + } + } + + auto ret = SaveModelHeader(om_file_save_helper, first_ge_model, model_names.size()); + if (ret != SUCCESS) { + GELOGE(INTERNAL_ERROR, "Save model %s header failed", first_ge_model->GetName().c_str()); + return INTERNAL_ERROR; + } + + ret = om_file_save_helper->SaveRootModel(save_param, output_file.c_str(), model, is_offline_); if (ret != SUCCESS) { GELOGE(FAILED, "OmFileSaveHelper SaveModel return fail."); return FAILED; @@ -288,7 +438,6 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ModelHelper::LoadModel(c } file_header_ = reinterpret_cast(model_data.model_data); - OmFileLoadHelper om_load_helper; status = om_load_helper.Init(model_addr_tmp_, model_len_tmp_); if (status != SUCCESS) { @@ -310,7 +459,61 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ModelHelper::LoadModel(c GELOGE(status, "GenerateGeModel failed"); return status; } + GELOGD("in ModelHelper::LoadModel, is_assign_model_ is setted to true!"); + is_assign_model_ = true; + return SUCCESS; +} + +FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ModelHelper::LoadRootModel(const ge::ModelData &model_data) { + if (model_data.model_data == nullptr || model_data.model_len == 0) { + GELOGE(GE_EXEC_MODEL_DATA_SIZE_INVALID, "Model_data is nullptr, or model_data_size is 0"); + return GE_EXEC_MODEL_DATA_SIZE_INVALID; + } + + if (is_assign_model_) { + GELOGE(GE_EXEC_LOAD_MODEL_REPEATED, "Model helper 
has already loaded!"); + return GE_EXEC_LOAD_MODEL_REPEATED; + } + if (ReleaseLocalModelData() != SUCCESS) { + GELOGE(INTERNAL_ERROR, "ReleaseLocalModelData failed."); + return INTERNAL_ERROR; + } + + Status status = ge::DavinciModelParser::ParseModelContent(model_data, model_addr_tmp_, model_len_tmp_); + if (status != SUCCESS) { + GELOGE(status, "Parse model content failed!"); + return status; + } + + file_header_ = reinterpret_cast(model_data.model_data); + + //model verison 1.0 file header does not have model_num member + is_unknown_shape_model_ = file_header_->version >= ge::MODEL_VERSION && + file_header_->model_num > kStatiOmFileModelNum; + GELOGD("cur om model is ge root model or no %d, model version %zu", is_unknown_shape_model_, file_header_->version); + + OmFileLoadHelper om_load_helper; + if (is_unknown_shape_model_) { + auto model_num = file_header_->model_num; + status = om_load_helper.Init(model_addr_tmp_, model_len_tmp_, model_num); + } else { + status = om_load_helper.Init(model_addr_tmp_, model_len_tmp_); + } + if (status != SUCCESS) { + GELOGE(status, "Om_load_helper init failed"); + model_addr_tmp_ = nullptr; + return status; + } + // Encrypt model need to del temp model/no encrypt model don't need to del model + model_addr_tmp_ = nullptr; + + status = GenerateGeRootModel(om_load_helper); + if (status != SUCCESS) { + GELOGE(status, "GenerateGeRootModel failed"); + return status; + } + GELOGD("in ModelHelper::LoadRootModel, is_assign_model_ is setted to true!"); is_assign_model_ = true; return SUCCESS; } @@ -341,6 +544,61 @@ Status ModelHelper::GenerateGeModel(OmFileLoadHelper &om_load_helper) { return SUCCESS; } +Status ModelHelper::GenerateGeRootModel(OmFileLoadHelper &om_load_helper) { + GELOGD("Begin to generate ge root model"); + root_model_ = ge::MakeShared(); + GE_CHECK_NOTNULL(root_model_); + if (!is_unknown_shape_model_) { + if (GenerateGeModel(om_load_helper) != SUCCESS) { + GELOGE(FAILED, "GenerateGeModel failed"); + return FAILED; + } 
+ GE_CHECK_NOTNULL(model_); + root_model_->SetRootGraph(GraphUtils::GetComputeGraph(model_->GetGraph())); + return SUCCESS; + } + + bool is_first_model = true; + for (size_t mode_index = 0; mode_index < file_header_->model_num; ++mode_index) { + GeModelPtr cur_model = ge::MakeShared(); + Status ret = LoadModelData(om_load_helper, cur_model, mode_index); + if (ret != SUCCESS) { + return GE_EXEC_LOAD_MODEL_PARTITION_FAILED; + } + + if (is_first_model) { + is_first_model = false; + root_model_->SetRootGraph(GraphUtils::GetComputeGraph(cur_model->GetGraph())); + root_model_->SetModelId(cur_model->GetModelId()); + model_ = cur_model; + continue; + } + + ret = LoadWeights(om_load_helper, cur_model, mode_index); + if (ret != SUCCESS) { + return GE_EXEC_LOAD_WEIGHT_PARTITION_FAILED; + } + + ret = LoadTBEKernelStore(om_load_helper, cur_model, mode_index); + if (ret != SUCCESS) { + return GE_EXEC_LOAD_KERNEL_PARTITION_FAILED; + } + + ret = LoadCustAICPUKernelStore(om_load_helper, cur_model, mode_index); + if (ret != SUCCESS) { + return GE_EXEC_LOAD_KERNEL_PARTITION_FAILED; + } + + ret = LoadTask(om_load_helper, cur_model, mode_index); + if (ret != SUCCESS) { + return GE_EXEC_LOAD_TASK_PARTITION_FAILED; + } + root_model_->SetSubgraphInstanceNameToModel(cur_model->GetName(), cur_model); + } + + return SUCCESS; +} + Status ModelHelper::LoadModelData(OmFileLoadHelper &om_load_helper) { ModelPartition partition_model_def; // no need to check value, DATA->NetOutput @@ -366,6 +624,28 @@ void ModelHelper::SetModelToGeModel(ge::Model &model) { model_->SetAttr(model.MutableAttrMap()); } +Status ModelHelper::LoadModelData(OmFileLoadHelper &om_load_helper, GeModelPtr &cur_model, size_t mode_index) { + ModelPartition partition_model_def; + // no need to check value, DATA->NetOutput + om_load_helper.GetModelPartition(ModelPartitionType::MODEL_DEF, partition_model_def, mode_index); + GELOGD("Model_def partition addr:%p,size:%u", partition_model_def.data, partition_model_def.size); + + 
ge::Model model; + if (ge::Model::Load(partition_model_def.data, partition_model_def.size, model) != SUCCESS) { + GELOGE(INTERNAL_ERROR, "Load model failed."); + return INTERNAL_ERROR; + } + + cur_model->SetGraph(model.GetGraph()); + cur_model->SetName(model.GetName()); + cur_model->SetVersion(model.GetVersion()); + cur_model->SetPlatformVersion(model.GetPlatformVersion()); + cur_model->SetAttr(model.MutableAttrMap()); + + return SUCCESS; +} + + Status ModelHelper::LoadWeights(OmFileLoadHelper &om_load_helper) { ModelPartition partition; if (om_load_helper.GetModelPartition(ModelPartitionType::WEIGHTS_DATA, partition) != SUCCESS) { @@ -379,6 +659,19 @@ Status ModelHelper::LoadWeights(OmFileLoadHelper &om_load_helper) { return SUCCESS; } +Status ModelHelper::LoadWeights(OmFileLoadHelper &om_load_helper, GeModelPtr &cur_model, size_t mode_index) { + ModelPartition partition; + if (om_load_helper.GetModelPartition(ModelPartitionType::WEIGHTS_DATA, partition, mode_index) != SUCCESS) { + GELOGE(FAILED, "Get weight model partition failed."); + return FAILED; + } + ge::Buffer weight = ge::Buffer::CopyFrom(partition.data, partition.size); + cur_model->SetWeight(weight); + + GELOGD("GetWeight size:%u", partition.size); + return SUCCESS; +} + FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ModelHelper::LoadTask(OmFileLoadHelper &om_load_helper) { ModelPartition task_partition; if (om_load_helper.GetModelPartition(ModelPartitionType::TASK_INFO, task_partition) != SUCCESS) { @@ -398,6 +691,27 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ModelHelper::LoadTask(Om return SUCCESS; } +FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ModelHelper::LoadTask(OmFileLoadHelper &om_load_helper, + GeModelPtr &cur_model, + size_t mode_index) { + ModelPartition task_partition; + if (om_load_helper.GetModelPartition(ModelPartitionType::TASK_INFO, task_partition, mode_index) != SUCCESS) { + GELOGE(FAILED, "Get task model partition failed."); + return FAILED; + } + 
std::shared_ptr task = ge::MakeShared(); + GE_CHECK_NOTNULL(task); + if (task_partition.size != 0) { + if (!ReadProtoFromArray(task_partition.data, task_partition.size, task.get())) { + GELOGE(INTERNAL_ERROR, "ReadProtoFromArray failed."); + return INTERNAL_ERROR; + } + GELOGD("TASK_INFO op_size:%zu, stream_num:%u", task->op().size(), task->stream_num()); + } + cur_model->SetModelTaskDef(task); + return SUCCESS; +} + Status ModelHelper::LoadTBEKernelStore(OmFileLoadHelper &om_load_helper) { // Load tbe kernels ModelPartition partition_kernel_def; @@ -414,6 +728,23 @@ Status ModelHelper::LoadTBEKernelStore(OmFileLoadHelper &om_load_helper) { return SUCCESS; } +Status ModelHelper::LoadTBEKernelStore(OmFileLoadHelper &om_load_helper, GeModelPtr &cur_model, size_t mode_index) { + // Load tbe kernels + ModelPartition partition_kernel_def; + TBEKernelStore kernel_store; + if (om_load_helper.GetModelPartition(ModelPartitionType::TBE_KERNELS, partition_kernel_def, mode_index) == + SUCCESS) { + GELOGD("Kernels partition size:%u", partition_kernel_def.size); + if (kernel_store.Load(partition_kernel_def.data, partition_kernel_def.size)) { + GELOGD("Load tbe kernels success"); + } else { + GELOGW("Load tbe kernels failed"); + } + } + cur_model->SetTBEKernelStore(kernel_store); + return SUCCESS; +} + Status ModelHelper::LoadCustAICPUKernelStore(OmFileLoadHelper &om_load_helper) { // Load cust aicpu kernels ModelPartition partition_kernel_def; @@ -421,19 +752,39 @@ Status ModelHelper::LoadCustAICPUKernelStore(OmFileLoadHelper &om_load_helper) { if (om_load_helper.GetModelPartition(ModelPartitionType::CUST_AICPU_KERNELS, partition_kernel_def) == SUCCESS) { GELOGD("Kernels partition size:%u", partition_kernel_def.size); if (kernel_store.Load(partition_kernel_def.data, partition_kernel_def.size)) { - GELOGI("Load cust aicpu kernels success"); + GELOGD("Load cust aicpu kernels success"); + } else { + GELOGW("Load cust aicpu kernels failed"); } } 
model_->SetCustAICPUKernelStore(kernel_store); return SUCCESS; } +Status ModelHelper::LoadCustAICPUKernelStore(OmFileLoadHelper &om_load_helper, + GeModelPtr &cur_model, size_t mode_index) { + // Load cust aicpu kernels + ModelPartition partition_kernel_def; + CustAICPUKernelStore kernel_store; + if (om_load_helper.GetModelPartition(ModelPartitionType::CUST_AICPU_KERNELS, partition_kernel_def, mode_index) + == SUCCESS) { + GELOGD("Kernels partition size:%u", partition_kernel_def.size); + if (kernel_store.Load(partition_kernel_def.data, partition_kernel_def.size)) { + GELOGD("Load cust aicpu kernels success"); + } else { + GELOGW("Load cust aicpu kernels failed"); + } + } + cur_model->SetCustAICPUKernelStore(kernel_store); + return SUCCESS; +} + FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY GeModelPtr ModelHelper::GetGeModel() { if (model_ != nullptr) { return model_; } - GELOGI("Model has not been loaded!"); + GELOGD("Model has not been loaded!"); std::shared_ptr out_model = ge::MakeShared(); if (out_model == nullptr) { return nullptr; @@ -441,6 +792,20 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY GeModelPtr ModelHelper::GetGeMo return out_model; } +FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY GeRootModelPtr ModelHelper::GetGeRootModel() { + if (root_model_ != nullptr) { + return root_model_; + } + + GELOGD("Model has not been loaded!"); + std::shared_ptr out_model = ge::MakeShared(); + if (out_model == nullptr) { + return nullptr; + } + return out_model; +} + + Status ModelHelper::ReleaseLocalModelData() noexcept { Status result = SUCCESS; if (model_addr_tmp_ != nullptr) { diff --git a/ge/common/helper/om_file_helper.cc b/ge/common/helper/om_file_helper.cc index ce88cd08..d1c52b13 100644 --- a/ge/common/helper/om_file_helper.cc +++ b/ge/common/helper/om_file_helper.cc @@ -52,6 +52,17 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status OmFileLoadHelper::Init(u return SUCCESS; } +FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status 
OmFileLoadHelper::Init(uint8_t *model_data, + uint32_t model_data_size, + uint32_t model_num) { + Status status = LoadModelPartitionTable(model_data, model_data_size, model_num); + if (status != SUCCESS) { + return status; + } + is_inited_ = true; + return SUCCESS; +} + // Use both FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status OmFileLoadHelper::GetModelPartition(ModelPartitionType type, ModelPartition &partition) { @@ -79,6 +90,37 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status OmFileLoadHelper::GetMod return SUCCESS; } +FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status OmFileLoadHelper::GetModelPartition(ModelPartitionType type, + ModelPartition &partition, + size_t model_index) { + if (!is_inited_) { + GELOGE(PARAM_INVALID, "OmFileLoadHelper has not been initialized!"); + return PARAM_INVALID; + } + if (model_index >= model_contexts_.size()) { + GELOGE(PARAM_INVALID, "cur index : %zu, model_contexts size:%zu", model_index, model_contexts_.size()); + return PARAM_INVALID; + } + auto &cur_ctx = model_contexts_[model_index]; + bool found = false; + for (ModelPartition &part : cur_ctx.partition_datas_) { + if (part.type == type) { + partition = part; + found = true; + break; + } + } + + if (!found) { + if (type != ModelPartitionType::TBE_KERNELS && type != ModelPartitionType::WEIGHTS_DATA && + type != ModelPartitionType::CUST_AICPU_KERNELS) { + GELOGE(FAILED, "GetModelPartition:type:%d is not in partition_datas!", static_cast(type)); + return FAILED; + } + } + return SUCCESS; +} + Status OmFileLoadHelper::CheckModelValid(const ge::ModelData &model) const { // Parameter validity check if (model.model_data == nullptr) { @@ -138,7 +180,8 @@ Status OmFileLoadHelper::LoadModelPartitionTable(uint8_t *model_data, const uint context_.partition_datas_.push_back(partition); if (partition.size > model_data_size || mem_offset > model_data_size - partition.size) { - GELOGE(ACL_ERROR_GE_EXEC_MODEL_DATA_SIZE_INVALID, "The partition size %zu is greater 
than the model data size %u.", + GELOGE(ACL_ERROR_GE_EXEC_MODEL_DATA_SIZE_INVALID, + "The partition size %zu is greater than the model data size %u.", partition.size + mem_offset, model_data_size); return ACL_ERROR_GE_EXEC_MODEL_DATA_SIZE_INVALID; } @@ -148,6 +191,61 @@ Status OmFileLoadHelper::LoadModelPartitionTable(uint8_t *model_data, const uint return SUCCESS; } +Status OmFileLoadHelper::LoadModelPartitionTable(uint8_t *model_data, uint32_t model_data_size, uint32_t model_num) { + if (model_data == nullptr) { + GELOGE(PARAM_INVALID, "Param model_data must not be null!"); + return PARAM_INVALID; + } + + uint32_t cur_offset = 0; + for (uint32_t index = 0; index < model_num; ++index) { + // Init partition table + auto partition_table = reinterpret_cast(model_data + cur_offset); + size_t partition_table_size = SIZE_OF_MODEL_PARTITION_TABLE(*partition_table); + cur_offset += partition_table_size; + GELOGD("Cur model index %zu: ModelPartitionTable num :%u, " + "ModelFileHeader length :%zu, ModelPartitionTable length :%zu", + index, partition_table->num, sizeof(ModelFileHeader), partition_table_size); + if (model_data_size <= cur_offset) { + GELOGE(GE_EXEC_MODEL_DATA_SIZE_INVALID, "invalid model data, partition_table->num:%u, model data size %u", + partition_table->num, model_data_size); + return GE_EXEC_MODEL_DATA_SIZE_INVALID; + } + + for (uint32_t i = 0; i < partition_table->num; i++) { + ModelPartition partition; + partition.size = partition_table->partition[i].mem_size; + partition.data = model_data + cur_offset; + partition.type = partition_table->partition[i].type; + if (index >= model_contexts_.size()) { + if (index != model_contexts_.size()) { + GELOGE(FAILED, "cur index is %zu make model_contexts_ overflow", index); + return FAILED; + } + + OmFileContext tmp_ctx; + tmp_ctx.partition_datas_.push_back(partition); + model_contexts_.push_back(tmp_ctx); + } else { + model_contexts_[index].partition_datas_.push_back(partition); + } + + if (partition.size > 
model_data_size || cur_offset > model_data_size - partition.size) { + GELOGE(GE_EXEC_MODEL_DATA_SIZE_INVALID, "The partition size %zu is greater than the model data size %u.", + partition.size + cur_offset, model_data_size); + return GE_EXEC_MODEL_DATA_SIZE_INVALID; + } + cur_offset += partition.size; + GELOGD("Partition, type:%d, size:%u, model_index:%zu", static_cast(partition.type), partition.size, index); + } + } + if (cur_offset != model_data_size) { + GELOGE(FAILED, "do not get the complete model, read end offset:%zu, all size:%zu", cur_offset, model_data_size); + return FAILED; + } + return SUCCESS; +} + FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY const std::vector &OmFileSaveHelper::GetModelPartitions() const { return context_.partition_datas_; @@ -172,6 +270,28 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ModelPartitionTable *OmFileSave return partition_table; } +FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ModelPartitionTable *OmFileSaveHelper::GetPartitionTable( + size_t cur_ctx_index) { + auto &cur_ctx = model_contexts_[cur_ctx_index]; + auto partition_size = static_cast(cur_ctx.partition_datas_.size()); + // Build ModelPartitionTable, flex array + cur_ctx.partition_table_.clear(); + cur_ctx.partition_table_.resize(sizeof(ModelPartitionTable) + sizeof(ModelPartitionMemInfo) * partition_size, 0); + + auto partition_table = reinterpret_cast(cur_ctx.partition_table_.data()); + partition_table->num = partition_size; + + uint32_t mem_offset = 0; + for (uint32_t i = 0; i < partition_size; i++) { + ModelPartition partition = cur_ctx.partition_datas_[i]; + partition_table->partition[i] = {partition.type, mem_offset, partition.size}; + mem_offset += partition.size; + GELOGD("Partition, type:%d, size:%u", static_cast(partition.type), partition.size); + } + return partition_table; +} + + FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status OmFileSaveHelper::AddPartition(ModelPartition &partition) { if 
(ge::CheckUint32AddOverflow(context_.model_data_len_, partition.size) != SUCCESS) { GELOGE(FAILED, "UINT32 %u and %u addition can result in overflow!", context_.model_data_len_, partition.size); @@ -182,6 +302,27 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status OmFileSaveHelper::AddPar return SUCCESS; } +Status OmFileSaveHelper::AddPartition(ModelPartition &partition, size_t cur_index) { + if (ge::CheckUint32AddOverflow(context_.model_data_len_, partition.size) != SUCCESS) { + GELOGE(FAILED, "UINT32 %u and %u addition can result in overflow!", context_.model_data_len_, partition.size); + return FAILED; + } + if (cur_index >= model_contexts_.size()) { + if (cur_index != model_contexts_.size()) { + GELOGE(FAILED, "cur index is %zu make model_contexts_ overflow", cur_index); + return FAILED; + } + OmFileContext tmp_ctx; + tmp_ctx.model_data_len_ += partition.size; + tmp_ctx.partition_datas_.push_back(partition); + model_contexts_.push_back(tmp_ctx); + } else { + model_contexts_[cur_index].model_data_len_ += partition.size; + model_contexts_[cur_index].partition_datas_.push_back(partition); + } + return SUCCESS; +} + Status OmFileSaveHelper::SaveModel(const SaveParam &save_param, const char *output_file, ModelBufferData &model, bool is_offline) { (void)save_param.cert_file; @@ -198,6 +339,10 @@ Status OmFileSaveHelper::SaveModel(const SaveParam &save_param, const char *outp Status OmFileSaveHelper::SaveModelToFile(const char *output_file, ModelBufferData &model, bool is_offline) { #if !defined(NONSUPPORT_SAVE_TO_FILE) + if (context_.partition_datas_.empty()) { + GE_CHK_BOOL_EXEC(!model_contexts_.empty(), return FAILED, "mode contexts empty"); + context_ = model_contexts_.front(); + } uint32_t model_data_len = context_.model_data_len_; if (model_data_len == 0) { GELOGE(domi::PARAM_INVALID, "Model data len error! 
should not be 0"); @@ -231,4 +376,53 @@ Status OmFileSaveHelper::SaveModelToFile(const char *output_file, ModelBufferDat return SUCCESS; #endif } + +Status OmFileSaveHelper::SaveRootModel(const SaveParam &save_param, const char *output_file, + ModelBufferData &model, bool is_offline) { + (void)save_param.cert_file; + (void)save_param.ek_file; + (void)save_param.encode_mode; + (void)save_param.hw_key_file; + (void)save_param.pri_key_file; + +#if !defined(NONSUPPORT_SAVE_TO_FILE) + vector model_partition_tabels; + vector> all_model_partitions; + for (size_t ctx_index = 0; ctx_index < model_contexts_.size(); ++ctx_index) { + auto &cur_ctx = model_contexts_[ctx_index]; + uint32_t cur_model_data_len = cur_ctx.model_data_len_; + if (cur_model_data_len == 0) { + GELOGE(domi::PARAM_INVALID, "Model data len error! should not be 0"); + return domi::PARAM_INVALID; + } + + auto tmp_table = GetPartitionTable(ctx_index); + if (tmp_table == nullptr) { + GELOGE(ge::GE_GRAPH_SAVE_FAILED, "SaveModelToFile execute failed: partition_table is NULL."); + return ge::GE_GRAPH_SAVE_FAILED; + } + uint32_t size_of_table = SIZE_OF_MODEL_PARTITION_TABLE(*tmp_table); + FMK_UINT32_ADDCHECK(size_of_table, cur_model_data_len) + FMK_UINT32_ADDCHECK(size_of_table + cur_model_data_len, model_header_.length) + model_header_.length += size_of_table + cur_model_data_len; + model_partition_tabels.push_back(tmp_table); + all_model_partitions.push_back(cur_ctx.partition_datas_); + GELOGD("sizeof(ModelPartitionTable):%u, cur_model_data_len:%u, cur_context_index:%zu", + size_of_table, cur_model_data_len, ctx_index); + } + Status ret; + if (is_offline) { + ret = FileSaver::SaveToFile(output_file, model_header_, model_partition_tabels, all_model_partitions); + } else { + GELOGW("do not support save ge root model to buff now"); + return FAILED; + } + if (ret == SUCCESS) { + GELOGD("Save model success without encrypt."); + } + return ret; +#else + return SUCCESS; +#endif +} } // namespace ge diff --git 
a/ge/common/op/ge_op_utils.cc b/ge/common/op/ge_op_utils.cc index 579190d6..fc2990b6 100644 --- a/ge/common/op/ge_op_utils.cc +++ b/ge/common/op/ge_op_utils.cc @@ -357,7 +357,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void OpUtils::TransDataHWCK2KCH const char *w_data = (const char *)input; int64_t count = h * w * c * k; - GE_IF_BOOL_EXEC(count <= 0, GELOGW("Count value must be greater than 0, but count = %ld", count); return ); + GE_IF_BOOL_EXEC(count <= 0, GELOGW("Count value must be greater than 0, but count = %ld", count); return); float *buf = new (std::nothrow) float[count](); GE_RT_VOID_CHECK_NOTNULL(buf); float *src_buff = nullptr; diff --git a/ge/common/profiling/ge_profiling.cc b/ge/common/profiling/ge_profiling.cc new file mode 100644 index 00000000..640f77a1 --- /dev/null +++ b/ge/common/profiling/ge_profiling.cc @@ -0,0 +1,199 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "common/profiling/ge_profiling.h" +#include "runtime/base.h" +#include "common/profiling/profiling_manager.h" +#include "framework/common/debug/ge_log.h" +#include "framework/common/debug/log.h" +#include "graph/load/graph_loader.h" +#include "init/gelib.h" +#include "framework/common/ge_inner_error_codes.h" + +namespace { +const uint32_t kDeviceListIndex = 3; +const std::string kDeviceNums = "devNums"; +const std::string kDeviceIdList = "devIdList"; +const std::string kProfilingInit = "prof_init"; +const std::string kProfilingFinalize = "prof_finalize"; +const std::string kProfilingStart = "prof_start"; +const std::string kProfilingStop = "prof_stop"; +const std::string kProfModelSubscribe = "prof_model_subscribe"; +const std::string kProfModelUnsubscribe = "prof_model_cancel_subscribe"; +const std::string kRtSetDeviceRegName = "profiling"; + +const std::map kProfCommandTypeMap = { + {kProfCommandhandleInit, kProfilingInit}, + {kProfCommandhandleStart, kProfilingStart}, + {kProfCommandhandleStop, kProfilingStop}, + {kProfCommandhandleFinalize, kProfilingFinalize}, + {kProfCommandhandleModelSubscribe, kProfModelSubscribe}, + {kProfCommandhandleModelUnsubscribe, kProfModelUnsubscribe}}; +} // namespace + +bool TransProfConfigToParam(const ProfCommandHandleData &profCommand, vector &prof_config_params) { + prof_config_params.clear(); + prof_config_params.emplace_back(kDeviceNums); + prof_config_params.emplace_back(std::to_string(profCommand.devNums)); + prof_config_params.emplace_back(kDeviceIdList); + std::string devID = ""; + if (profCommand.devNums == 0) { + GELOGW("The device num is invalid."); + return false; + } + for (uint32_t i = 0; i < profCommand.devNums; i++) { + devID.append(std::to_string(profCommand.devIdList[i])); + if (i != profCommand.devNums - 1) { + devID.append(","); + } + } + + prof_config_params.push_back(devID); + return true; +} + +bool isProfConfigValid(const uint32_t *deviceid_list, uint32_t device_nums) { + if 
(deviceid_list == nullptr) { + GELOGE(ge::PARAM_INVALID, "deviceIdList is nullptr"); + return false; + } + if (device_nums == 0 || device_nums > MAX_DEV_NUM) { + GELOGE(ge::PARAM_INVALID, "The device nums: %u is invalid.", device_nums); + return false; + } + + // real device num + int32_t dev_count = 0; + rtError_t rt_err = rtGetDeviceCount(&dev_count); + if (rt_err != RT_ERROR_NONE) { + GELOGE(ge::INTERNAL_ERROR, "Get the Device count fail."); + return false; + } + + if (device_nums > static_cast(dev_count)) { + GELOGE(ge::PARAM_INVALID, "Device num(%u) is not in range 1 ~ %d.", device_nums, dev_count); + return false; + } + + std::unordered_set record; + for (size_t i = 0; i < device_nums; ++i) { + uint32_t dev_id = deviceid_list[i]; + if (dev_id >= static_cast(dev_count)) { + GELOGE(ge::PARAM_INVALID, "Device id %u is not in range 0 ~ %d(exclude %d)", dev_id, dev_count, dev_count); + return false; + } + if (record.count(dev_id) > 0) { + GELOGE(ge::PARAM_INVALID, "Device id %u is duplicatedly set", dev_id); + return false; + } + record.insert(dev_id); + } + return true; +} + +ge::Status RegProfCtrlCallback(MsprofCtrlCallback func) { + if (func == nullptr) { + GELOGE(ge::PARAM_INVALID, "Msprof ctrl callback is nullptr."); + return ge::PARAM_INVALID; + } + if (ge::ProfilingManager::Instance().GetMsprofCallback().msprofCtrlCallback != nullptr) { + GELOGW("Msprof ctrl callback is exist, just ignore it."); + } else { + GELOGI("GE register Msprof ctrl callback."); + ge::ProfilingManager::Instance().SetMsprofCtrlCallback(func); + } + return ge::SUCCESS; +} + +ge::Status RegProfSetDeviceCallback(MsprofSetDeviceCallback func) { + if (func == nullptr) { + GELOGE(ge::PARAM_INVALID, "MsprofSetDeviceCallback callback is nullptr."); + return ge::PARAM_INVALID; + } + // Pass MsprofSetDeviceCallback to runtime + GELOGI("GE pass setdevice callback to runtime."); + ge::Status rt_ret = rtRegDeviceStateCallback(kRtSetDeviceRegName.c_str(), static_cast(func)); + if (rt_ret != 
ge::SUCCESS) { + GELOGE(rt_ret, "Pass MsprofSetDeviceCallback to runtime failed!"); + return rt_ret; + } + return ge::SUCCESS; +} + +ge::Status RegProfReporterCallback(MsprofReporterCallback func) { + if (func == nullptr) { + GELOGE(ge::PARAM_INVALID, "MsprofReporterCallback callback is nullptr."); + return ge::PARAM_INVALID; + } + if (ge::ProfilingManager::Instance().GetMsprofCallback().msprofReporterCallback != nullptr) { + GELOGW("Msprof reporter callback is exist, just ignore it."); + } else { + GELOGI("GE register Msprof reporter callback."); + ge::ProfilingManager::Instance().SetMsprofReporterCallback(func); + // Pass MsprofReporterCallback to runtime + ge::Status rt_ret = rtSetMsprofReporterCallback(func); + if (rt_ret != ge::SUCCESS) { + GELOGE(rt_ret, "Pass MsprofReporterCallback to runtime failed!!"); + return rt_ret; + } + // Pass MsprofReporterCallback to hccl + } + return ge::SUCCESS; +} + +ge::Status ProfCommandHandle(ProfCommandHandleType type, void *data, uint32_t len) { + if (type != kProfCommandhandleFinalize) { + GE_CHECK_NOTNULL(data); + } + ProfCommandHandleData *prof_config_param = (ProfCommandHandleData *)data; + auto iter = kProfCommandTypeMap.find(type); + if (iter == kProfCommandTypeMap.end()) { + GELOGW("The prof comand type is invalid."); + return ge::PARAM_INVALID; + } + std::vector prof_params; + if (type == kProfCommandhandleStart || type == kProfCommandhandleStop) { + if (!isProfConfigValid(prof_config_param->devIdList, prof_config_param->devNums)) { + return ge::FAILED; + } + + if (!TransProfConfigToParam(*prof_config_param, prof_params)) { + GELOGE(ge::PARAM_INVALID, "Transfer profilerConfig to string vector failed"); + return ge::PARAM_INVALID; + } + } + ge::GraphLoader graph_loader; + ge::Command command; + command.cmd_params.clear(); + command.cmd_type = iter->second; + command.cmd_params = prof_params; + if (type != kProfCommandhandleFinalize) { + command.module_index = prof_config_param->profSwitch; + } + GELOGI("GE 
commandhandle execute, Command Type: %d, data type config: 0x%llx", type, command.module_index); + if (type == kProfCommandhandleStart || type == kProfCommandhandleStop) { + GELOGI("Profiling device nums:%s , deviceID:[%s]", prof_params[0].c_str(), prof_params[kDeviceListIndex].c_str()); + } + ge::Status ret = graph_loader.CommandHandle(command); + if (ret != ge::SUCCESS) { + GELOGE(ret, "Handle profiling command failed"); + return ge::FAILED; + } + + GELOGI("Successfully execute profiling command type: %d, command 0x%llx.", type, command.module_index); + return ge::SUCCESS; +} + diff --git a/ge/common/profiling/ge_runner_profiling.cc b/ge/common/profiling/ge_runner_profiling.cc new file mode 100644 index 00000000..067aafe3 --- /dev/null +++ b/ge/common/profiling/ge_runner_profiling.cc @@ -0,0 +1,26 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "common/profiling/ge_runner_profiling.h" +#include "init/gelib.h" + +bool IsInitialize() { + std::shared_ptr instance_ptr = ge::GELib::GetInstance(); + if (instance_ptr == nullptr || instance_ptr->InitFlag() == false) { + return false; + } + return true; +} diff --git a/ge/common/profiling/profiling_manager.cc b/ge/common/profiling/profiling_manager.cc index 2f0f061f..456cb0a4 100644 --- a/ge/common/profiling/profiling_manager.cc +++ b/ge/common/profiling/profiling_manager.cc @@ -24,16 +24,9 @@ #include "graph/load/new_model_manager/davinci_model.h" namespace { -const char *const kJobID = "jobID"; -const char *const kDeviceID = "deviceID"; -const char *const kStartCfg = "startCfg"; -const char *const kFeatures = "features"; -const char *const kConf = "conf"; -const char *const kEvents = "events"; -const char *const kAiCoreEvents = "ai_core_events"; -const char *const kName = "name"; -const char *const kTraceID = "traceId"; -const char *const kProfDir = "resultPath"; +const char *const kTrainingTrace = "training_trace"; +const char *const kFpPoint = "fp_point"; +const char *const kBpPoint = "bp_point"; const size_t kReportMaxLen = 2048; const int32_t kMaxDeviceNum = 256; const std::string kConfigNumsdev = "devNums"; @@ -45,7 +38,13 @@ const std::string kProfModelUnsubscribe = "prof_model_cancel_subscribe"; } // namespace namespace ge { -ProfilingManager::ProfilingManager() : subscribe_count_(0) {} +ProfilingManager::ProfilingManager() : is_load_profiling_(false), + is_execute_profiling_(false), + is_training_trace_(false), + subscribe_count_(0) { + prof_cb_.msprofCtrlCallback = nullptr; + prof_cb_.msprofReporterCallback = nullptr; +} ProfilingManager::~ProfilingManager() {} @@ -58,44 +57,29 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ge::Status ProfilingManager::In #ifdef DAVINCI_SUPPORT_PROFILING vector().swap(device_id_); subscribe_count_ = 0; - job_id_ = options.job_id; - - GELOGI("ProfilingManager::Init job_id:%s", job_id_.c_str()); - + 
GELOGI("ProfilingManager::Init job_id:%s", options.job_id.c_str()); - - Status ret; - if (!recv_profiling_config_.empty()) { - GELOGI("Profiling json config from acl:%s", recv_profiling_config_.c_str()); - ret = InitFromAclCfg(recv_profiling_config_); - } else { - ret = InitFromOptions(options); - if (ret == SUCCESS && is_load_profiling_) { - device_id_.push_back(options.device_id); - } - } + struct MsprofGeOptions prof_conf = {{ 0 }}; + Status ret = InitFromOptions(options, prof_conf); if (ret != SUCCESS) { GELOGE(ret, "Failed to init profiling."); return ret; } - if (is_load_profiling_) { - // register Framework to profiling - int result = Msprof::Engine::Init(GE_PROFILING_MODULE, &engine_); - if (result != 0) { - GELOGE(FAILED, "Register profiling engine failed."); - return FAILED; + if (is_execute_profiling_) { + if (prof_cb_.msprofCtrlCallback == nullptr) { + GELOGE(ge::PARAM_INVALID, "MsprofCtrlCallback callback is nullptr."); + return ge::PARAM_INVALID; } - // profiling startup first time - GELOGI("Begin to init profiling, device num %zu", device_id_.size()); - for (size_t i = 0; i < device_id_.size(); ++i) { - ret = StartProfiling(0, device_id_[i]); - if (ret != SUCCESS) { - GELOGW("Profiling start failed on device %d.", device_id_[i]); - continue; - } - GELOGI("Profiling init succ on device %d.", device_id_[i]); + int32_t cb_ret = prof_cb_.msprofCtrlCallback( + static_cast(MsprofCtrlCallbackType::MSPROF_CTRL_INIT_GE_OPTIONS), + static_cast(&prof_conf), sizeof(MsprofGeOptions)); + if (cb_ret != 0) { + GELOGE(FAILED, "Call msprofCtrlCallback failed, type:%u, return:%d", + static_cast(MsprofCtrlCallbackType::MSPROF_CTRL_INIT_GE_OPTIONS), cb_ret); + return FAILED; } + GELOGI("Profiling init success"); } else { GELOGI("The profiling is off, skip the initialization"); } @@ -103,288 +87,116 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ge::Status ProfilingManager::In return SUCCESS; } -FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ge::Status 
ProfilingManager::InitFromAclCfg( - const std::string &config) { +ge::Status ProfilingManager::InitFromOptions(const Options &options, MsprofGeOptions &prof_conf) { #ifdef DAVINCI_SUPPORT_PROFILING - try { - is_load_profiling_ = false; - is_execute_profiling_ = false; - profiling_opts_.clear(); - op_trace_conf_.clear(); - Json start_prof_conf = Json::parse(config); - Json &prof_conf = start_prof_conf[kStartCfg][0]; - job_id_ = prof_conf[kJobID]; - auto iter = prof_conf.find(kProfDir); - if (iter != prof_conf.end()) { - prof_dir_ = prof_conf[kProfDir]; - } - Json &device_id = prof_conf[kDeviceID]; - if (device_id.size() != 0) { - vector().swap(device_id_); - bool is_all = false; - for (size_t i = 0; i < device_id.size(); i++) { - std::string device_id_str = device_id[i].get(); - if (device_id_str == "all") { - is_all = true; - break; - } - device_id_.push_back(std::stoi(device_id_str)); - } - if (is_all) { - int32_t count = 0; - rtError_t rt_err = rtGetDeviceCount(&count); - if (rt_err != RT_ERROR_NONE) { - GELOGE(FAILED, "Call rtGetDeviceCount to get device failed."); - } - - vector().swap(device_id_); - for (int32_t i = 0; i < count; ++i) { - device_id_.push_back(i); - } - } + // enable profiling by env + char env_profiling_mode[MMPA_MAX_PATH] = { 0x00 }; + is_load_profiling_ = false; // Change in ProfInit + is_execute_profiling_ = false; + + if (options.profiling_mode == "1" && !options.profiling_options.empty()) { + // enable profiling by ge option + if (memcpy_s(prof_conf.options, MSPROF_OPTIONS_DEF_LEN_MAX, options.profiling_options.c_str(), + options.profiling_options.size()) != EOK) { + GELOGE(INTERNAL_ERROR, "copy profiling_options failed."); + return INTERNAL_ERROR; } - - Json &features = prof_conf[kFeatures]; - if (ParseFeaturesFromAclCfg(features) != SUCCESS) { - GELOGE(FAILED, "Parse feature from acl cfg failed."); - return FAILED; + is_execute_profiling_ = true; + GELOGI("The profiling in options is %s, %s. 
origin option: %s", options.profiling_mode.c_str(), + prof_conf.options, options.profiling_options.c_str()); + } else { + (void)mmGetEnv("PROFILING_MODE", env_profiling_mode, MMPA_MAX_PATH); + (void)mmGetEnv("PROFILING_OPTIONS", prof_conf.options, MSPROF_OPTIONS_DEF_LEN_MAX); + // The env is invalid + if ((strcmp("true", env_profiling_mode) != 0) || (strcmp(prof_conf.options, "\0") == 0)) { + return SUCCESS; } - is_load_profiling_ = true; + // enable profiling by env is_execute_profiling_ = true; - } catch (...) { - GELOGE(FAILED, "Json conf is not invalid !"); + GELOGI("The profiling in env is %s, %s", env_profiling_mode, prof_conf.options); + } + + if (!is_execute_profiling_) { + return SUCCESS; + } + + // Parse json str for bp fp + Status ret = ParseOptions(prof_conf.options); + if (ret != ge::SUCCESS) { + GELOGE(ge::PARAM_INVALID, "Parse training trace param failed."); return ge::PARAM_INVALID; } + + if (memcpy_s(prof_conf.jobId, sizeof(prof_conf.jobId), options.job_id.c_str(), + sizeof(options.job_id.c_str())) != EOK) { + GELOGE(INTERNAL_ERROR, "copy job_id failed."); + return INTERNAL_ERROR; + } #endif return ge::SUCCESS; } -FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ge::Status ProfilingManager::ParseFeaturesFromAclCfg( - const Json &features) { -#ifdef DAVINCI_SUPPORT_PROFILING +ge::Status ProfilingManager::ParseOptions(const std::string &options) { + if (options.empty()) { + GELOGE(ge::PARAM_INVALID, "Profiling options is empty."); + return ge::PARAM_INVALID; + } try { - for (size_t i = 0; i < features.size(); ++i) { - const Json &feature = features[i]; - if ((feature.find(kName) == feature.end()) || feature[kName].is_null()) { - continue; - } - const std::string &name = feature[kName]; - if (name == "op_trace") { - const Json &conf = feature[kConf]; - const Json &events = conf[0][kEvents]; - const std::string &ai_core_events = events[0][kAiCoreEvents]; - GELOGI("Op trace config from acl ai_core_events:%s", ai_core_events.c_str()); - is_op_trace_ = 
true; - ProfMgrConf prof_mgr_conf; - int result = ProfMgrGetConf(ai_core_events, &prof_mgr_conf); - if (result != 0) { - GELOGE(FAILED, "ProfMgrGetConf failed."); - return FAILED; - } - op_trace_conf_ = prof_mgr_conf.conf; - op_trace_iter_num_ = static_cast(op_trace_conf_.size()); - GELOGI("Op trace profiling iter num %d,", op_trace_iter_num_); - } else if (name == "task_trace") { - is_op_trace_ = false; - if (feature.find(kConf) != feature.end()) { - const Json &conf = feature[kConf]; - std::stringstream task_trace_conf; - task_trace_conf << conf; - task_trace_conf_ = task_trace_conf.str(); - } - GELOGI("Task trace config from acl"); - } else if (name == "system_trace") { - is_op_trace_ = false; - const Json &conf = feature[kConf]; - std::stringstream system_trace_conf; - system_trace_conf << conf; - system_trace_conf_ = system_trace_conf.str(); - GELOGI("System trace config from acl"); - } - profiling_opts_.push_back(name); + Json prof_options = Json::parse(options); + const std::string training_trace = prof_options[kTrainingTrace]; + if (training_trace.empty()) { + GELOGI("Training trace will not take effect."); + return ge::SUCCESS; + } + GELOGI("GE profiling training trace:%s", training_trace.c_str()); + if (training_trace != "on") { + GELOGE(ge::PARAM_INVALID, "Training trace param:%s is invalid.", training_trace.c_str()); + return ge::PARAM_INVALID; + } + fp_point_ = prof_options[kFpPoint]; + bp_point_ = prof_options[kBpPoint]; + if (!fp_point_.empty() && !bp_point_.empty()) { + GELOGI("Training trace bp fp is set, bp_point:%s, fp_point:%s.", bp_point_.c_str(), fp_point_.c_str()); } } catch (...) 
{ - GELOGE(ge::PARAM_INVALID, "Json conf feature is not invalid !"); + GELOGE(FAILED, "Json prof_conf options is invalid."); return ge::PARAM_INVALID; } -#endif return ge::SUCCESS; } -FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ge::Status ProfilingManager::InitFromOptions(const Options &options) { +FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void ProfilingManager::StopProfiling() { #ifdef DAVINCI_SUPPORT_PROFILING - // enable profiling support two ways: env and front end - char profiling_mode_temp[MMPA_MAX_PATH] = { 0x00 }; - char prof_options_temp[MMPA_MAX_PATH] = { 0x00 }; - (void)mmGetEnv("PROFILING_MODE", profiling_mode_temp, MMPA_MAX_PATH); - (void)mmGetEnv("PROFILING_OPTIONS", prof_options_temp, MMPA_MAX_PATH ); - const char *profiling_mode = profiling_mode_temp; - const char *prof_options = prof_options_temp; - if ((profiling_mode == nullptr) || (strcmp("true", profiling_mode) != 0) || (prof_options == nullptr)) { - is_load_profiling_ = false; - is_execute_profiling_ = false; - } else { - std::string prof_options_str = std::string(prof_options); - profiling_opts_ = StringUtils::Split(prof_options_str, ':'); - is_load_profiling_ = true; - is_execute_profiling_ = true; - GELOGI("The profiling in env is %s, %s", profiling_mode, prof_options); - } - if (!is_load_profiling_) { - const std::string enable_profiling = "1"; - if (options.profiling_mode != enable_profiling || options.profiling_options.empty()) { - is_load_profiling_ = false; - is_execute_profiling_ = false; - return SUCCESS; - } else { - profiling_opts_ = StringUtils::Split(options.profiling_options, ':'); - is_load_profiling_ = true; - is_execute_profiling_ = true; - GELOGI("The profiling in options is %s, %s", options.profiling_mode.c_str(), options.profiling_options.c_str()); - } - } - // features:'training_trace', 'task_trace' or 'op_trace' etc - if (!profiling_opts_.empty()) { - if (profiling_opts_[0] == "op_trace") { - is_op_trace_ = true; - // op trace get conf - ProfMgrConf 
prof_mgr_conf; - int result = ProfMgrGetConf("", &prof_mgr_conf); - if (result != 0) { - GELOGE(FAILED, "ProfMgrGetConf failed."); - return FAILED; - } - op_trace_conf_ = prof_mgr_conf.conf; - op_trace_iter_num_ = static_cast(op_trace_conf_.size()); - GELOGI("op trace profiling iter num %d,", op_trace_iter_num_); - } else { - is_op_trace_ = false; - op_trace_iter_num_ = 1; + uint64_t module = GetProfilingModule(); + // The following if case will not be executed in normal case, inc case of ProfStopProfiling is abnormal + int32_t device_num = static_cast(device_id_.size()); + if (device_num != 0) { + auto device_id_ptr = std::unique_ptr(new (std::nothrow) uint32_t[device_num]); + if (device_id_ptr == nullptr) { + GELOGE(FAILED, "Stop profiling: device id ptr is null."); + return; } - } -#endif - return ge::SUCCESS; -} - -FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ge::Status ProfilingManager::StartProfiling(int32_t iter_num, - int32_t device_id) { -#ifdef DAVINCI_SUPPORT_PROFILING - if (!profiling_opts_.empty()) { - GELOGI("Start profiling index is %d", iter_num); - // current one docker only use one device - Json p_device; - - try { - // profiling need physical_device_id - p_device[kDeviceID] = std::to_string(device_id); - p_device[kJobID] = job_id_; - p_device[kTraceID] = std::to_string(GetContext().TraceId()); - if (!prof_dir_.empty()) { - p_device[kProfDir] = prof_dir_; - GELOGI("Prof dir: %s.", prof_dir_.c_str()); - } - - Json features; - if (is_op_trace_) { - Json f; - f[kName] = "op_trace"; - Json conf; - if (op_trace_conf_.size() <= static_cast(iter_num)) { - GELOGE(FAILED, "Op trace iter num is invalid!"); - return FAILED; - } - Json events; - events[0] = nlohmann::json::parse(op_trace_conf_[iter_num]); - conf[0][kEvents] = events; - f[kConf] = conf; - features[0] = f; - if (iter_num == 0) { - is_load_ = true; - } - } else { - for (std::vector::size_type i = 0; i < profiling_opts_.size(); i++) { - Json f; - if (profiling_opts_[i] == "system_trace") { - 
f[kConf] = nlohmann::json::parse(system_trace_conf_); - } else if (profiling_opts_[i] == "task_trace") { - if (!task_trace_conf_.empty()) { - f[kConf] = nlohmann::json::parse(task_trace_conf_); - } - } - f[kName] = profiling_opts_[i]; - features[i] = f; - } - is_load_ = true; - } - p_device[kFeatures] = features; - // only one device, but sProfMgrStartUp API require for device list - Json devices; - devices[0] = p_device; - - Json start_cfg; - start_cfg[kStartCfg] = devices; - - // convert json to string - std::stringstream ss; - ss << start_cfg; - send_profiling_config_ = ss.str(); - GELOGI("Profiling config %s\n", send_profiling_config_.c_str()); - } catch (...) { - GELOGE(FAILED, "Op trace json conf is not invalid !"); - return FAILED; + for (int32_t i = 0; i < device_num; i++) { + device_id_ptr[i] = static_cast(device_id_[i]); } - - // runtime startup for profiling - uint64_t module = GetProfilingModule(); - int32_t device_num = 1; - uint32_t device_id_rt = static_cast(device_id); - GE_CHK_RT_RET(rtProfilerStart(module, device_num, &device_id_rt)); - - // call profiling startup API - ProfMgrCfg prof_cfg = {send_profiling_config_}; - void *prof_handle = ProfMgrStartUp(&prof_cfg); - if (prof_handle == nullptr) { - GELOGW("ProfMgrStartUp failed on device %d ", device_id); - return FAILED; + rtError_t rt_ret = rtProfilerStop(module, device_num, device_id_ptr.get()); + if (rt_ret != RT_ERROR_NONE) { + GELOGW("Call rtProfilerStop failed, ret:%d", rt_ret); } - GELOGD("StartProfiling, prof_handle: %p", prof_handle); - prof_handle_vec_.push_back(prof_handle); } -#endif - return SUCCESS; -} - -FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void ProfilingManager::StopProfiling() { -#ifdef DAVINCI_SUPPORT_PROFILING - Msprof::Engine::Reporter *reporter = PluginImpl::GetPluginReporter(); - if (reporter != nullptr) { - int ret = reporter->Flush(); - GELOGI("Report data end, ret is %d", ret); + + // stop profiling + if (prof_cb_.msprofCtrlCallback == nullptr) { + 
GELOGE(ge::PARAM_INVALID, "MsprofCtrlCallback callback is nullptr."); + return; } - uint64_t module = GetProfilingModule(); - int32_t device_num = static_cast(device_id_.size()); - auto device_id_ptr = std::unique_ptr(new (std::nothrow) uint32_t[device_num]); - if (device_id_ptr == nullptr) { - GELOGE(FAILED, "Stop profiling: device id ptr is null."); + int32_t cb_ret = prof_cb_.msprofCtrlCallback(static_cast(MsprofCtrlCallbackType::MSPROF_CTRL_FINALIZE), + nullptr, 0); + if (cb_ret != 0) { + GELOGW("call msprofCtrlCallback failed, type:%u, return:%d", + static_cast(MsprofCtrlCallbackType::MSPROF_CTRL_FINALIZE), cb_ret); return; } - for (int32_t i = 0; i < device_num; i++) { - device_id_ptr[i] = static_cast(device_id_[i]); - } - rtError_t rt_ret = rtProfilerStop(module, device_num, device_id_ptr.get()); - if (rt_ret != RT_ERROR_NONE) { - GELOGW("Call rtProfilerStop failed, ret:%d", rt_ret); - } - - for (size_t i = 0; i < prof_handle_vec_.size(); ++i) { - int result = ProfMgrStop(prof_handle_vec_[i]); - if (result != 0) { - GELOGW("ProfMgr stop return fail:%d, handle:%p", result, prof_handle_vec_[i]); - } - } - vector().swap(prof_handle_vec_); - is_load_ = false; - recv_profiling_config_ = ""; GELOGI("Stop Profiling success."); #endif } @@ -392,12 +204,6 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void ProfilingManager::StopProf FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void ProfilingManager::ProfilingTaskDescInfo( uint32_t model_id, const std::vector &task_desc_info, const int32_t &device_id) { #ifdef DAVINCI_SUPPORT_PROFILING - Msprof::Engine::Reporter *reporter = PluginImpl::GetPluginReporter(); - if (reporter == nullptr) { - GELOGI("Profiling report is nullptr!"); - return; - } - std::string data; for (const auto &task : task_desc_info) { std::string model_name = task.model_name; @@ -412,7 +218,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void ProfilingManager::Profilin .append(std::to_string(stream_id)).append(" ") 
.append(std::to_string(model_id)).append("\n")); - Msprof::Engine::ReporterData reporter_data{}; + ReporterData reporter_data{}; reporter_data.deviceId = device_id; reporter_data.data = (unsigned char *)data.c_str(); reporter_data.dataLen = data.size(); @@ -422,9 +228,9 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void ProfilingManager::Profilin return; } - ret = reporter->Report(&reporter_data); - if (ret != SUCCESS) { - GELOGE(ret, "Reporter data of task_desc_info fail!"); + int32_t cb_ret = CallMsprofReport(reporter_data); + if (cb_ret != 0) { + GELOGE(cb_ret, "Reporter data of task_desc_info failed, ret:%d", cb_ret); return; } } @@ -436,9 +242,6 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void ProfilingManager::Profilin FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void ProfilingManager::ProfilingGraphDescInfo( uint32_t model_id, const std::vector &compute_graph_desc_info, const int32_t &device_id) { #ifdef DAVINCI_SUPPORT_PROFILING - Msprof::Engine::Reporter *reporter = PluginImpl::GetPluginReporter(); - GE_IF_BOOL_EXEC(reporter == nullptr, GELOGI("Profiling report is nullptr!"); return;); - std::string data; for (const auto &graph : compute_graph_desc_info) { data.append("model_name:") @@ -493,64 +296,52 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void ProfilingManager::Profilin } data.append(" model_id:").append(std::to_string(model_id)); - data.append("\n"); - Msprof::Engine::ReporterData reporter_data{}; - Report(device_id, data, *reporter, reporter_data); - + GraphDescReport(device_id, data); data.clear(); } #endif } -FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void ProfilingManager::Report( - const int32_t &device_id, const string &data, Msprof::Engine::Reporter &reporter, - Msprof::Engine::ReporterData &reporter_data) { +void ProfilingManager::GraphDescReport(const int32_t &device_id, const string &data) { #ifdef DAVINCI_SUPPORT_PROFILING + ReporterData reporter_data{}; + int ret = -1; + int32_t cb_ret = -1; size_t index = 
data.size() / kReportMaxLen; if (index >= 1) { reporter_data.deviceId = device_id; - int ret = memcpy_s(reporter_data.tag, MSPROF_ENGINE_MAX_TAG_LEN + 1, "graph_desc_info", sizeof("graph_desc_info")); + ret = memcpy_s(reporter_data.tag, MSPROF_ENGINE_MAX_TAG_LEN + 1, "graph_desc_info", sizeof("graph_desc_info")); GE_IF_BOOL_EXEC(ret != EOK, GELOGE(ret, "Report data tag of graph_desc_info memcpy error!"); return;); for (size_t i = 0; i < index; ++i) { reporter_data.data = (unsigned char *)data.c_str() + kReportMaxLen * i; reporter_data.dataLen = kReportMaxLen; - ret = reporter.Report(&reporter_data); - GE_IF_BOOL_EXEC(ret != SUCCESS, GELOGE(ret, "Reporter data of graph_desc_info fail!"); return;); + cb_ret = CallMsprofReport(reporter_data); + GE_IF_BOOL_EXEC(cb_ret != 0, GELOGE(cb_ret, "Reporter data of graph_desc_info failed, ret:%d", cb_ret); return;); } reporter_data.dataLen = data.size() - kReportMaxLen * index; if (reporter_data.dataLen != 0) { reporter_data.data = (unsigned char *)data.c_str() + kReportMaxLen * index; - ret = reporter.Report(&reporter_data); - GE_IF_BOOL_EXEC(ret != SUCCESS, GELOGE(ret, "Reporter data of graph_desc_info fail!"); return;); + cb_ret = CallMsprofReport(reporter_data); + GE_IF_BOOL_EXEC(cb_ret != 0, GELOGE(cb_ret, "Reporter data of graph_desc_info failed, ret:%d", cb_ret); return;); } } else { reporter_data.deviceId = device_id; reporter_data.data = (unsigned char *)data.c_str(); reporter_data.dataLen = data.size(); - int ret = memcpy_s(reporter_data.tag, MSPROF_ENGINE_MAX_TAG_LEN + 1, "graph_desc_info", sizeof("graph_desc_info")); + ret = memcpy_s(reporter_data.tag, MSPROF_ENGINE_MAX_TAG_LEN + 1, "graph_desc_info", sizeof("graph_desc_info")); GE_IF_BOOL_EXEC(ret != EOK, GELOGE(ret, "Report data tag of graph_desc_info memcpy error!"); return;); - ret = reporter.Report(&reporter_data); - GE_IF_BOOL_EXEC(ret != SUCCESS, GELOGE(ret, "Reporter data of graph_desc_info fail!"); return;); - } -#endif -} - -FMK_FUNC_HOST_VISIBILITY 
FMK_FUNC_DEV_VISIBILITY void ProfilingManager::PluginUnInit(const std::string &module) const { -#ifdef DAVINCI_SUPPORT_PROFILING - int ret = Msprof::Engine::UnInit(module); - if (ret != SUCCESS) { - GELOGE(ret, "profiling plugin uninit failed, ret:%d", ret); + cb_ret = CallMsprofReport(reporter_data); + GE_IF_BOOL_EXEC(cb_ret != 0, GELOGE(cb_ret, "Reporter data of graph_desc_info failed, ret:%d", cb_ret); return;); } #endif } FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void ProfilingManager::ReportProfilingData( uint32_t model_id, const std::vector &task_desc_info, - const std::vector &compute_graph_desc_info, - bool check_device) { + const std::vector &compute_graph_desc_info) { #ifdef DAVINCI_SUPPORT_PROFILING int32_t logic_device_id = 0; rtError_t rt_ret = rtGetDevice(&logic_device_id); @@ -559,13 +350,6 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void ProfilingManager::ReportPr return; } GELOGD("current logic_device_id:%d", logic_device_id); - if (check_device) { - auto ret = std::find(device_id_.begin(), device_id_.end(), logic_device_id); - if (ret == device_id_.end()) { - GELOGE(FAILED, "get valid phy_device_id failed, profiling report failed."); - return; - } - } GELOGD("start ProfilingTaskDescInfo."); ProfilingTaskDescInfo(model_id, task_desc_info, logic_device_id); GELOGD("start ProfilingGraphDescInfo."); @@ -574,11 +358,6 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void ProfilingManager::ReportPr #endif } -FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void ProfilingManager::SetProfilingConfig( - const std::string &profiling_cfg) { - recv_profiling_config_ = profiling_cfg; -} - FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY uint64_t ProfilingManager::GetProfilingModule() { uint64_t module = PROF_MODEL_EXECUTE_MASK | PROF_RUNTIME_API_MASK | @@ -594,9 +373,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY uint64_t ProfilingManager::GetP return module; } -void ProfilingManager::UpdateSubscribeDeviceModuleMap(std::string 
prof_type, - uint32_t device_id, - uint64_t module) { +void ProfilingManager::UpdateSubscribeDeviceModuleMap(std::string prof_type, uint32_t device_id, uint64_t module) { #ifdef DAVINCI_SUPPORT_PROFILING if (prof_type == kProfModelSubscribe) { if (subs_dev_module_.find(device_id) != subs_dev_module_.end()) { @@ -608,9 +385,13 @@ void ProfilingManager::UpdateSubscribeDeviceModuleMap(std::string prof_type, subs_dev_module_[device_id] = dev_info; } } else if (prof_type == kProfModelUnsubscribe) { - if (subs_dev_module_.find(device_id) != subs_dev_module_.end()) { - if (subs_dev_module_[device_id].subscribe_count > 0) { - subs_dev_module_[device_id].subscribe_count--; + auto iter = subs_dev_module_.find(device_id); + if (iter != subs_dev_module_.end()) { + if (iter->second.subscribe_count > 0) { + iter->second.subscribe_count--; + } + if (iter->second.subscribe_count == 0) { + subs_dev_module_.erase(iter); } } } else { @@ -626,10 +407,11 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ProfilingManager::ProfMo uint64_t model_load_mask = module & PROF_MODEL_LOAD_MASK; if ((subscribe_count_ == 0) && (model_load_mask == PROF_MODEL_LOAD_MASK)) { // register framework to profiling - int32_t result = Msprof::Engine::Init(GE_PROFILING_MODULE, &engine_); - if (result != SUCCESS) { - GELOGE(FAILED, "Register profiling engine failed."); - return FAILED; + // register Framework to profiling + int32_t cb_ret = PluginInit(); + if (cb_ret != 0) { + GELOGE(cb_ret, "profiling plugin init failed, ret:%d", cb_ret); + return cb_ret; } GELOGI("Prof subscribe: model load profiling on."); } @@ -647,7 +429,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ProfilingManager::ProfMo UpdateSubscribeDeviceModuleMap(kProfModelSubscribe, device[0], module); // Report profiling data - Status p_ret = davinci_model->ReportProfilingData(false); + Status p_ret = davinci_model->ReportProfilingData(); if (p_ret != SUCCESS) { GELOGE(p_ret, "Report profiling data failed."); return 
p_ret; @@ -672,6 +454,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ProfilingManager::ProfMo auto iter = subs_dev_module_.find(device[0]); if (iter != subs_dev_module_.end()) { if (subs_dev_module_[device[0]].subscribe_count == 1) { + // The same device_id, only stop at last time rtError_t rt_ret = rtProfilerStop(subs_dev_module_[device[0]].module, dev_num, device); if (rt_ret != RT_ERROR_NONE) { GELOGE(FAILED, "Runtime profiler stop failed."); @@ -679,15 +462,15 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ProfilingManager::ProfMo } } UpdateSubscribeDeviceModuleMap(kProfModelUnsubscribe, device[0], subs_dev_module_[device[0]].module); + } else { + GELOGE(FAILED, "The device_id:%u has not been subscribed, do not need to cancel.", device[0]); + return FAILED; } subscribe_count_--; if (subscribe_count_ == 0) { - int32_t ret = Msprof::Engine::UnInit(GE_PROFILING_MODULE); - if (ret != SUCCESS) { - GELOGE(ret, "Profiling plugin uninit failed, ret:%d", ret); - return ret; - } + // profiling plugin uninit at last subscription + PluginUnInit(); } #endif return SUCCESS; @@ -700,11 +483,12 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ProfilingManager::ProfIn if (model_load_mask == PROF_MODEL_LOAD_MASK) { // register Framework to profiling - int32_t result = Msprof::Engine::Init(GE_PROFILING_MODULE, &engine_); - if (result != SUCCESS) { - GELOGE(FAILED, "Register profiling engine failed."); - return FAILED; + int32_t cb_ret = PluginInit(); + if (cb_ret != 0) { + GELOGE(cb_ret, "profiling plugin init failed, ret:%d", cb_ret); + return cb_ret; } + int32_t device_num = -1; rtError_t rt_ret = rtProfilerStart(model_load_mask, device_num, nullptr); if (rt_ret != RT_ERROR_NONE) { @@ -719,7 +503,6 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ProfilingManager::ProfIn if (training_trace_mask == PROF_TRAINING_TRACE_MASK) { is_training_trace_ = true; } - is_acl_api_mode_ = true; GELOGI("Prof init success."); #endif return SUCCESS; 
@@ -730,19 +513,17 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ProfilingManager::ProfFi std::lock_guard lock(mutex_); is_load_profiling_ = false; is_training_trace_ = false; - is_acl_api_mode_ = false; + is_execute_profiling_ = false; + + // profiling plugin uninit + PluginUnInit(); - int32_t ret = Msprof::Engine::UnInit(GE_PROFILING_MODULE); - if (ret != SUCCESS) { - GELOGE(ret, "Profiling plugin uninit failed, ret:%d", ret); - } int32_t dev_num = -1; rtError_t rt_ret = rtProfilerStop(PROF_MODEL_LOAD_MASK, dev_num, nullptr); if (rt_ret != RT_ERROR_NONE) { GELOGE(FAILED, "Runtime profiler stop failed."); return FAILED; } - for (auto device_id_module : device_id_module_map_) { if (device_id_module.second != 0) { uint32_t device_id = static_cast(device_id_module.first); @@ -792,6 +573,7 @@ Status ProfilingManager::ProfParseDeviceId(const std::map return FAILED; } catch (std::out_of_range &) { GELOGE(FAILED, "Device num: %s is out of range.", iter->second.c_str()); + return FAILED; } catch (...) 
{ GELOGE(FAILED, "Device num: %s cannot change to int.", iter->second.c_str()); return FAILED; @@ -859,7 +642,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ProfilingManager::ProfSt for (int32_t i = 0; i < device_num; i++) { device_id_ptr[i] = static_cast(device_list[i]); } - GELOGD("Runtime config param: 0x%llx, device num: %d.", module, device_num); + GELOGI("Runtime config param: 0x%llx, device num: %d.", module, device_num); rtError_t rt_ret = rtProfilerStart(module, device_num, device_id_ptr.get()); if (rt_ret != RT_ERROR_NONE) { @@ -878,7 +661,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ProfilingManager::ProfSt GELOGW("Prof start: load model module is invalid."); } UpdateDeviceIdModuleMap(kProfStart, module, device_list); - GELOGD("Prof start profiling success."); + GELOGI("Prof start profiling success."); #endif return SUCCESS; } @@ -901,7 +684,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ProfilingManager::ProfSt for (int32_t i = 0; i < device_num; i++) { device_id_ptr[i] = static_cast(device_list[i]); } - GELOGD("Prof stop: runtime config param: 0x%llx, device num: %d", module, device_num); + GELOGI("Prof stop: runtime config param: 0x%llx, device num: %d", module, device_num); rtError_t rt_ret = rtProfilerStop(module, device_num, device_id_ptr.get()); if (rt_ret != RT_ERROR_NONE) { GELOGE(FAILED, "Prof stop: runtime profiler config proc failed."); @@ -921,7 +704,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ProfilingManager::ProfSt GELOGW("Prof stop: load model module is invalid."); } UpdateDeviceIdModuleMap(kProfStop, module, device_list); - GELOGD("Prof stop profiling success."); + GELOGI("Prof stop profiling success."); #endif return SUCCESS; } @@ -963,47 +746,90 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY bool ProfilingManager::Profilin if (rt_ret != RT_ERROR_NONE) { GELOGE(rt_ret, "Runtime get logic_device_id failed, current logic_device_id:%d", logic_device_id); } - GELOGD("Current 
logic_device_id:%d", logic_device_id); + GELOGI("Current logic_device_id:%d", logic_device_id); bool execute_model_prof_on = false; auto iter = std::find(device_id_.begin(), device_id_.end(), logic_device_id); if (iter != device_id_.end()) { execute_model_prof_on = true; } - GELOGD("Flag is_execute_profiling: %d, execute_model_prof_on: %d", is_execute_profiling_, execute_model_prof_on); - return is_execute_profiling_ || execute_model_prof_on; + GELOGI("Flag is_execute_profiling: %d, execute_model_prof_on: %d", is_execute_profiling_, execute_model_prof_on); + return execute_model_prof_on; } -/** - * @brief Profiling PluginImpl - */ -// PluginImpl static variable init -Msprof::Engine::Reporter *PluginImpl::reporter_ = nullptr; - -PluginImpl::PluginImpl(const std::string &module) : module_(module) { GELOGI("Create PluginImpl\n"); } - -int PluginImpl::Init(const Msprof::Engine::Reporter *reporter) { - GELOGI("PluginImpl init"); - reporter_ = const_cast(reporter); - return 0; +FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ProfilingManager::PluginInit() const { + if (prof_cb_.msprofReporterCallback == nullptr) { + GELOGE(ge::PARAM_INVALID, "MsprofReporterCallback callback is nullptr."); + return ge::PARAM_INVALID; + } + return prof_cb_.msprofReporterCallback( + static_cast(MsprofReporterModuleId::MSPROF_MODULE_FRAMEWORK), + static_cast(MsprofReporterCallbackType::MSPROF_REPORTER_INIT), + nullptr, 0); } -int PluginImpl::UnInit() { - GELOGI("PluginImpl Uninit"); - reporter_ = nullptr; - return 0; +FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void ProfilingManager::PluginUnInit() const { +#ifdef DAVINCI_SUPPORT_PROFILING + if (prof_cb_.msprofReporterCallback == nullptr) { + GELOGE(ge::PARAM_INVALID, "MsprofReporterCallback callback is nullptr."); + return; + } + int32_t cb_ret = prof_cb_.msprofReporterCallback( + static_cast(MsprofReporterModuleId::MSPROF_MODULE_FRAMEWORK), + static_cast(MsprofReporterCallbackType::MSPROF_REPORTER_UNINIT), + nullptr, 0); + if 
(cb_ret != 0) { + GELOGW("profiling plugin uninit failed, ret:%d", cb_ret); + } +#endif } -Msprof::Engine::PluginIntf *ProfilingEngineImpl::CreatePlugin() { - GELOGI(" Create Plugin"); - return new (std::nothrow) PluginImpl(GE_PROFILING_MODULE); +FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ProfilingManager::CallMsprofReport( + ReporterData &reporter_data) const { + if (prof_cb_.msprofReporterCallback == nullptr) { + GELOGE(ge::PARAM_INVALID, "MsprofReporterCallback callback is nullptr."); + return ge::PARAM_INVALID; + } + return prof_cb_.msprofReporterCallback( + static_cast(MsprofReporterModuleId::MSPROF_MODULE_FRAMEWORK), + static_cast(MsprofReporterCallbackType::MSPROF_REPORTER_REPORT), + static_cast(&reporter_data), sizeof(ReporterData)); } -int ProfilingEngineImpl::ReleasePlugin(Msprof::Engine::PluginIntf *plugin) { - if (plugin != nullptr) { - delete plugin; - plugin = nullptr; +FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void ProfilingManager::GetFpBpPoint( + std::string &fp_point, std::string &bp_point) { + // Env or options mode, fp_point_/bp_point_ have initiliazed on profiling init + if (!fp_point_.empty() && !bp_point_.empty()) { + fp_point = fp_point_; + bp_point = bp_point_; + GELOGI("Bp Fp have been initialized in env or options. 
bp_point: %s, fp_point: %s", bp_point.c_str(), fp_point.c_str()); + return; + } + // ProfApi mode and training trace is set + try { + char env_profiling_options[MSPROF_OPTIONS_DEF_LEN_MAX] = { 0x00 }; + INT32 ret = mmGetEnv("PROFILING_OPTIONS", env_profiling_options, MSPROF_OPTIONS_DEF_LEN_MAX); + if (ret != EN_OK) { + GELOGI("PROFILING_OPTIONS env is not exist."); + return; + } + GELOGI("Parse env PROFILING_OPTIONS:%s.", env_profiling_options); + Json prof_options = Json::parse(env_profiling_options); + + fp_point_ = prof_options[kFpPoint]; + bp_point_ = prof_options[kBpPoint]; + + fp_point = fp_point_; + bp_point = bp_point_; + if (!fp_point_.empty() && !bp_point_.empty()) { + GELOGI("Training trace bp fp is set, bp_point:%s, fp_point:%s.", bp_point_.c_str(), fp_point_.c_str()); + } + } catch (...) { + GELOGE(FAILED, "Json prof options is invalid."); + return; } - return 0; + return; } + + } // namespace ge diff --git a/ge/common/profiling/profiling_manager.h b/ge/common/profiling/profiling_manager.h index 66cefc32..5fa4fac4 100755 --- a/ge/common/profiling/profiling_manager.h +++ b/ge/common/profiling/profiling_manager.h @@ -26,9 +26,7 @@ #include "framework/common/ge_inner_error_codes.h" #include "framework/common/ge_types.h" #include "external/register/register_types.h" -#include "toolchain/prof_engine.h" -#include "toolchain/prof_mgr_core.h" -#include "toolchain/prof_acl_api.h" +#include "toolchain/prof_callback.h" using std::map; using std::string; @@ -37,35 +35,33 @@ using Json = nlohmann::json; namespace { const std::string GE_PROFILING_MODULE = "Framework"; + // DataTypeConfig MASK + #define PROF_ACL_API_MASK 0x0001 + #define PROF_TASK_TIME_MASK 0x0002 + #define PROF_AICORE_METRICS_MASK 0x0004 + #define PROF_AICPU_TRACE_MASK 0x0008 + #define PROF_MODEL_EXECUTE_MASK 0x0010 + #define PROF_RUNTIME_API_MASK 0x0020 + #define PROF_RUNTIME_TRACE_MASK 0x0040 + #define PROF_SCHEDULE_TIMELINE_MASK 0x0080 + #define PROF_SCHEDULE_TRACE_MASK 0x0100 + #define 
PROF_AIVECTORCORE_METRICS_MASK 0x0200 + #define PROF_SUBTASK_TIME_MASK 0x0400 + #define PROF_TRAINING_TRACE_MASK 0x0800 + #define PROF_HCCL_TRACE_MASK 0x1000 + #define PROF_DATA_PROCESS_MASK 0x2000 + #define PROF_MODEL_LOAD_MASK 0x8000000000000000 + } // namespace namespace ge { struct DeviceSubsInfo { uint64_t module; uint32_t subscribe_count; }; -// register Plugin -class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY PluginImpl : public Msprof::Engine::PluginIntf { - public: - explicit PluginImpl(const std::string &module); - ~PluginImpl() {} - - int Init(const Msprof::Engine::Reporter *reporter); - int UnInit(); - static Msprof::Engine::Reporter *GetPluginReporter() { return reporter_; } - private: - static Msprof::Engine::Reporter *reporter_; - std::string module_; -}; - -// register Engine -class ProfilingEngineImpl : public Msprof::Engine::EngineIntf { - public: - ProfilingEngineImpl() {} - ~ProfilingEngineImpl() {} - - Msprof::Engine::PluginIntf *CreatePlugin(); - int ReleasePlugin(Msprof::Engine::PluginIntf *plugin); +struct MsprofCallback { + MsprofCtrlCallback msprofCtrlCallback; + MsprofReporterCallback msprofReporterCallback; }; class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ProfilingManager { @@ -73,68 +69,54 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ProfilingManager { ProfilingManager(); virtual ~ProfilingManager(); static ProfilingManager &Instance(); - ge::Status Init(const Options &options); - ge::Status InitFromOptions(const Options &options); - ge::Status InitFromAclCfg(const std::string &config); - ge::Status StartProfiling(int32_t iter, int32_t device_id); - void UpdateSubscribeDeviceModuleMap(std::string prof_type, uint32_t device_id, uint64_t module); - ge::Status ProfModelSubscribe(uint64_t module, void *model); - ge::Status ProfModelUnsubscribe(void *model); - ge::Status ProfInit(uint64_t module); - ge::Status ProfFinalize(); - ge::Status ProfStartProfiling(uint64_t module, const std::map &config_para); - 
ge::Status ProfStopProfiling(uint64_t module, const std::map &config_para); + Status Init(const Options &options); + Status ProfInit(uint64_t module); + Status ProfFinalize(); + Status ProfStartProfiling(uint64_t module, const std::map &config_para); + Status ProfStopProfiling(uint64_t module, const std::map &config_para); + Status ProfModelSubscribe(uint64_t module, void *model); + Status ProfModelUnsubscribe(void *model); void StopProfiling(); - bool ProfilingOpTraceOn() const { return is_op_trace_; } - bool ProfilingLoadFlag() const { return is_load_; } bool ProfilingTrainingTraceOn() const { return is_training_trace_; } bool ProfilingModelLoadOn() const { return is_load_profiling_; } bool ProfilingModelExecuteOn() const; - bool ProfilingOn() const { return is_load_profiling_ && is_execute_profiling_; } // only used by command pattern - bool IsAclApiMode() const { return is_acl_api_mode_; } - int32_t GetOpTraceIterNum() const { return op_trace_iter_num_; } + bool ProfilingOn() const { return is_load_profiling_ && is_execute_profiling_; } // is_execute_profiling_ only used by ge option and env void ReportProfilingData(uint32_t model_id, const std::vector &task_desc_info, - const std::vector &compute_graph_desc_info, - bool check_device); - void Report(const int32_t &device_id, const string &data, Msprof::Engine::Reporter &reporter, - Msprof::Engine::ReporterData &reporter_data); + const std::vector &compute_graph_desc_info); void ProfilingTaskDescInfo(uint32_t model_id, const std::vector &task_desc_info, const int32_t &device_id); void ProfilingGraphDescInfo(uint32_t model_id, const std::vector &compute_graph_desc_info, const int32_t &device_id); - void SetProfilingConfig(const string &profiling_cfg); - vector GetProfilingDeviceId() const { return device_id_; } - void PluginUnInit(const std::string &module) const; + Status PluginInit() const; + void PluginUnInit() const; + Status CallMsprofReport(ReporterData &reporter_data) const; + struct MsprofCallback 
&GetMsprofCallback() { return prof_cb_; } + void SetMsprofCtrlCallback(MsprofCtrlCallback func) { prof_cb_.msprofCtrlCallback = func; } + void SetMsprofReporterCallback(MsprofReporterCallback func) { prof_cb_.msprofReporterCallback = func; } + void GetFpBpPoint(std::string &fp_point, std::string &bp_point); private: - ge::Status ParseFeaturesFromAclCfg(const Json &feature); - ge::Status ProfParseParam(const std::map &config_para, int32_t &device_num, - vector &device_list); - ge::Status ProfParseDeviceId(const std::map &config_para, + Status InitFromOptions(const Options &options, MsprofGeOptions &prof_conf); + Status ParseOptions(const std::string &options); + Status ProfParseParam(const std::map &config_para, int32_t &device_num, + vector &device_list); + Status ProfParseDeviceId(const std::map &config_para, vector &device_list); uint64_t GetProfilingModule(); + void GraphDescReport(const int32_t &device_id, const string &data); void UpdateDeviceIdModuleMap(string prof_type, uint64_t module, const vector &device_list); - bool is_load_profiling_ = false; - bool is_execute_profiling_ = false; - bool is_op_trace_ = false; - bool is_load_ = false; - bool is_training_trace_ = false; - bool is_acl_api_mode_ = false; - int32_t op_trace_iter_num_ = 0; - string job_id_; - string prof_dir_; + void UpdateSubscribeDeviceModuleMap(std::string prof_type, uint32_t device_id, uint64_t module); + + bool is_load_profiling_; + bool is_execute_profiling_; + bool is_training_trace_; vector device_id_; - vector op_trace_conf_; - vector profiling_opts_; - vector prof_handle_vec_; - string recv_profiling_config_; - string send_profiling_config_; - string system_trace_conf_; - string task_trace_conf_; - const ProfilingEngineImpl engine_; map device_id_module_map_; // key: device_id, value: profiling on module map subs_dev_module_; // key: device_id, value: profiling on module uint32_t subscribe_count_; std::mutex mutex_; + MsprofCallback prof_cb_; + std::string fp_point_; + std::string 
bp_point_; }; } // namespace ge #endif // GE_COMMON_PROFILING_PROFILING_MANAGER_H_ diff --git a/ge/common/types.cc b/ge/common/types.cc index 54dc769f..1cc70347 100644 --- a/ge/common/types.cc +++ b/ge/common/types.cc @@ -801,7 +801,7 @@ const uint32_t XRGB_CHN_NUM = 4; /// const bool DEFAULT_GLOBAL_POOLING = false; -const uint32_t MODEL_VERSION = 0x10000000; ///< Model version 1.0/// +const uint32_t MODEL_VERSION = 0x20000000; ///< Model version 2.0/// // Eltwise's input size const int ELTWISE_MIN_INPUT_SIZE = 2; diff --git a/ge/common/util.cc b/ge/common/util.cc index 480be3c1..0a343a83 100644 --- a/ge/common/util.cc +++ b/ge/common/util.cc @@ -51,14 +51,15 @@ namespace { * If such an exception is encountered during operation, * the proto file can be divided into several small files or the limit value can be increased. */ -const int kProtoReadBytesLimit = INT_MAX; // Max size of 2 GB minus 1 byte. -const int kWarningThreshold = 536870912 * 2; // 536870912 represent 512M +const int kFileSizeOutLimitedOrOpenFailed = -1; +const int kProtoReadBytesLimit = INT_MAX; // Max size of 2 GB minus 1 byte. +const int kWarningThreshold = 1073741824; // 536870912 * 2 536870912 represent 512M /// The maximum length of the file. -const uint32_t kMaxFileSizeLimit = UINT32_MAX; // 4G for now +const uint32_t kMaxFileSizeLimit = UINT32_MAX; // 4G for now const int kMaxBuffSize = 256; const char *const kPathValidReason = "The path can only contain 'a-z' 'A-Z' '0-9' '-' '.' 
'_' and chinese character"; -constexpr uint32_t kMaxConfigFileByte = 10 * 1024 * 1024; +constexpr uint32_t kMaxConfigFileByte = 10485760; // 10 * 1024 * 1024 } // namespace namespace ge { @@ -76,7 +77,8 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY bool ReadProtoFromBinaryFile(co std::string real_path = RealPath(file); GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(real_path.empty(), return false, "pb file path '%s' not valid", file); - GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(GetFileLength(real_path) == -1, return false, "file size not valid."); + GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(GetFileLength(real_path) == kFileSizeOutLimitedOrOpenFailed, return false, + "file size not valid."); std::ifstream fs(real_path, std::ifstream::in | std::ifstream::binary); if (!fs.is_open()) { @@ -118,20 +120,20 @@ long GetFileLength(const std::string &input_file) { GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(real_path.empty(), return -1, "input_file path '%s' not valid", input_file.c_str()); unsigned long long file_length = 0; GE_CHK_BOOL_TRUE_EXEC_WITH_LOG( - mmGetFileSize(input_file.c_str(), &file_length) != EN_OK, - ErrorManager::GetInstance().ATCReportErrMessage("E19001", {"file", "errmsg"}, {input_file, strerror(errno)}); - return -1, "Open file[%s] failed. %s", input_file.c_str(), strerror(errno)); + mmGetFileSize(input_file.c_str(), &file_length) != EN_OK, + ErrorManager::GetInstance().ATCReportErrMessage("E19001", {"file", "errmsg"}, {input_file, strerror(errno)}); + return kFileSizeOutLimitedOrOpenFailed, "Open file[%s] failed. 
%s", input_file.c_str(), strerror(errno)); GE_CHK_BOOL_TRUE_EXEC_WITH_LOG((file_length == 0), ErrorManager::GetInstance().ATCReportErrMessage("E19015", {"filepath"}, {input_file}); return -1, "File[%s] size is 0, not valid.", input_file.c_str()); - GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(file_length > kMaxFileSizeLimit, - ErrorManager::GetInstance().ATCReportErrMessage( - "E19016", {"filepath", "filesize", "maxlen"}, - {input_file, std::to_string(file_length), std::to_string(kMaxFileSizeLimit)}); - return -1, "File[%s] size %lld is out of limit: %d.", input_file.c_str(), file_length, - kMaxFileSizeLimit); + GE_CHK_BOOL_TRUE_EXEC_WITH_LOG( + file_length > kMaxFileSizeLimit, ErrorManager::GetInstance().ATCReportErrMessage( + "E19016", {"filepath", "filesize", "maxlen"}, + {input_file, std::to_string(file_length), std::to_string(kMaxFileSizeLimit)}); + return kFileSizeOutLimitedOrOpenFailed, "File[%s] size %lld is out of limit: %d.", input_file.c_str(), file_length, + kMaxFileSizeLimit); return static_cast(file_length); } @@ -187,7 +189,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY bool ReadBytesFromBinaryFile(co std::streamsize size = file.tellg(); GE_CHK_BOOL_TRUE_EXEC_WITH_LOG((size <= 0), file.close(); return false, "file length <= 0, not valid."); - GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(size > static_cast(kMaxFileSizeLimit), file.close(); + GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(size > static_cast(kMaxFileSizeLimit), file.close(); return false, "file size %ld is out of limit: %d.", size, kMaxFileSizeLimit); file.seekg(0, std::ios::beg); // [no need to check value] @@ -210,8 +212,8 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY int CreateDirectory(const std:: GE_CHK_BOOL_EXEC(!directory_path.empty(), return -1, "directory path is empty."); auto dir_path_len = directory_path.length(); if (dir_path_len >= MMPA_MAX_PATH) { - ErrorManager::GetInstance().ATCReportErrMessage( - "E19002", {"filepath", "size"}, {directory_path, std::to_string(MMPA_MAX_PATH)}); + 
ErrorManager::GetInstance().ATCReportErrMessage("E19002", {"filepath", "size"}, + {directory_path, std::to_string(MMPA_MAX_PATH)}); GELOGW("Path[%s] len is too long, it must be less than %d", directory_path.c_str(), MMPA_MAX_PATH); return -1; } @@ -224,8 +226,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY int CreateDirectory(const std:: if (ret != 0) { if (errno != EEXIST) { ErrorManager::GetInstance().ATCReportErrMessage("E19006", {"path"}, {directory_path}); - GELOGW("Can not create directory %s. Make sure the directory exists and writable.", - directory_path.c_str()); + GELOGW("Can not create directory %s. Make sure the directory exists and writable.", directory_path.c_str()); return ret; } } @@ -265,7 +266,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY bool ReadProtoFromText(const ch std::string real_path = RealPath(file); GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(real_path.empty(), ErrorManager::GetInstance().ATCReportErrMessage( - "E19000", {"path", "errmsg"}, {file, strerror(errno)}); + "E19000", {"path", "errmsg"}, {file, strerror(errno)}); return false, "Path[%s]'s realpath is empty, errmsg[%s]", file, strerror(errno)); GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(GetFileLength(real_path) == -1, return false, "file size not valid."); @@ -301,13 +302,13 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY bool ReadProtoFromMem(const cha google::protobuf::io::IstreamInputStream input(&fs); bool ret = google::protobuf::TextFormat::Parse(&input, message); GE_IF_BOOL_EXEC( - !ret, GELOGE(ret, "Call [google::protobuf::TextFormat::Parse] func ret fail, please check your text file.")); + !ret, GELOGE(ret, "Call [google::protobuf::TextFormat::Parse] func ret fail, please check your text file.")); return ret; } FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY uint64_t GetCurrentTimestamp() { - mmTimeval tv {}; + mmTimeval tv{}; int ret = mmGetTimeOfDay(&tv, nullptr); GE_LOGE_IF(ret != EN_OK, "Func gettimeofday may failed: ret=%d", ret); auto total_use_time = tv.tv_usec + 
tv.tv_sec * 1000000; // 1000000: seconds to microseconds @@ -315,7 +316,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY uint64_t GetCurrentTimestamp() } FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY uint32_t GetCurrentSecondTimestap() { - mmTimeval tv {}; + mmTimeval tv{}; int ret = mmGetTimeOfDay(&tv, nullptr); GE_LOGE_IF(ret != EN_OK, "Func gettimeofday may failed: ret=%d", ret); auto total_use_time = tv.tv_sec; // seconds @@ -350,8 +351,9 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY bool CheckInt64MulOverflow(int6 FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY std::string RealPath(const char *path) { GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(path == nullptr, return "", "path pointer is NULL."); GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(strlen(path) >= MMPA_MAX_PATH, - ErrorManager::GetInstance().ATCReportErrMessage("E19002", {"filepath", "size"}, {path, std::to_string(MMPA_MAX_PATH)}); - return "", "Path[%s] len is too long, it must be less than %d", path, MMPA_MAX_PATH); + ErrorManager::GetInstance().ATCReportErrMessage("E19002", {"filepath", "size"}, + {path, std::to_string(MMPA_MAX_PATH)}); + return "", "Path[%s] len is too long, it must be less than %d", path, MMPA_MAX_PATH); // Nullptr is returned when the path does not exist or there is no permission // Return absolute path when path is accessible @@ -385,16 +387,16 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY bool CheckInputPathValid(const // Path section: Support upper and lower case letters, numbers dots(.) chinese and underscores // File name section: Support upper and lower case letters, numbers, underscores chinese and dots(.) 
#ifdef __GNUC__ - std::string mode = "^[\u4e00-\u9fa5A-Za-z0-9./_-]+$"; + std::string mode = "^[\u4e00-\u9fa5A-Za-z0-9./_-]+$"; #else - std::string mode = "^[a-zA-Z]:([\\\\/][^\\s\\\\/:*?<>\"|][^\\\\/:*?<>\"|]*)*([/\\\\][^\\s\\\\/:*?<>\"|])?$"; + std::string mode = "^[a-zA-Z]:([\\\\/][^\\s\\\\/:*?<>\"|][^\\\\/:*?<>\"|]*)*([/\\\\][^\\s\\\\/:*?<>\"|])?$"; #endif GE_CHK_BOOL_TRUE_EXEC_WITH_LOG( - !ValidateStr(real_path, mode), - ErrorManager::GetInstance().ATCReportErrMessage("E10001", {"parameter", "value", "reason"}, - {atc_param, real_path, kPathValidReason}); - return false, "Invalid value for %s[%s], %s.", atc_param.c_str(), real_path.c_str(), kPathValidReason); + !ValidateStr(real_path, mode), + ErrorManager::GetInstance().ATCReportErrMessage("E10001", {"parameter", "value", "reason"}, + {atc_param, real_path, kPathValidReason}); + return false, "Invalid value for %s[%s], %s.", atc_param.c_str(), real_path.c_str(), kPathValidReason); // The absolute path points to a file that is not readable if (mmAccess2(real_path.c_str(), M_R_OK) != EN_OK) { @@ -416,24 +418,25 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY bool CheckOutputPathValid(const } GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(strlen(file_path.c_str()) >= MMPA_MAX_PATH, - ErrorManager::GetInstance().ATCReportErrMessage( - "E19002", {"filepath", "size"}, {file_path, std::to_string(MMPA_MAX_PATH)}); - return "", "Path[%s] len is too long, it must be less than %d", file_path.c_str(), MMPA_MAX_PATH); + ErrorManager::GetInstance().ATCReportErrMessage( + "E19002", {"filepath", "size"}, {file_path, std::to_string(MMPA_MAX_PATH)}); + return "", "Path[%s] len is too long, it must be less than %d", file_path.c_str(), + MMPA_MAX_PATH); // A regular matching expression to verify the validity of the input file path // Path section: Support upper and lower case letters, numbers dots(.) chinese and underscores // File name section: Support upper and lower case letters, numbers, underscores chinese and dots(.) 
#ifdef __GNUC__ - std::string mode = "^[\u4e00-\u9fa5A-Za-z0-9./_-]+$"; + std::string mode = "^[\u4e00-\u9fa5A-Za-z0-9./_-]+$"; #else - std::string mode = "^[a-zA-Z]:([\\\\/][^\\s\\\\/:*?<>\"|][^\\\\/:*?<>\"|]*)*([/\\\\][^\\s\\\\/:*?<>\"|])?$"; + std::string mode = "^[a-zA-Z]:([\\\\/][^\\s\\\\/:*?<>\"|][^\\\\/:*?<>\"|]*)*([/\\\\][^\\s\\\\/:*?<>\"|])?$"; #endif GE_CHK_BOOL_TRUE_EXEC_WITH_LOG( - !ValidateStr(file_path, mode), - ErrorManager::GetInstance().ATCReportErrMessage("E10001", {"parameter", "value", "reason"}, - {atc_param, file_path, kPathValidReason}); - return false, "Invalid value for %s[%s], %s.", atc_param.c_str(), file_path.c_str(), kPathValidReason); + !ValidateStr(file_path, mode), + ErrorManager::GetInstance().ATCReportErrMessage("E10001", {"parameter", "value", "reason"}, + {atc_param, file_path, kPathValidReason}); + return false, "Invalid value for %s[%s], %s.", atc_param.c_str(), file_path.c_str(), kPathValidReason); std::string real_path = RealPath(file_path.c_str()); // Can get absolute path (file exists) diff --git a/ge/executor/CMakeLists.txt b/ge/executor/CMakeLists.txt index d7dfdc84..cc5c1710 100644 --- a/ge/executor/CMakeLists.txt +++ b/ge/executor/CMakeLists.txt @@ -17,6 +17,7 @@ set(SRC_LIST "../common/dump/dump_properties.cc" "../common/dump/dump_manager.cc" "../common/dump/dump_op.cc" + "../common/profiling/ge_profiling.cc" "../graph/load/graph_loader.cc" "../graph/execute/graph_execute.cc" "../omm/csa_interact.cc" @@ -244,7 +245,6 @@ target_link_libraries(ge_executor_shared PRIVATE mmpa graph register - msprof error_manager ascend_hal_stub ascend_protobuf diff --git a/ge/executor/ge_executor.cc b/ge/executor/ge_executor.cc index d03a8d7b..57ab7800 100755 --- a/ge/executor/ge_executor.cc +++ b/ge/executor/ge_executor.cc @@ -283,7 +283,8 @@ Status GeExecutor::Initialize() { // Start profiling Options profiling_options; profiling_options.device_id = 0; - profiling_options.job_id = ""; + // job id need to be set, the value is 
meaningless; + profiling_options.job_id = "1"; ProfilingManager::Instance().Init(profiling_options); isInit_ = true; @@ -303,7 +304,7 @@ Status GeExecutor::Finalize() { // Stop profiling if (ProfilingManager::Instance().ProfilingOn()) { ProfilingManager::Instance().StopProfiling(); - ProfilingManager::Instance().PluginUnInit(GE_PROFILING_MODULE); + ProfilingManager::Instance().PluginUnInit(); } GELOGI("Uninit GeExecutor over."); @@ -638,7 +639,8 @@ Status GeExecutor::UnloadModel(uint32_t model_id) { return ACL_ERROR_GE_INTERNAL_ERROR; } - std::shared_ptr hybrid_davinci_model = ModelManager::GetInstance()->GetHybridModel(model_id); + std::shared_ptr hybrid_davinci_model = + ModelManager::GetInstance()->GetHybridModel(model_id); if (hybrid_davinci_model != nullptr) { uint64_t session_id = hybrid_davinci_model->GetSessionId(); VarManagerPool::Instance().RemoveVarManager(session_id); diff --git a/ge/executor/module.mk b/ge/executor/module.mk index 9566ca64..34c2a37e 100644 --- a/ge/executor/module.mk +++ b/ge/executor/module.mk @@ -8,6 +8,7 @@ local_ge_executor_src_files := \ ../common/dump/dump_op.cc \ ../common/ge/plugin_manager.cc \ ../common/ge/op_tiling_manager.cc \ + ../common/profiling/ge_profiling.cc \ ../graph/load/graph_loader.cc \ ../graph/execute/graph_execute.cc \ ../omm/csa_interact.cc \ @@ -177,7 +178,6 @@ local_ge_executor_shared_library := \ libmmpa \ libgraph \ libregister \ - libmsprof \ liberror_manager \ local_ge_executor_ldflags := -lrt -ldl \ @@ -234,7 +234,6 @@ LOCAL_SHARED_LIBRARIES := \ libmmpa \ libgraph \ libregister \ - libmsprof \ liberror_manager \ stub/libascend_hal \ @@ -272,7 +271,6 @@ LOCAL_SHARED_LIBRARIES := \ libruntime \ libslog \ libmmpa \ - libmsprof \ LOCAL_LDFLAGS += $(local_ge_executor_ldflags) @@ -304,7 +302,6 @@ LOCAL_SHARED_LIBRARIES := \ libruntime \ libslog \ libmmpa \ - libmsprof \ ifeq ($(device_os),android) LOCAL_LDFLAGS += -ldl diff --git a/ge/ge_inference.mk b/ge/ge_inference.mk index 0987f148..80887e8b 100755 --- 
a/ge/ge_inference.mk +++ b/ge/ge_inference.mk @@ -164,6 +164,7 @@ OMG_HOST_SRC_FILES := \ host_kernels/slice_d_kernel.cc \ host_kernels/dynamic_stitch_kernel.cc \ host_kernels/identity_kernel.cc \ + host_kernels/reformat_kernel.cc \ graph/passes/stop_gradient_pass.cc \ graph/passes/prevent_gradient_pass.cc \ graph/passes/identity_pass.cc \ diff --git a/ge/ge_local_engine/engine/host_cpu_engine.cc b/ge/ge_local_engine/engine/host_cpu_engine.cc index f1e152f4..c836d4d6 100755 --- a/ge/ge_local_engine/engine/host_cpu_engine.cc +++ b/ge/ge_local_engine/engine/host_cpu_engine.cc @@ -14,7 +14,6 @@ * limitations under the License. */ #include "host_cpu_engine.h" -#include #include "graph/common/omg_util.h" #include "graph/utils/op_desc_utils.h" #include "graph/utils/tensor_adapter.h" @@ -96,8 +95,8 @@ Status GetDataNumber(const GeTensorDesc &out_desc, uint64_t &data_num) { void HostCpuEngine::CloseSo() { for (auto handle : lib_handles_) { - if (dlclose(handle) != 0) { - GELOGW("failed to close handle, message: %s", dlerror()); + if (mmDlclose(handle) != 0) { + GELOGW("failed to close handle, message: %s", mmDlerror()); } } lib_handles_.clear(); @@ -323,13 +322,13 @@ Status HostCpuEngine::LoadLibs(std::vector &lib_paths) { Status HostCpuEngine::LoadLib(const std::string &lib_path) { GELOGI("To invoke dlopen on lib: %s", lib_path.c_str()); - auto handle = dlopen(lib_path.c_str(), RTLD_NOW | RTLD_GLOBAL); + auto handle = mmDlopen(lib_path.c_str(), MMPA_RTLD_NOW | MMPA_RTLD_GLOBAL); if (handle == nullptr) { - GELOGE(INTERNAL_ERROR, "Failed to invoke dlopen. path = %s, error = %s", lib_path.c_str(), dlerror()); + GELOGE(INTERNAL_ERROR, "Failed to invoke dlopen. 
path = %s, error = %s", lib_path.c_str(), mmDlerror()); return INTERNAL_ERROR; } - auto initialize = (Status (*)(const HostCpuContext &))dlsym(handle, "Initialize"); + auto initialize = (Status (*)(const HostCpuContext &))mmDlsym(handle, "Initialize"); if (initialize != nullptr) { GELOGI("Invoke function Initialize in lib: %s", lib_path.c_str()); if (initialize(HostCpuContext()) != SUCCESS) { diff --git a/ge/ge_runner.mk b/ge/ge_runner.mk index a2679ed1..c0f59320 100644 --- a/ge/ge_runner.mk +++ b/ge/ge_runner.mk @@ -29,6 +29,8 @@ LIBGE_LOCAL_SRC_FILES := \ common/dump/dump_manager.cc \ common/dump/dump_properties.cc \ common/dump/dump_op.cc \ + common/profiling/ge_profiling.cc \ + common/profiling/ge_runner_profiling.cc \ engine_manager/dnnengine_manager.cc \ ge_local_engine/engine/host_cpu_engine.cc \ generator/ge_generator.cc \ @@ -170,6 +172,7 @@ LIBGE_LOCAL_SRC_FILES := \ host_kernels/sub_kernel.cc \ host_kernels/transdata_kernel.cc \ host_kernels/unpack_kernel.cc \ + host_kernels/reformat_kernel.cc \ graph/passes/folding_pass.cc \ graph/passes/get_original_format_pass.cc \ graph/passes/guarantee_const_pass.cc \ @@ -306,7 +309,6 @@ LIBGE_LOCAL_SRC_FILES := \ LIBCLIENT_LOCAL_SRC_FILES := \ proto/ge_api.proto \ client/ge_api.cc \ - client/ge_prof.cc \ RUNNER_LOCAL_C_INCLUDES := \ $(LOCAL_PATH) ./ \ @@ -371,7 +373,7 @@ LOCAL_SRC_FILES += $(LIBCLIENT_LOCAL_SRC_FILES) LOCAL_STATIC_LIBRARIES := libge_memory \ libadump_server \ - libmsprofiler \ + libmsprofiler_fwk \ libmmpa \ LOCAL_SHARED_LIBRARIES := \ @@ -381,7 +383,6 @@ LOCAL_SHARED_LIBRARIES := \ libgraph \ libregister \ libge_common \ - libmsprof \ liberror_manager \ LOCAL_LDFLAGS := -lrt -ldl @@ -408,7 +409,6 @@ endif LOCAL_C_INCLUDES := $(RUNNER_LOCAL_C_INCLUDES) LOCAL_SRC_FILES := ../../out/ge/lib64/stub/ge_api.cc \ - ../../out/ge/lib64/stub/ge_prof.cc \ ../../out/ge/lib64/stub/ge_ir_build.cc \ LOCAL_SHARED_LIBRARIES := @@ -464,7 +464,6 @@ LOCAL_SHARED_LIBRARIES := \ libc_sec \ libslog \ libmmpa \ - 
libmsprof \ LOCAL_LDFLAGS := -lrt -ldl @@ -497,7 +496,6 @@ LOCAL_SHARED_LIBRARIES := \ libc_sec \ libslog \ libmmpa \ - libmsprof \ LOCAL_LDFLAGS := -lrt -ldl diff --git a/ge/ge_runtime/runtime_model.cc b/ge/ge_runtime/runtime_model.cc index fb0f3e85..8baa5b05 100644 --- a/ge/ge_runtime/runtime_model.cc +++ b/ge/ge_runtime/runtime_model.cc @@ -28,6 +28,7 @@ namespace ge { namespace model_runner { +const int kOffsetUnit = 8; RuntimeModel::~RuntimeModel() { GELOGI("RuntimeModel destructor start"); @@ -495,7 +496,7 @@ bool RuntimeModel::InitConstantInfo(std::shared_ptr &davinci_model return false; } uint64_t *buff = reinterpret_cast(const_cast(constant->weight_data.data())); - int64_t offset = elem_num * 8; + int64_t offset = elem_num * kOffsetUnit; uintptr_t hbm_raw_data_base_addr = reinterpret_cast(constant->output_addrs[0]) + offset; for (int64_t i = elem_num - 1; i >= 0; --i) { buff[i] = hbm_raw_data_base_addr + (buff[i] - buff[0]); diff --git a/ge/generator/ge_generator.cc b/ge/generator/ge_generator.cc index 16d63f6b..e50feae2 100644 --- a/ge/generator/ge_generator.cc +++ b/ge/generator/ge_generator.cc @@ -156,7 +156,12 @@ static Status AddInputs(const ComputeGraphPtr &graph, const NodePtr &node, GeTen } string op_type; - if (!AttrUtils::GetStr(tensor, kAttrOpType, op_type) || op_type.empty()) { + bool is_const = false; + (void)AttrUtils::GetBool(tensor, CONST_ATTR_NAME_INPUT, is_const); + if (is_const) { + GELOGD("Get input[%d] is const", index); + op_type = CONSTANTOP; + } else if (!AttrUtils::GetStr(tensor, kAttrOpType, op_type) || op_type.empty()) { op_type = DATA; } @@ -165,6 +170,18 @@ static Status AddInputs(const ComputeGraphPtr &graph, const NodePtr &node, GeTen if (data_op == nullptr) { return FAILED; } + if (is_const) { + ConstGeTensorPtr tensor_value; + if (!AttrUtils::GetTensor(tensor, ge::ATTR_NAME_WEIGHTS, tensor_value)) { + GELOGE(FAILED, "Get value failed, node name:%s.", tensor.GetName().c_str()); + return FAILED; + } + if 
(!AttrUtils::SetTensor(data_op, ge::ATTR_NAME_WEIGHTS, tensor_value)) { + GELOGE(FAILED, "Set attr ATTR_NAME_WEIGHTS fail."); + return FAILED; + } + } + (void)AttrUtils::SetBool(data_op, "_is_single_op", true); GE_CHK_BOOL_EXEC(data_op->AddInputDesc(tensor) == GRAPH_SUCCESS, return FAILED, "Add input desc fail."); @@ -240,6 +257,8 @@ class GeGenerator::Impl { Status SaveModel(const string &file_name_prefix, GeModelPtr &models, ModelBufferData &model); + Status SaveRootModel(const string &file_name_prefix, GeRootModelPtr &model, ModelBufferData &model_buff); + Status SaveParams(GeModelPtr &ge_model, const string &type, const map &attrs, const vector &inputs, const vector &outputs); @@ -505,19 +524,7 @@ Status GeGenerator::GenerateModel(const Graph &graph, const string &file_name_pr GE_CHECK_NOTNULL(ge_root_model); GE_CHECK_NOTNULL(ge_root_model->GetRootGraph()); - ModelHelper model_helper; - string model_name = ""; - Status name_ret = model_helper.GetModelNameFromMergedGraphName(ge_root_model->GetRootGraph()->GetName(), model_name); - if (name_ret != SUCCESS) { - ErrorManager::GetInstance().ATCReportErrMessage("E10000", {"parameter"}, {"output"}); - GELOGE(FAILED, "Get model_name failed. 
Param --output is invalid"); - return PARAM_INVALID; - } - map name_to_ge_model = ge_root_model->GetSubgraphInstanceNameToModel(); - GeModelPtr &ge_model = name_to_ge_model[ge_root_model->GetRootGraph()->GetName()]; - GE_RETURN_WITH_LOG_IF_FALSE(ge_model != nullptr, "ge_model can not be null"); - ge_model->SetName(model_name); - ret = impl_->SaveModel(file_name_prefix, ge_model, model); + ret = impl_->SaveRootModel(file_name_prefix, ge_root_model, model); if (ret != SUCCESS) { GELOGE(ret, "Save model failed"); if (impl_->graph_manager_.Finalize() != SUCCESS) { @@ -567,6 +574,9 @@ Status GeGenerator::CheckForSingleOp(OpDescPtr &op_desc, const vector Status GeGenerator::BuildSingleOp(OpDescPtr &op_desc, const vector &inputs, const vector &outputs, const string &model_file_name, OpEngineType engine_type, ModelBufferData &model_buff, bool is_offline) { + if (!is_offline) { + (void)AttrUtils::SetBool(op_desc, ATTR_DYNAMIC_SHAPE_SINGLE_AICPU, true); + } if (CheckForSingleOp(op_desc, inputs, outputs) != SUCCESS) { GELOGE(PARAM_INVALID, "input param is invalid when build single op!"); @@ -594,40 +604,11 @@ Status GeGenerator::BuildSingleOp(OpDescPtr &op_desc, const vector &in // 2. Create ComputeGraph. string name = ge::CurrentTimeInStr() + "_" + model_file_name; - ge::ComputeGraphPtr compute_graph = MakeShared(name); - GE_CHECK_NOTNULL_EXEC(compute_graph, return INTERNAL_ERROR); - - // 3. Add Node to ComputeGraph. - NodePtr op_node = compute_graph->AddNode(op_desc); - GE_CHECK_NOTNULL_EXEC(op_node, return INTERNAL_ERROR); - - // 4. Create InputData node. 
- int32_t arg_index = 0; - if (inputs.empty()) { - for (const auto &input_desc : op_desc->GetAllInputsDescPtr()) { - GE_CHECK_NOTNULL_EXEC(input_desc, return INTERNAL_ERROR); - if (!IsNeedConnectInputOpForSingleOp(*input_desc)) { - continue; - } - GE_CHK_STATUS_RET_NOLOG(AddInputs(compute_graph, op_node, *input_desc, arg_index, false)); - arg_index++; - } - } else { - for (const auto &in_desc : inputs) { - GeTensorDesc input_desc = in_desc.GetTensorDesc(); - GE_CHK_STATUS_RET_NOLOG(AddInputs(compute_graph, op_node, input_desc, arg_index, true)); - arg_index++; - } + Graph graph; + if (BuildSingleOpGraph(op_desc, inputs, outputs, name, graph) != ge::SUCCESS) { + GELOGE(GRAPH_FAILED, "make graph fail."); + return GRAPH_FAILED; } - - // 5. Create Output node. - if (!outputs.empty()) { - GE_CHK_STATUS_RET_NOLOG(AddOutputs(compute_graph, op_node, outputs)); - } - - // dump ComputeGraph. - compute_graph->Dump(); - Graph graph = ge::GraphUtils::CreateGraphFromComputeGraph(compute_graph); GELOGI("ATC parser success in single op build."); GeRootModelPtr ge_root_model = nullptr; @@ -683,6 +664,46 @@ Status GeGenerator::BuildSingleOpModel(OpDescPtr &op_desc, const vector &inputs, + const vector &outputs, std::string graph_name, Graph &graph) { + ge::ComputeGraphPtr compute_graph = MakeShared(graph_name); + GE_CHECK_NOTNULL_EXEC(compute_graph, return INTERNAL_ERROR); + + // 1. Add Node to ComputeGraph. + NodePtr op_node = compute_graph->AddNode(op_desc); + GE_CHECK_NOTNULL_EXEC(op_node, return INTERNAL_ERROR); + + // 2. Create InputData node. 
+ int32_t arg_index = 0; + if (inputs.empty()) { + for (const auto &input_desc : op_desc->GetAllInputsDescPtr()) { + GE_CHECK_NOTNULL_EXEC(input_desc, return INTERNAL_ERROR); + if (!IsNeedConnectInputOpForSingleOp(*input_desc)) { + continue; + } + GE_CHK_STATUS_RET_NOLOG(AddInputs(compute_graph, op_node, *input_desc, arg_index, false)); + arg_index++; + } + } else { + for (const auto &in_desc : inputs) { + GeTensorDesc input_desc = in_desc.GetTensorDesc(); + GE_CHK_STATUS_RET_NOLOG(AddInputs(compute_graph, op_node, input_desc, arg_index, true)); + arg_index++; + } + } + + // 3. Create Output node. + if (!outputs.empty()) { + GE_CHK_STATUS_RET_NOLOG(AddOutputs(compute_graph, op_node, outputs)); + } + + // dump ComputeGraph node. + compute_graph->Dump(); + graph = ge::GraphUtils::CreateGraphFromComputeGraph(compute_graph); + + return SUCCESS; +} + Status GeGenerator::Impl::SaveParams(GeModelPtr &ge_model, const string &type, const map &attrs, const vector &inputs, const vector &outputs) { GE_CHECK_NOTNULL_EXEC(ge_model, return PARAM_INVALID); @@ -712,6 +733,44 @@ Status GeGenerator::Impl::SaveModel(const string &file_name_prefix, GeModelPtr & return SUCCESS; } +Status GeGenerator::Impl::SaveRootModel(const string &file_name_prefix, GeRootModelPtr &ge_root_model, + ModelBufferData &model_buff) { + bool is_unknown_shape = false; + auto ret = ge_root_model->CheckIsUnknownShape(is_unknown_shape); + if (ret != SUCCESS) { + GELOGE(FAILED, "Check root model is unkonwn shape failed"); + return FAILED; + } + GELOGD("begin save root model, cur model is unkonwn shape model ? 
: %d", is_unknown_shape); + GE_CHK_BOOL_EXEC(!ge_root_model->GetSubgraphInstanceNameToModel().empty(), return FAILED, + "ge root model has no sub model") + GeModelPtr model_root = nullptr; + if (is_unknown_shape) { + model_root = make_shared(); + model_root->SetGraph(GraphUtils::CreateGraphFromComputeGraph(ge_root_model->GetRootGraph())); + ge_root_model->SetSubgraphInstanceNameToModel(ge_root_model->GetRootGraph()->GetName(), model_root); + model_root->SetName(ge_root_model->GetRootGraph()->GetName()); + } else { + model_root = ge_root_model->GetSubgraphInstanceNameToModel().begin()->second; + } + // set atc version + if (!SetAtcVersionInfo(*(model_root.get()))) { + GELOGW("SetPackageVersionInfo of atc failed!"); + } + // set opp version + if (!SetOppVersionInfo(*(model_root.get()))) { + GELOGW("SetPackageVersionInfo of ops failed!"); + } + ModelHelper model_helper; + model_helper.SetSaveMode(is_offline_); + ret = model_helper.SaveToOmRootModel(ge_root_model, save_param_, file_name_prefix, model_buff, is_unknown_shape); + if (ret != SUCCESS) { + GELOGE(ret, "Save to om model failed"); + return ret; + } + return SUCCESS; +} + Status GeGenerator::Impl::BuildModel(const Graph &graph, const vector &inputs, GeRootModelPtr &ge_root_model) { static std::atomic atomic_graph_id(0); diff --git a/ge/graph/build/graph_builder.cc b/ge/graph/build/graph_builder.cc index bdb02b3a..87d2a206 100644 --- a/ge/graph/build/graph_builder.cc +++ b/ge/graph/build/graph_builder.cc @@ -349,7 +349,8 @@ static Status GenerateTaskForConstant(const std::shared_ptr &graph GELOGD("Insert MemcpyAsync node between %s and %s.", in_node->GetName().c_str(), node->GetName().c_str()); std::string name = node->GetName() + "_input_" + std::to_string(in_data_anchor->GetIdx()) + "_Memcpy"; if (InsertMemcpyNode(graph, peer_out_anchor, {in_data_anchor}, name) != SUCCESS) { - GELOGE(FAILED, "Insert memcpy between %s and %s failed.", in_node->GetName().c_str(), node->GetName().c_str()); + GELOGE(FAILED, 
"Insert memcpy between %s and %s failed.", + in_node->GetName().c_str(), node->GetName().c_str()); return FAILED; } } @@ -475,7 +476,7 @@ Status GraphBuilder::GetTaskInfo(const ge::ModelBuilder &builder, const ModelPtr } Status GraphBuilder::SetInputSize(const ge::NodePtr &node_ptr) { - // set input_desc.size = src_node.output_desc.size + // Set the size of input_desc to 'src_node.output_desc.size' if (node_ptr->GetType() == DATA) { bool is_unknown_shape = false; GE_CHK_STATUS_RET(ge::NodeUtils::GetNodeUnknownShapeStatus(*node_ptr, is_unknown_shape), @@ -498,7 +499,7 @@ Status GraphBuilder::SetInputSize(const ge::NodePtr &node_ptr) { GE_IF_BOOL_EXEC(src_op == nullptr, continue); auto node_op_desc = node_ptr->GetOpDesc(); GE_IF_BOOL_EXEC(node_op_desc == nullptr, continue); - // set dst_node.input_desc = src_node.output_desc + // Set the input_desc of dst_node to 'src_node.output_desc' auto output_desc = src_op->GetOutputDescPtr(peer_out_anchor->GetIdx()); int64_t size = 0; GE_IF_BOOL_EXEC(ge::TensorUtils::GetSize(*output_desc, size) != SUCCESS, GELOGI("Get size failed!")); @@ -512,7 +513,6 @@ Status GraphBuilder::SetInputSize(const ge::NodePtr &node_ptr) { auto input_desc = node_op_desc->MutableInputDesc(in_data_anchor->GetIdx()); GE_CHECK_NOTNULL(input_desc); (void) ge::TensorUtils::SetSize(*input_desc, size); - GE_CHK_STATUS_RET(node_op_desc->UpdateInputDesc(in_data_anchor->GetIdx(), *input_desc)); GELOGD("%s input desc, dim_size: %zu, mem_size: %ld, format: %s, type: %s.", node_ptr->GetName().c_str(), input_desc->GetShape().GetDimNum(), size, TypeUtils::FormatToSerialString(input_desc->GetFormat()).c_str(), TypeUtils::DataTypeToSerialString(input_desc->GetDataType()).c_str()); diff --git a/ge/graph/build/memory/binary_block_mem_assigner.cc b/ge/graph/build/memory/binary_block_mem_assigner.cc index ecd2488c..fff589f3 100644 --- a/ge/graph/build/memory/binary_block_mem_assigner.cc +++ b/ge/graph/build/memory/binary_block_mem_assigner.cc @@ -21,8 +21,8 @@ namespace 
{ const uint32_t kRangeCeilInterval = 2; const uint32_t kLogBase = 2; -const int64_t kLargeBlockSize = 8 * 1024 * 1024; -const int64_t kLargeBlockRangeSize = 10; +const int64_t kLargeBlockSize = 8388608; // 8 * 1024 * 1024 +const int64_t kLargeBlockRangeSize = 2; } // namespace namespace ge { @@ -73,15 +73,17 @@ Status BinaryBlockMemAssigner::GetMemoryRanges(vector &range_ceils) { GELOGE(FAILED, "dividend is 0!"); return FAILED; } + // Memory size is 512 aligned, so it is not necessary to take less than 512 + int64_t min_memory_size = (all_memory_size.back() > MEM_ALIGN_SIZE) ? MEM_ALIGN_SIZE : all_memory_size.front(); auto range_number = static_cast( - ceil(log(all_memory_size.back() / static_cast(all_memory_size.front())) / log(kLogBase))); + ceil(log(all_memory_size.back() / static_cast(min_memory_size)) / log(kLogBase))); range_number = (range_number == 0) ? 1 : range_number; GELOGD("Range number: %zu", range_number); vector> ranges(range_number); GE_CHK_BOOL_EXEC((range_number != 0), return PARAM_INVALID, "range_number can't be 0."); size_t range_number_limit = all_memory_size.size() / range_number; - int64_t range_ceil = all_memory_size[0]; + int64_t range_ceil = min_memory_size; for (size_t i = 1; i <= range_number; i++) { GE_IF_BOOL_EXEC(TypeUtils::CheckUint64MulOverflow(static_cast(range_ceil), kRangeCeilInterval), GELOGE(FAILED, "Multiply result is out of range."); @@ -114,7 +116,7 @@ Status BinaryBlockMemAssigner::GetMemoryRanges(vector &range_ceils) { range_ceils.push_back(range.back()); } } - GELOGD("Range ceils: %s", ToString(range_ceils).c_str()); + GELOGI("Range ceils: %s", ToString(range_ceils).c_str()); return SUCCESS; } diff --git a/ge/graph/build/memory/block_mem_assigner.cc b/ge/graph/build/memory/block_mem_assigner.cc index 00f47573..9dc0cf73 100755 --- a/ge/graph/build/memory/block_mem_assigner.cc +++ b/ge/graph/build/memory/block_mem_assigner.cc @@ -65,6 +65,98 @@ void AlignMemOffset(size_t &mem_align_size) { mem_align_size = (mem_align_size 
+ MEM_ALIGN_SIZE - 1) / MEM_ALIGN_SIZE * MEM_ALIGN_SIZE; } +static bool CompareLifeTime(const NodeTypeIndex &left, const NodeTypeIndex &right) { + auto left_node_op_desc = left.node->GetOpDesc(); + auto right_node_op_desc = right.node->GetOpDesc(); + if ((left_node_op_desc != nullptr) && (right_node_op_desc != nullptr) + && (left_node_op_desc->GetId() < right_node_op_desc->GetId())) { + return true; + } + return false; +} + +void GetLifeList(const MemoryBlock &block, std::vector &life_list, bool child) { + for (auto &node : block.NodeTypeIndexList()) { + life_list.emplace_back(node); + } + + if (child) { + for (auto child_block : block.ChildBlockList()) { + if (child_block == nullptr) { + continue; + } + if (block.stream_id_ != child_block->stream_id_ || !block.same_stream_ || !child_block->same_stream_) { + life_list.clear(); + return; + } + GetLifeList(*child_block, life_list, child); + } + } +} + +bool CrossLifeTime(const NodeTypeIndex &left, const NodeTypeIndex &right) { + if ((left.node == nullptr) || (right.node == nullptr)) { + return true; + } + auto left_node_op_desc = left.node->GetOpDesc(); + auto right_node_op_desc = right.node->GetOpDesc(); + if ((left_node_op_desc != nullptr) && (right_node_op_desc != nullptr)) { + if (left_node_op_desc->GetId() < right_node_op_desc->GetId()) { + if (left.life_time_end >= static_cast(right_node_op_desc->GetId())) { + return true; + } + } else if (left_node_op_desc->GetId() == right_node_op_desc->GetId()) { + return true; + } else { + if (right.life_time_end >= static_cast(left_node_op_desc->GetId())) { + return true; + } + } + } + return false; +} + +/// +/// When child block's life time are not cross with parent block, they can be reused(only same stream). 
+/// |-----------------------------parent block---------------------| +/// |------child block1--------------||------child block2------| +/// |--child block1-1-| +/// +bool CanIntervalLifeReuse(MemoryBlock &parent_block, MemoryBlock &child_block) { + // judge by interval life time, only same stream can be judged by interval life time + if (parent_block.stream_id_ != child_block.stream_id_ || !parent_block.same_stream_ || !child_block.same_stream_ + || parent_block.NodeTypeIndexList().empty() || child_block.NodeTypeIndexList().empty()) { + return false; + } + + // quick judge by front and back node + if (CrossLifeTime(parent_block.NodeTypeIndexList().front(), child_block.NodeTypeIndexList().front())) { + return false; + } + if (CrossLifeTime(parent_block.NodeTypeIndexList().back(), child_block.NodeTypeIndexList().back())) { + return false; + } + + std::vector life_list; + GetLifeList(parent_block, life_list, false); + GetLifeList(child_block, life_list, true); + if (life_list.empty()) { + return false; + } + std::sort(life_list.begin(), life_list.end(), CompareLifeTime); + size_t pre_life_end = 0; + for (auto &node : life_list) { + auto node_op_desc = node.node->GetOpDesc(); + if (node_op_desc != nullptr && pre_life_end >= static_cast(node_op_desc->GetId())) { + // life time cross + return false; + } + pre_life_end = node.life_time_end; + } + GELOGI("Block size[%zu, %zu] life time are not cross.", parent_block.Size(), child_block.Size()); + return true; +} + void MemoryBlock::SetHeadOffset(size_t offset) { head_offset_ = offset; size_t child_offset = head_offset_; @@ -125,20 +217,12 @@ size_t MemoryBlock::AlignSize() const { return align_block_size; } -bool MemoryBlock::IsSameLabel(std::string &first_batch_label) { - if (node_type_index_list_.empty()) { +bool MemoryBlock::IsSameBatchLabel() { + // only same batch label can reuse + if (batch_label_.empty() || node_type_index_list_.empty()) { return false; } - auto node_op_desc = 
node_type_index_list_[0].node->GetOpDesc(); - if (node_op_desc == nullptr) { - return false; - } - // not all op has ATTR_NAME_BATCH_LABEL, no need check return value, only check out parameter - (void)ge::AttrUtils::GetStr(node_op_desc, ATTR_NAME_BATCH_LABEL, first_batch_label); - if (first_batch_label.empty()) { - return false; - } bool all_same_label = true; for (size_t index = 1; index < node_type_index_list_.size(); ++index) { if (node_type_index_list_[index].node == nullptr) { @@ -147,8 +231,9 @@ bool MemoryBlock::IsSameLabel(std::string &first_batch_label) { std::string batch_label; auto index_op_desc = node_type_index_list_[index].node->GetOpDesc(); GE_IF_BOOL_EXEC(index_op_desc == nullptr, continue); + // not all op has ATTR_NAME_BATCH_LABEL, no need check return value, only check out parameter (void)ge::AttrUtils::GetStr(index_op_desc, ATTR_NAME_BATCH_LABEL, batch_label); - if (first_batch_label != batch_label) { + if (batch_label_ != batch_label) { all_same_label = false; break; } @@ -197,7 +282,7 @@ void MemoryBlock::AddContinuousLifeReuseBlock(MemoryBlock *block, DependStreamLi } void MemoryBlock::AddLifeReuseBlock(MemoryBlock *block, DependStreamLife &total_node_depend_stream_life) { - if (CanNotLifeReuse(this) || CanNotLifeReuse(block)) { + if (CanNotLifeReuse(this) || CanNotLifeReuse(block) || (batch_label_ != block->batch_label_)) { return; } if (block->continuous_block_) { @@ -207,16 +292,27 @@ void MemoryBlock::AddLifeReuseBlock(MemoryBlock *block, DependStreamLife &total_ MemoryBlock *parent = nullptr; MemoryBlock *child = nullptr; // merge small block to large block - if (block->GetDependLifeBegin(stream_id_, total_node_depend_stream_life) > GetLifeEnd()) { - if ((child_offset_ + block->AlignSize()) <= AlignSize()) { - parent = this; - child = block; - } else if ((block->child_offset_ + AlignSize()) <= block->AlignSize()) { - parent = block; - child = this; + // noalign size 802816 + 802816 = 1605632 can reuse + // after 32 align size 802848 + 
802848 > 1605664 can't reuse + // after 512 align size 803328 + 803328 > 1606144 can't reuse + // so 803328 + 803328 = 1606144 + 512 can reuse + if ((child_offset_ + block->AlignSize()) <= (AlignSize() + MEM_ALIGN_SIZE)) { + parent = this; + child = block; + } else if ((block->child_offset_ + AlignSize()) <= (block->AlignSize() + MEM_ALIGN_SIZE)) { + parent = block; + child = this; + } + + if ((parent != nullptr) && (child != nullptr)) { + // Different streams must use stream dependency to judge the life cycle + // In case same stream if it has child block, can judge all the child block's life time in CanIntervalLifeReuse + bool can_block_life_reuse = (child->child_blocks_.empty() + && (block->GetDependLifeBegin(stream_id_, total_node_depend_stream_life) > GetLifeEnd())); + if (!can_block_life_reuse && !CanIntervalLifeReuse(*parent, *child)) { + return; } - } - if ((parent != nullptr) && (child != nullptr) && child->child_blocks_.empty()) { + parent->child_blocks_.emplace_back(child); parent->child_offset_ += child->AlignSize(); child->deleted_block_ = true; @@ -261,6 +357,7 @@ size_t MemoryBlock::GetDependLifeBegin(int64_t stream_id, DependStreamLife &tota void AddDependLife(const ge::NodePtr &org_node, const ge::NodePtr &node, int64_t stream_id, std::map &depend_stream_life, DependStreamLife &total_node_depend_stream_life) { GE_CHECK_NOTNULL_EXEC(node, return); + GE_CHECK_NOTNULL_EXEC(org_node, return); auto node_desc = node->GetOpDesc(); GE_CHECK_NOTNULL_EXEC(node_desc, return); auto node_id = node_desc->GetId(); @@ -415,12 +512,60 @@ BlockMemAssigner::~BlockMemAssigner() { } } +void GetMaxBatchAllMemorySize(std::map> &batch_all_memory_size, + std::map batch_total_size, vector &all_memory_size, + std::string &max_batch_label) { + // use max batch all memory size for reuse range + int64_t max_batch_size = 0; + for (const auto &it : batch_total_size) { + GELOGI("Batch[%s] total memory size[%ld]", it.first.c_str(), it.second); + // no batch label + if 
(it.first.empty()) { + continue; + } + if (it.second > max_batch_size) { + max_batch_size = it.second; + max_batch_label = it.first; + } + } + GELOGI("Max batch[%s] total memory size[%ld]", max_batch_label.c_str(), max_batch_size); + + for (const auto &it : batch_all_memory_size) { + if (it.first.empty() || (it.first == max_batch_label)) { + all_memory_size.insert(all_memory_size.end(), it.second.begin(), it.second.end()); + } + } + // all_memory_size can't be empty + if (all_memory_size.empty()) { + all_memory_size.emplace_back(MEM_ALIGN_SIZE); + } + sort(all_memory_size.begin(), all_memory_size.end()); + GELOGD("All memory size: %s", ToString(all_memory_size).c_str()); + + for (auto iter = all_memory_size.begin(); iter != all_memory_size.end();) { + if (*iter == 0) { + iter = all_memory_size.erase(iter); + } else { + ++iter; + } + } +} + void BlockMemAssigner::GetOutAndWorkSpaceMem(vector &all_memory_size) { vector temp; + std::map> batch_all_memory_size; + std::map batch_total_size; for (const NodePtr &n : compute_graph_->GetAllNodes()) { auto node_op_desc = n->GetOpDesc(); GE_IF_BOOL_EXEC(node_op_desc == nullptr, continue); + if (CheckIsZeroMemNodeType(node_op_desc->GetType())) { + continue; + } + + std::string batch_label; + (void)ge::AttrUtils::GetStr(node_op_desc, ATTR_NAME_BATCH_LABEL, batch_label); + if (node_op_desc->GetType() == ATOMICADDRCLEAN) { atomic_addr_clean_id_ = node_op_desc->GetId(); } @@ -434,9 +579,14 @@ void BlockMemAssigner::GetOutAndWorkSpaceMem(vector &all_memory_size) { if (!reuse_input) { int64_t size = 0; GE_IF_BOOL_EXEC(ge::TensorUtils::GetSize(output_desc, size) != SUCCESS, GELOGI("Get size failed")); - if (anchor_to_symbol_.empty()) { - all_memory_size.emplace_back(size); + batch_all_memory_size[batch_label].emplace_back(size); + if (batch_total_size.find(batch_label) == batch_total_size.end()) { + batch_total_size[batch_label] = size; } else { + batch_total_size[batch_label] += size; + } + + if (!anchor_to_symbol_.empty()) { auto 
iter1 = anchor_to_symbol_.find(NodeIndexIO(n, out_anchor->GetIdx(), kOut).ToString()); if (iter1 == anchor_to_symbol_.end()) { continue; @@ -452,23 +602,11 @@ void BlockMemAssigner::GetOutAndWorkSpaceMem(vector &all_memory_size) { } } temp.clear(); - GetNodeWorkSpaceSize(n, temp); - all_memory_size.insert(all_memory_size.end(), temp.begin(), temp.end()); - } - for (const auto &pair : symbol_size_) { - all_memory_size.emplace_back(pair.second); - } - sort(all_memory_size.begin(), all_memory_size.end()); - GELOGD("All memory size: %s", ToString(all_memory_size).c_str()); - - for (auto iter = all_memory_size.begin(); iter != all_memory_size.end();) { - if (*iter == 0) { - iter = all_memory_size.erase(iter); - } else { - ++iter; - } + GetNodeWorkSpaceSize(n, temp, batch_total_size[batch_label]); + batch_all_memory_size[batch_label].insert(batch_all_memory_size[batch_label].end(), temp.begin(), temp.end()); } - + GELOGI("The last atomic_addr_clean node id: %ld", atomic_addr_clean_id_); + GetMaxBatchAllMemorySize(batch_all_memory_size, batch_total_size, all_memory_size, max_batch_label_); InitReuseFlag(); PrintSymbolMap(); } @@ -529,16 +667,6 @@ bool CanReuseBySize(const map &reusable_block_counts, const Me bool can_reuse = false; if (reusable_block.Size() == block_size) { can_reuse = true; - } else { - string key = std::to_string(reusable_block.Size()); - key += "_" + std::to_string(reusable_block.stream_id_); - key += "_" + std::to_string(reusable_block.memory_type_); - auto it = reusable_block_counts.find(key); - GE_IF_BOOL_EXEC((it != reusable_block_counts.end() && (it->second > kReuseMaxCount)) && - (reusable_block.Size() > block_size), - can_reuse = true; - GELOGD("Less size mem reuse, reuse block size:%zu, current block size:%zu", - reusable_block.Size(), block_size);); } return can_reuse; } @@ -860,34 +988,35 @@ MemoryBlock *BlockMemAssigner::ApplyMemory(size_t block_size, size_t real_size, GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(n == nullptr, return nullptr, "Input 
parameter n is null."); auto node_op_desc = n->GetOpDesc(); GE_IF_BOOL_EXEC(node_op_desc == nullptr, return nullptr); + std::string batch_label; + (void)ge::AttrUtils::GetStr(node_op_desc, ATTR_NAME_BATCH_LABEL, batch_label); + if (batch_label.empty() || (batch_label == max_batch_label_)) { + size_t align_size = real_size; + AlignMemOffset(align_size); + theory_memory_size_ += align_size; + if (theory_memory_size_ > theory_min_memory_size_) { + theory_min_memory_size_ = theory_memory_size_; + } + } bool is_reuse_memory = false; - string ge_disable_reuse_mem_env = "0"; - (void)ge::GetContext().GetOption(OPTION_EXEC_DISABLE_REUSED_MEMORY, ge_disable_reuse_mem_env); - if (ge_disable_reuse_mem_env != "1") { + if (ge_disable_reuse_mem_env_ != "1") { bool reuse_mem_flag = (mem_type == kOutput) ? IsPreReuse(n, out_index) : !((workspace_reuse_flag.size() > out_index) && !workspace_reuse_flag[out_index]); is_reuse_memory = !node_op_desc->HasAttr(kL2FusionDynamicConvergeOp) && !node_op_desc->HasAttr(kOpNoReuseMem) && reuse_mem_flag && is_op_reuse_mem; - auto stream_id = node_op_desc->GetStreamId(); - if (is_reuse_memory && !continuous && !reusable_blocks_[memory_type].empty()) { - for (auto it = reusable_blocks_[memory_type][stream_id].begin(); - it != reusable_blocks_[memory_type][stream_id].end(); ++it) { + bool do_reuse = is_reuse_memory && !continuous && !reusable_blocks_[memory_type].empty(); + if (do_reuse) { + auto stream_id = node_op_desc->GetStreamId(); + for (auto it = reusable_blocks_[memory_type][stream_id].rbegin(); + it != reusable_blocks_[memory_type][stream_id].rend(); ++it) { MemoryBlock *reusable_block = *it; if (!IsPostReuse(reusable_block)) { reusable_block->reuse_mem_ = false; GELOGI("Unreusable block."); continue; } - std::string batch_label; - if (reusable_block->IsSameLabel(batch_label)) { - std::string op_label; - (void)ge::AttrUtils::GetStr(node_op_desc, ATTR_NAME_BATCH_LABEL, op_label); - if (batch_label != op_label) { - GELOGI("label diff, op name 
%s", node_op_desc->GetName().c_str()); - continue; - } - } + GE_IF_BOOL_EXEC(reusable_block->batch_label_ != batch_label, continue); // A node can reuse blocks of the same stream and preorder streams if (CanReuseBySize(reusable_block_counts_, *reusable_block, block_size, real_size, continuous)) { @@ -901,7 +1030,7 @@ MemoryBlock *BlockMemAssigner::ApplyMemory(size_t block_size, size_t real_size, reusable_block->continuous_block_ = continuous; reusable_block->ref_count_++; ReduceReusableBlockCount(*reusable_block, reusable_block_counts_); - reusable_blocks_[memory_type][stream_id].erase(it); + reusable_blocks_[memory_type][stream_id].erase((++it).base()); return reusable_block; } } @@ -914,10 +1043,11 @@ MemoryBlock *BlockMemAssigner::ApplyMemory(size_t block_size, size_t real_size, // Data and netoutput need zero copy block block->is_zero_copy_ = IsZeroCopyBlock(n, continuous); - block->Init(real_size, mem_type, n, out_index, no_align_size); + block->Init(real_size, mem_type, n, out_index, no_align_size, node_op_desc->GetStreamId()); block->stream_id_ = node_op_desc->GetStreamId(); block->ref_count_++; block->continuous_block_ = continuous; + block->batch_label_ = batch_label; if (mem_type == kOutput) { auto iter = anchor_to_symbol_.find(NodeIndexIO(n, out_index, kOut).ToString()); if (iter != anchor_to_symbol_.end()) { @@ -945,6 +1075,11 @@ MemoryBlock *BlockMemAssigner::ApplyContinuousMemory(const NodePtr &n, const vec return nullptr; } + if (CheckIsZeroMemNodeType(n->GetType())) { + zero_memory_list_.emplace_back(n, kOutput, index); + continue; + } + int64_t size = 0; if (ge::TensorUtils::GetSize(*output_op_desc, size) != SUCCESS) { GELOGI("Get size failed"); @@ -957,9 +1092,7 @@ MemoryBlock *BlockMemAssigner::ApplyContinuousMemory(const NodePtr &n, const vec // only apply total size in first block if (index != 0) { zero_memory_list_.emplace_back(n, kOutput, index); - } - - if (index == 0) { + } else { NodeIndexIO node_index_io(n, index, kOut); auto iter = 
anchor_to_symbol_.find(node_index_io.ToString()); if (iter != anchor_to_symbol_.end()) { @@ -972,6 +1105,10 @@ MemoryBlock *BlockMemAssigner::ApplyContinuousMemory(const NodePtr &n, const vec } } + if (total_size == 0) { + return nullptr; + } + auto block_size = GetBlockSize(total_size, ranges); GELOGI("Node[%s] continuous out memory size[%ld] block size[%zu]", node_op_desc->GetName().c_str(), total_size, block_size); @@ -1119,15 +1256,28 @@ bool IsKnownSubgraphData(const NodePtr &node) { return node->GetOpDesc()->HasAttr(ATTR_NAME_PARENT_NODE_INDEX); } -void BlockMemAssigner::ReleaseMemory(MemoryBlock *to_release, vector &reusable_memory) { +void BlockMemAssigner::ReleaseMemory(MemoryBlock *to_release, vector &reusable_memory, + bool same_stream) { GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(to_release == nullptr, return, "Input parameter to_release is null."); GE_CHK_TRUE_EXEC_INFO(to_release->ref_count_ <= 0, return, "Release memory"); GE_CHK_TRUE_EXEC_INFO(!to_release->reuse_mem_, return, "doesn't reuse memory"); --to_release->ref_count_; + if (!same_stream) { + to_release->same_stream_ = false; + } if (to_release->ref_count_ == 0) { - to_release->SetLifeTimeEnd(life_time_); - reusable_memory.emplace_back(to_release); - AddReusableBlockCount(*to_release, reusable_block_counts_); + if (to_release->reuse_mem_ && !to_release->RealSizeList().empty()) { + if (to_release->batch_label_.empty() || (to_release->batch_label_ == max_batch_label_)) { + size_t align_size = to_release->RealSizeList().back(); + AlignMemOffset(align_size); + theory_memory_size_ -= align_size; + } + } + if (to_release->same_stream_) { + to_release->SetLifeTimeEnd(life_time_); + reusable_memory.emplace_back(to_release); + AddReusableBlockCount(*to_release, reusable_block_counts_); + } } } @@ -1167,10 +1317,9 @@ void BlockMemAssigner::ReleaseInputNodeOutMemory(const unordered_mapGetName().c_str()); if ((node_type_indexs.back().node == in_anchor->GetPeerOutAnchor()->GetOwnerNode()) && - 
(node_type_indexs.back().index == static_cast(in_anchor->GetPeerOutAnchor()->GetIdx())) && - (node->GetOpDesc()->GetStreamId() == block->stream_id_)) { - ReleaseMemory(block, reusable_memory); - if (block->ref_count_ == 0) { + (node_type_indexs.back().index == static_cast(in_anchor->GetPeerOutAnchor()->GetIdx()))) { + ReleaseMemory(block, reusable_memory, (node->GetOpDesc()->GetStreamId() == block->stream_id_)); + if (block->ref_count_ == 0 && block->same_stream_) { SetLastUsedInputMemAttr(node, in_anchor->GetIdx()); } } @@ -1267,7 +1416,8 @@ Status BlockMemAssigner::AssignOutputMemoryWithReuse(const NodePtr &node, vector bool no_need_assign_memory = ((size == 0) || CheckIsZeroMemNodeType(node->GetType())); if (!no_need_assign_memory) { out_node_set_continuous_input = - IsOutNodeSetContinuousInput(node, i, peer_name, peer_input_index, no_need_assign_memory, reset_zero_copy_flag); + IsOutNodeSetContinuousInput(node, i, peer_name, peer_input_index, + no_need_assign_memory, reset_zero_copy_flag); GE_IF_BOOL_EXEC(!no_need_assign_memory, no_need_assign_memory = IsAtomicOutputMemory(node, i, is_atomic, out_node_set_continuous_input);); } @@ -1328,7 +1478,8 @@ void BlockMemAssigner::AssignMemoryWithReuse(vector &ranges) { iter->second[stream_id].clear(); } vector temp; - GetNodeWorkSpaceSize(n, temp); + int64_t tatal_size = 0; + GetNodeWorkSpaceSize(n, temp, tatal_size); vector workspace_bytes; vector tvm_workspace_memory_type; bool has_tvm_workspace_mem_type_attr = @@ -1349,7 +1500,7 @@ void BlockMemAssigner::AssignMemoryWithReuse(vector &ranges) { bool workspace_skip_flag = false; if (has_tvm_workspace_mem_type_attr && tvm_workspace_memory_type[i] == RT_MEMORY_L1) { GELOGI( - "fusion: node[%s]workspace index[%zu] is not hbm type, add to zero_memory_list, workspace memory type [%ld]", + "fusion:node[%s]workspace index[%zu] is not hbm type, add to zero_memory_list, workspace memory type [%ld]", node_op_desc->GetName().c_str(), i, tvm_workspace_memory_type[i]); 
workspace_skip_flag = true; } @@ -1380,9 +1531,7 @@ void BlockMemAssigner::AssignMemoryWithReuse(vector &ranges) { (void)mem_block; // Fix warning } - bool merge_dynamic_batch = false; - GE_IF_BOOL_EXEC(!(ge_disable_reuse_mem_env_ == "1"), merge_dynamic_batch = MergeDynamicBatchBlocks()); - GE_IF_BOOL_EXEC((!(ge_disable_reuse_mem_env_ == "1") && !merge_dynamic_batch), ReuseBlocksByLifeTime(ranges.size())); + GE_IF_BOOL_EXEC(!(ge_disable_reuse_mem_env_ == "1"), ReuseBlocksByLifeTime(ranges.size())); AssignContinuousBlocks(); ResizeMemoryBlocks(); @@ -1402,92 +1551,19 @@ void BlockMemAssigner::CheckWorkspaceReuse(const vector &workspace_reuse_f } } -void BlockMemAssigner::GetNodeWorkSpaceSize(const NodePtr &node, vector &workspace_memory) { +void BlockMemAssigner::GetNodeWorkSpaceSize(const NodePtr &node, vector &workspace_memory, + int64_t &total_size) { GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(node->GetOpDesc() == nullptr, return, "Op desc is null."); vector workspace_byte_nums = node->GetOpDesc()->GetWorkspaceBytes(); GELOGD("node[%s] size:%zu", node->GetOpDesc()->GetName().c_str(), workspace_byte_nums.size()); for (int64_t byte_size : workspace_byte_nums) { workspace_memory.emplace_back(byte_size); + total_size += byte_size; GELOGD("push back size:%ld", byte_size); } } -// descending order -static bool CompareBlockMaxSize(MemoryBlock *left, MemoryBlock *right) { - if (left == nullptr || right == nullptr) { - return false; - } - auto left_max_size = std::max_element(left->RealSizeList().begin(), left->RealSizeList().end()); - if (left_max_size != left->RealSizeList().end()) { - auto right_max_size = std::max_element(right->RealSizeList().begin(), right->RealSizeList().end()); - if (right_max_size == right->RealSizeList().end() || (*left_max_size > *right_max_size)) { - return true; - } - } - return false; -} - -void MergeBlocks(std::vector &dest, std::vector &src) { - for (size_t i = 0; i < dest.size(); ++i) { - if (i >= src.size()) { - return; - } - if (dest[i] != nullptr 
&& src[i] != nullptr) { - if (!dest[i]->reuse_mem_ || !src[i]->reuse_mem_) { - GELOGD("Diff batch's workspace can't be reused, i: %zu, dest[i]: %s, stream: %ld, src[i]: %s, stream: %ld.", - i, dest[i]->String().c_str(), dest[i]->stream_id_, src[i]->String().c_str(), src[i]->stream_id_); - continue; - } - for (auto &symbol : src[i]->SymbolList()) { - dest[i]->AddSymbol(symbol); - } - for (size_t j = 0; j < src[i]->NodeTypeIndexList().size(); ++j) { - dest[i]->AddNodeTypeIndex(src[i]->NodeTypeIndexList()[j], - src[i]->RealSizeList()[j], - src[i]->NoAlignSizeList()[j]); - src[i]->deleted_block_ = true; - } - } - } -} - -bool BlockMemAssigner::MergeDynamicBatchBlocks() { - bool merged = false; - std::map> dynamic_batch_blocks; - for (auto block : memory_blocks_) { - if (block == nullptr) { - continue; - } - std::string batch_label; - if (block->IsSameLabel(batch_label)) { - dynamic_batch_blocks[batch_label].emplace_back(block); - } - } - - auto it = dynamic_batch_blocks.begin(); - auto it_max = it; - - // find max block counts - for (; it != dynamic_batch_blocks.end(); ++it) { - if (it->second.size() > it_max->second.size()) { - it_max = it; - } - std::sort(it->second.begin(), it->second.end(), CompareBlockMaxSize); - } - if (it_max != dynamic_batch_blocks.end()) { - GELOGD("MergeDynamicBatch %s block counts %zu", it_max->first.c_str(), it_max->second.size()); - } - for (it = dynamic_batch_blocks.begin(); it != dynamic_batch_blocks.end(); ++it) { - if (it != it_max) { - GELOGD("MergeDynamicBatch from %s to %s", it->first.c_str(), it_max->first.c_str()); - MergeBlocks(it_max->second, it->second); - merged = true; - } - } - return merged; -} - // asending order static bool CompareBlockIndex(MemoryBlock *left, MemoryBlock *right) { if (left == nullptr || right == nullptr) { @@ -1597,38 +1673,93 @@ void BlockMemAssigner::ReuseBlocksByLifeTime(size_t range_size) { } } +void AddBlockMemOffset(size_t &mem_offset, size_t &p2p_mem_offset, MemoryBlock &block) { + if 
(block.memory_type_ == RT_MEMORY_HBM) { + if (block.first_continuous_block_) { + mem_offset += MEM_ALIGN_SIZE; + } + block.Resize(); + block.SetHeadOffset(mem_offset); + mem_offset += block.Size(); + block.SetTailOffset(mem_offset - 1); + } else if (block.memory_type_ == RT_MEMORY_P2P_DDR) { + if (block.first_continuous_block_) { + p2p_mem_offset += MEM_ALIGN_SIZE; + } + block.Resize(); + block.SetHeadOffset(p2p_mem_offset); + p2p_mem_offset += block.Size(); + block.SetTailOffset(p2p_mem_offset - 1); + } +} + +bool DynamicBatchBlockReuse(MemoryBlock &block) { + return (block.IsSameBatchLabel() && block.reuse_mem_); +} + /// /// @ingroup domi_omg -/// @brief traverse memory size, resize, calculate offset +/// @brief get max batch memory size, others reuse this block memory /// @param [in&out] memory_blocks_ memory block, after calculating offset +/// |-dynamic batch block batch1| +/// |-dynamic batch block batch2----| +/// |-dynamic batch block batch3--| /// -void BlockMemAssigner::ResizeMemoryBlocks() { - for (auto &memory_block : memory_blocks_) { - if (memory_block == nullptr || memory_block->deleted_block_ || memory_block->is_zero_copy_) { +void BlockMemAssigner::ResizeDynamicBatchBlocks() { + std::map> dynamic_batch_blocks; + for (auto block : memory_blocks_) { + if (block == nullptr) { continue; } - if (memory_block->memory_type_ == RT_MEMORY_HBM) { - if (memory_block->first_continuous_block_) { - mem_offset_ += MEM_ALIGN_SIZE; - } + // when memory is not reuseable, it can't be reused by different branch + if (DynamicBatchBlockReuse(*block)) { + dynamic_batch_blocks[block->batch_label_].emplace_back(block); + } + } - memory_block->Resize(); - memory_block->SetHeadOffset(mem_offset_); - mem_offset_ += memory_block->Size(); - memory_block->SetTailOffset(mem_offset_ - 1); - } else if (memory_block->memory_type_ == RT_MEMORY_P2P_DDR) { - if (memory_block->first_continuous_block_) { - p2p_mem_offset_ += MEM_ALIGN_SIZE; + size_t max_mem_offset = mem_offset_; + 
size_t max_p2p_mem_offset = p2p_mem_offset_; + for (auto &batch_blocks : dynamic_batch_blocks) { + size_t mem_offset = mem_offset_; + size_t p2p_mem_offset = p2p_mem_offset_; + for (auto block : batch_blocks.second) { + if (block == nullptr || block->deleted_block_ || block->is_zero_copy_) { + continue; } + AddBlockMemOffset(mem_offset, p2p_mem_offset, *block); + } + if (mem_offset > max_mem_offset) { + max_mem_offset = mem_offset; + } + if (p2p_mem_offset > max_p2p_mem_offset) { + max_p2p_mem_offset = p2p_mem_offset; + } + GELOGI("Batch[%s] offset[%zu] p2p_offset[%zu]", batch_blocks.first.c_str(), mem_offset, p2p_mem_offset); + } + mem_offset_ = max_mem_offset; + p2p_mem_offset_ = max_p2p_mem_offset; +} - memory_block->Resize(); - memory_block->SetHeadOffset(p2p_mem_offset_); - p2p_mem_offset_ += memory_block->Size(); - memory_block->SetTailOffset(p2p_mem_offset_ - 1); +/// +/// @ingroup domi_omg +/// @brief traverse memory size, resize, calculate offset +/// @param [in&out] memory_blocks_ memory block, after calculating offset +/// |-not dynamic batch block-||-dynamic batch block batch1| |-zero copy block-| +/// |-not dynamic batch block-||-dynamic batch block batch2----||-zero copy block-| +/// |-not dynamic batch block-||-dynamic batch block batch3--| |-zero copy block-| +/// +void BlockMemAssigner::ResizeMemoryBlocks() { + for (auto &memory_block : memory_blocks_) { + if (memory_block == nullptr || memory_block->deleted_block_ || memory_block->is_zero_copy_ + || DynamicBatchBlockReuse(*memory_block)) { + continue; } + + AddBlockMemOffset(mem_offset_, p2p_mem_offset_, *memory_block); } - GELOGD("mem_offset_ exclude zero_copy_memory is %zu, p2p_mem_offset_ exclude zero_copy_memory is %zu.", - mem_offset_, p2p_mem_offset_); + ResizeDynamicBatchBlocks(); + GELOGI("mem_offset_ exclude zero_copy_memory is %zu, p2p_mem_offset_ exclude zero_copy_memory is %zu," + "theory_min_memory_size %zu", mem_offset_, p2p_mem_offset_, theory_min_memory_size_); } /// @@ -1641,7 
+1772,7 @@ void BlockMemAssigner::ResizeMemoryBlocks() { /// @return Status result /// void SetOffsetSize(const NodeTypeIndex &node_type, const MemoryBlock *block, - size_t real_size, size_t no_align_size, bool child_block) { + size_t real_size, size_t no_align_size, int32_t child_block_level) { ge::OpDescPtr op_desc = node_type.node->GetOpDesc(); GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(op_desc == nullptr, return, "op_desc is null."); string graph_name = node_type.node->GetOwnerComputeGraph()->GetName(); @@ -1689,14 +1820,15 @@ void SetOffsetSize(const NodeTypeIndex &node_type, const MemoryBlock *block, } op_desc->SetWorkspace(workspace_list); } - GELOGI("[IMAS]Set %s name[%s] %s[%u] offset to [%ld] streamid[%ld] size[%zu] realsize[%zu]" - " noalignsize[%zu] life time begin[%zu] life time end[%zu] child[%d:%d:%d:%d] isref[%d].", graph_name.c_str(), + GELOGI("[IMAS]Set %s name[%s] %s[%u] offset to [%ld] streamid[%ld] size[%zu] realsize[%zu] noalignsize[%zu] " + "life time begin[%zu] life time end[%zu] child[%d:%d:%d:%d:%d] isref[%d] batch[%s]", graph_name.c_str(), op_desc->GetName().c_str(), node_type.GetMemType().c_str(), node_type.index, offset, op_desc->GetStreamId(), - block->Size(), real_size, no_align_size, op_desc->GetId(), end, child_block, block->reuse_mem_, - block->continuous_block_, block->deleted_block_, node_type.ref_input); + block->Size(), real_size, no_align_size, op_desc->GetId(), end, child_block_level, block->reuse_mem_, + block->continuous_block_, block->is_zero_copy_, block->same_stream_, node_type.ref_input, + block->batch_label_.c_str()); } -void SetBlockOpMemOffset(MemoryBlock *block, bool child_block) { +void SetBlockOpMemOffset(MemoryBlock *block, int32_t child_block_level) { if (block == nullptr) { return; } @@ -1709,9 +1841,14 @@ void SetBlockOpMemOffset(MemoryBlock *block, bool child_block) { real_size = block->RealSizeList()[index]; no_align_size = block->NoAlignSizeList()[index]; } - SetOffsetSize(node_type_index, block, real_size, 
no_align_size, child_block); + SetOffsetSize(node_type_index, block, real_size, no_align_size, child_block_level); index++; } + + child_block_level++; + for (MemoryBlock *child_block : block->ChildBlockList()) { + SetBlockOpMemOffset(child_block, child_block_level); + } } void BlockMemAssigner::SetOpMemOffset(bool is_zero_copy) { @@ -1724,16 +1861,13 @@ void BlockMemAssigner::SetOpMemOffset(bool is_zero_copy) { continue; } - SetBlockOpMemOffset(memory_block, false); - for (MemoryBlock *child_block : memory_block->ChildBlockList()) { - SetBlockOpMemOffset(child_block, true); - } + SetBlockOpMemOffset(memory_block, 0); } if (!is_zero_copy) { for (const NodeTypeIndex &node_type_index : zero_memory_list_) { MemoryBlock block(0, 0); - SetOffsetSize(node_type_index, &block, 0, 0, false); + SetOffsetSize(node_type_index, &block, 0, 0, 0); } } } diff --git a/ge/graph/build/memory/block_mem_assigner.h b/ge/graph/build/memory/block_mem_assigner.h index f3d26c1d..d514ca34 100755 --- a/ge/graph/build/memory/block_mem_assigner.h +++ b/ge/graph/build/memory/block_mem_assigner.h @@ -65,6 +65,7 @@ class MemoryBlock { stream_id_(stream_id), deleted_block_(false), reuse_mem_(reuse_mem), + same_stream_(true), input_index_(0), continuous_block_(false), first_continuous_block_(false), @@ -85,10 +86,14 @@ class MemoryBlock { symbol_list_.clear(); } - void Init(size_t real_size, OpMemoryType type, const ge::NodePtr &node, uint32_t out_index, size_t no_align_size) { + void Init(size_t real_size, OpMemoryType type, const ge::NodePtr &node, uint32_t out_index, size_t no_align_size, + int64_t stream_id) { real_size_list_.emplace_back(real_size); no_align_size_list_.emplace_back(no_align_size); node_type_index_list_.emplace_back(node, type, out_index, false); + if (stream_id != stream_id_) { + same_stream_ = false; + } } size_t Size() const { return block_size_; } @@ -106,6 +111,12 @@ class MemoryBlock { node_type_index_list_.emplace_back(node_type_index); 
real_size_list_.emplace_back(real_size); no_align_size_list_.emplace_back(no_align_size); + if ((node_type_index.node != nullptr) && (node_type_index.node->GetOpDesc() != nullptr)) { + auto stream_id = node_type_index.node->GetOpDesc()->GetStreamId(); + if (stream_id != stream_id_) { + same_stream_ = false; + } + } } void AddSymbol(const std::string &symbol) { @@ -122,7 +133,7 @@ class MemoryBlock { std::string String(); - bool IsSameLabel(std::string &first_batch_label); + bool IsSameBatchLabel(); void AddContinuousLifeReuseBlock(MemoryBlock *block, DependStreamLife &total_node_depend_stream_life); @@ -142,6 +153,7 @@ class MemoryBlock { int64_t stream_id_; bool deleted_block_; bool reuse_mem_; + bool same_stream_; uint32_t input_index_; bool continuous_block_; bool first_continuous_block_; @@ -149,6 +161,7 @@ class MemoryBlock { bool is_zero_copy_; std::map depend_stream_life_; int64_t memory_type_; + std::string batch_label_; private: size_t block_size_; std::vector real_size_list_; @@ -209,7 +222,7 @@ class BlockMemAssigner : public MemAssigner { void GetOutAndWorkSpaceMem(std::vector &all_memory_size); - void GetNodeWorkSpaceSize(const ge::NodePtr &node, std::vector &workspace_memory); + void GetNodeWorkSpaceSize(const ge::NodePtr &node, std::vector &workspace_memory, int64_t &total_size); /// /// @ingroup GE @@ -353,7 +366,7 @@ class BlockMemAssigner : public MemAssigner { /// @return void /// @author /// - void ReleaseMemory(MemoryBlock *to_release, vector &reusable_memory); + void ReleaseMemory(MemoryBlock *to_release, vector &reusable_memory, bool same_stream = true); /// /// @ingroup GE @@ -379,11 +392,11 @@ class BlockMemAssigner : public MemAssigner { /// /// @ingroup GE - /// @brief Merge memory blocks between different batchs + /// @brief Resize memory blocks for each batchs /// @return merge or not /// @author /// - bool MergeDynamicBatchBlocks(); + void ResizeDynamicBatchBlocks(); void AssignContinuousBlocks(); @@ -436,6 +449,17 @@ class 
BlockMemAssigner : public MemAssigner { int64_t atomic_addr_clean_id_ = 0; + size_t theory_min_memory_size_ = 0; + + size_t theory_memory_size_ = 0; + + std::string max_batch_label_; + + /// + /// @ [stream1][nodeid] + /// @[nodeid] [stream2][nodeid] + /// @ [stream2][nodeid] + /// DependStreamLife total_node_depend_stream_life_; }; } // namespace ge diff --git a/ge/graph/build/memory/graph_mem_assigner.cc b/ge/graph/build/memory/graph_mem_assigner.cc index ad0235d5..16d5d38f 100755 --- a/ge/graph/build/memory/graph_mem_assigner.cc +++ b/ge/graph/build/memory/graph_mem_assigner.cc @@ -419,7 +419,8 @@ Status GraphMemoryAssigner::AssignContinuousInputMemory(const ge::NodePtr &node, GE_IF_BOOL_EXEC(is_peer_output_continuous && (peer_output_size != 1), std::string error = "Current op" + FmtToStr(node->GetOpDesc()->GetName()) + " requires continuous input, while the previous op" + FmtToStr(peer_op_desc->GetName()) + - " requires continuous output. There may be conflict between the two. This node is not supported now."; + " requires continuous output. There may be conflict between the two." + + "This node is not supported now."; GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str()); return PARAM_INVALID;); @@ -429,7 +430,8 @@ Status GraphMemoryAssigner::AssignContinuousInputMemory(const ge::NodePtr &node, GE_IF_BOOL_EXEC(is_peer_reference, std::string error = "Current op" + FmtToStr(node->GetOpDesc()->GetName()) + " requires continuous input, while the previous op" + FmtToStr(peer_op_desc->GetName()) + - " requires continuous output. There may be conflict between the two. This node is not supported now."; + " requires continuous output. There may be conflict between the two." 
+ + "This node is not supported now."; GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str()); return PARAM_INVALID;); @@ -1646,9 +1648,9 @@ ge::Status GraphMemoryAssigner::SetAtomicCleanAttr(const NodePtr &node, const ve } string atomic_mem_size_str = ss.str(); - GELOGI("[IMAS]SetAtomicCleanAttr : Set graph[%s] atomic_node[%s] output offset [%s] size[%s] streamid[%ld]", + GELOGI("[IMAS]SetAtomicCleanAttr : Set %s atomic_node name[%s] output[0] offset to [%s] streamid[%ld] size[%s]", node->GetOwnerComputeGraph()->GetName().c_str(), node_op_desc->GetName().c_str(), - atomic_mem_start_str.c_str(), atomic_mem_size_str.c_str(), node->GetOpDesc()->GetStreamId()); + atomic_mem_start_str.c_str(), node->GetOpDesc()->GetStreamId(), atomic_mem_size_str.c_str()); } return SUCCESS; } diff --git a/ge/graph/build/model_builder.cc b/ge/graph/build/model_builder.cc index d7039cfb..37eb499a 100755 --- a/ge/graph/build/model_builder.cc +++ b/ge/graph/build/model_builder.cc @@ -282,7 +282,7 @@ Status ModelBuilder::SetInputOutputDesc() { void ModelBuilder::AddNodeInputProperty() { for (const ge::NodePtr &node : compute_graph_->GetNodes(compute_graph_->GetGraphUnknownFlag())) { auto node_op_desc = node->GetOpDesc(); - GE_IF_BOOL_EXEC(node_op_desc == nullptr, GELOGW("node_op_desc is nullptr!"); return ); + GE_IF_BOOL_EXEC(node_op_desc == nullptr, GELOGW("node_op_desc is nullptr!"); return); vector src_name_list; vector src_index_list; for (const auto &in_data_anchor : node->GetAllInDataAnchors()) { @@ -309,10 +309,10 @@ void ModelBuilder::AddNodeInputProperty() { for (const ge::NodePtr &node : compute_graph_->GetNodes(compute_graph_->GetGraphUnknownFlag())) { auto node_op_desc = node->GetOpDesc(); - GE_IF_BOOL_EXEC(node_op_desc == nullptr, GELOGW("node_op_desc is nullptr!"); return ); + GE_IF_BOOL_EXEC(node_op_desc == nullptr, GELOGW("node_op_desc is nullptr!"); return); GE_IF_BOOL_EXEC(node_op_desc->GetType() == NETOUTPUT, continue); auto out_control_anchor = node->GetOutControlAnchor(); - 
GE_IF_BOOL_EXEC(out_control_anchor == nullptr, GELOGW("out_control_anchor is nullptr"); return ); + GE_IF_BOOL_EXEC(out_control_anchor == nullptr, GELOGW("out_control_anchor is nullptr"); return); vector dst_name_list; vector dst_index_list; string dst_name_temp; @@ -330,7 +330,7 @@ void ModelBuilder::AddNodeInputProperty() { dst_name_temp = ""; int64_t dst_index = kWrongIndex; // assign an impossible value to dst_index. for (const auto &in_data_anchor : out_data_anchor->GetPeerInDataAnchors()) { - GE_IF_BOOL_EXEC(in_data_anchor == nullptr, GELOGW("in_data_anchor is nullptr"); return ); + GE_IF_BOOL_EXEC(in_data_anchor == nullptr, GELOGW("in_data_anchor is nullptr"); return); ge::NodePtr dst_node = in_data_anchor->GetOwnerNode(); dst_name_temp = dst_name_temp.empty() ? dst_node->GetName() : dst_name_temp + ":" + dst_node->GetName(); dst_index = in_data_anchor->GetIdx(); diff --git a/ge/graph/build/stream_allocator.cc b/ge/graph/build/stream_allocator.cc index 4378f71b..a1cda506 100644 --- a/ge/graph/build/stream_allocator.cc +++ b/ge/graph/build/stream_allocator.cc @@ -49,7 +49,8 @@ inline bool HasContinuousStreamLabel(const ge::OpDescPtr &op_desc, std::string & } bool IsHcclOp(const string &op_type) { - const set hccl_op_types({ge::HCOMBROADCAST, ge::HCOMALLGATHER, ge::HCOMALLREDUCE, ge::HCOMREDUCESCATTER, ge::HCOMREDUCE}); + const set hccl_op_types({ge::HCOMBROADCAST, ge::HCOMALLGATHER, + ge::HCOMALLREDUCE, ge::HCOMREDUCESCATTER, ge::HCOMREDUCE}); return hccl_op_types.find(op_type) != hccl_op_types.end(); } } // namespace diff --git a/ge/graph/build/stream_graph_optimizer.cc b/ge/graph/build/stream_graph_optimizer.cc index 582c080b..2933d413 100644 --- a/ge/graph/build/stream_graph_optimizer.cc +++ b/ge/graph/build/stream_graph_optimizer.cc @@ -38,7 +38,7 @@ void StreamGraphOptimizer::RefreshNodeId(const ComputeGraphPtr &comp_graph, Grap continue; } for (ge::NodePtr &node : subgraph->GetDirectNode()) { - GE_CHECK_NOTNULL_EXEC(node->GetOpDesc(), return ); + 
GE_CHECK_NOTNULL_EXEC(node->GetOpDesc(), return); if ((node->GetType() == END) || (node->GetType() == PLACEHOLDER)) { node->GetOpDesc()->SetId(static_cast(node_size)); node_size++; diff --git a/ge/graph/build/task_generator.cc b/ge/graph/build/task_generator.cc index 41607f1f..b506f945 100755 --- a/ge/graph/build/task_generator.cc +++ b/ge/graph/build/task_generator.cc @@ -49,8 +49,6 @@ const char *const kIsLastNode = "is_last_node"; const char *const kIsInputVar = "INPUT_IS_VAR"; const char *const kIsOutputVar = "OUTPUT_IS_VAR"; const char *const kProfilingMode = "PROFILING_MODE"; -const char *const kProfilingFpPoint = "FP_POINT"; -const char *const kProfilingBpPoint = "BP_POINT"; const uint32_t kProfilingArStep = 2; const uint64_t kProfilingFpStartLogid = 1; const uint64_t kProfilingBpEndLogid = 2; @@ -810,35 +808,23 @@ Status TaskGenerator::GetFpBpIndex(const ComputeGraphPtr &graph, ProfilingPoint vector &all_reduce_nodes, std::string &fp_point_str, std::string &bp_point_str) const { - if (ge::GetContext().GetOption(OPTION_EXEC_PROFILING_FPPONIT_OPTIONS, fp_point_str) == SUCCESS && - ge::GetContext().GetOption(OPTION_EXEC_PROFILING_BPPONIT_OPTIONS, bp_point_str) == SUCCESS && - !fp_point_str.empty() && !bp_point_str.empty()) { - return SUCCESS; - } + ProfilingManager::Instance().GetFpBpPoint(fp_point_str, bp_point_str); Status ret = SUCCESS; - const char *fp_point = std::getenv(kProfilingFpPoint); - if (fp_point == nullptr) { + if (fp_point_str.empty()) { ret = AutoFindFpOpIndex(graph, profiling_point); if (ret != SUCCESS) { GELOGW("First forward profiling op_index not set and FindFpOpIndex failed."); return FAILED; } - } else { - fp_point_str = string(fp_point); - GELOGI("Get fp_point_str from env %s", fp_point_str.c_str()); } - const char *bp_point = std::getenv(kProfilingBpPoint); - if (bp_point == nullptr) { + if (bp_point_str.empty()) { ret = AutoFindBpOpIndex(graph, profiling_point, all_reduce_nodes); if (ret != SUCCESS) { GELOGW("Last backward profiling 
op_index not set and FindBpOpIndex failed."); return FAILED; } - } else { - bp_point_str = string(bp_point); - GELOGI("Get bp_point_str from env %s", bp_point_str.c_str()); } return SUCCESS; diff --git a/ge/graph/label/case_label_maker.h b/ge/graph/label/case_label_maker.h index 1078a906..3dbfb2bc 100644 --- a/ge/graph/label/case_label_maker.h +++ b/ge/graph/label/case_label_maker.h @@ -86,7 +86,6 @@ | Node | +------------+ *******************************************************************************/ - namespace ge { class CaseOpLabelMaker : public LabelMaker { public: diff --git a/ge/graph/label/if_label_maker.h b/ge/graph/label/if_label_maker.h index 0807f549..8b07eb96 100644 --- a/ge/graph/label/if_label_maker.h +++ b/ge/graph/label/if_label_maker.h @@ -70,7 +70,6 @@ | Node | +------------+ *******************************************************************************/ - namespace ge { class IfOpLabelMaker : public LabelMaker { public: diff --git a/ge/graph/label/partitioned_call_label_maker.h b/ge/graph/label/partitioned_call_label_maker.h index b89cb94c..3944aabd 100644 --- a/ge/graph/label/partitioned_call_label_maker.h +++ b/ge/graph/label/partitioned_call_label_maker.h @@ -54,7 +54,6 @@ | c | +---------------+ *******************************************************************************/ - namespace ge { class PartitionedCallLabelMaker : public LabelMaker { public: diff --git a/ge/graph/label/while_label_maker.h b/ge/graph/label/while_label_maker.h index 0eb0deee..6c30475b 100644 --- a/ge/graph/label/while_label_maker.h +++ b/ge/graph/label/while_label_maker.h @@ -70,7 +70,6 @@ | Node | +------------+ *******************************************************************************/ - namespace ge { class WhileOpLabelMaker : public LabelMaker { public: diff --git a/ge/graph/load/graph_loader.cc b/ge/graph/load/graph_loader.cc index aa825a5d..44556422 100755 --- a/ge/graph/load/graph_loader.cc +++ b/ge/graph/load/graph_loader.cc @@ -283,7 +283,8 @@ 
Status GraphLoader::ExecuteModel(uint32_t model_id, rtStream_t stream, bool asyn std::vector &output_desc) { auto model_manager = ModelManager::GetInstance(); GE_CHECK_NOTNULL(model_manager); - Status ret = model_manager->ExecuteModel(model_id, stream, async_mode, input_data, input_desc, output_data, output_desc); + Status ret = model_manager->ExecuteModel(model_id, stream, async_mode, + input_data, input_desc, output_data, output_desc); if (ret != SUCCESS) { GELOGE(ret, "Execute model failed, model_id:%u.", model_id); return ret; diff --git a/ge/graph/load/new_model_manager/data_dumper.cc b/ge/graph/load/new_model_manager/data_dumper.cc index 4534fe73..b331d780 100644 --- a/ge/graph/load/new_model_manager/data_dumper.cc +++ b/ge/graph/load/new_model_manager/data_dumper.cc @@ -919,11 +919,11 @@ Status DataDumper::DumpExceptionInfo(const std::vector exceptio ReplaceStringElem(op_name); ReplaceStringElem(op_type); string dump_file_path = - "./" + op_type + "." + op_name + "." + to_string(op_desc_info.task_id) + "." + to_string(now_time); + "./" + op_type + "." + op_name + "." + std::to_string(op_desc_info.task_id) + "." 
+ std::to_string(now_time); GELOGI("The exception dump file path is %s", dump_file_path.c_str()); uint64_t proto_size = dump_data.ByteSizeLong(); - unique_ptr proto_msg(new (std::nothrow) char[proto_size]); + std::unique_ptr proto_msg(new (std::nothrow) char[proto_size]); bool ret = dump_data.SerializeToArray(proto_msg.get(), proto_size); if (!ret || proto_size == 0) { GELOGE(PARAM_INVALID, "Dump data proto serialize failed"); diff --git a/ge/graph/load/new_model_manager/davinci_model.cc b/ge/graph/load/new_model_manager/davinci_model.cc index 81d47b3b..bc755e07 100755 --- a/ge/graph/load/new_model_manager/davinci_model.cc +++ b/ge/graph/load/new_model_manager/davinci_model.cc @@ -16,7 +16,6 @@ #include "graph/load/new_model_manager/davinci_model.h" -#include #include #include #include @@ -84,7 +83,7 @@ const uint32_t kAddrLen = sizeof(void *); const int kDecimal = 10; const int kBytes = 8; const uint32_t kDataMemAlignSizeCompare = 64; -const uint32_t kDumpL1FusionOpMByteSize = 2 * 1024 * 1024; +const uint32_t kDumpL1FusionOpMByteSize = 2097152; // 2 * 1024 * 1024 const uint32_t kDumpFlagOfL1Fusion = 0; const char *const kDefaultBatchLable = "Batch_default"; const char *const kGetDynamicDimsName = "ascend_mbatch_get_dynamic_dims_node"; @@ -331,8 +330,8 @@ Status DavinciModel::InitFeatureMapAndP2PMem(void *dev_ptr, size_t mem_size) { GELOGE(GE_EXEC_ALLOC_FEATURE_MAP_MEM_FAILED, "Alloc feature map memory failed. 
size: %zu", data_size); return GE_EXEC_ALLOC_FEATURE_MAP_MEM_FAILED; } - GEEVENT("[IMAS]InitFeatureMapAndP2PMem graph_%u MallocMemory type[F] memaddr[%p] mem_size[%zu]", runtime_param_.graph_id, - mem_base_, data_size); + GEEVENT("[IMAS]InitFeatureMapAndP2PMem graph_%u MallocMemory type[F] memaddr[%p] mem_size[%zu]", + runtime_param_.graph_id, mem_base_, data_size); if (!is_inner_weight_base_) { weights_mem_base_ = mem_base_; @@ -713,7 +712,7 @@ Status DavinciModel::Init(void *dev_ptr, size_t mem_size, void *weight_ptr, size // collect profiling for ge auto &profiling_manager = ProfilingManager::Instance(); if (profiling_manager.ProfilingModelLoadOn()) { - Status p_ret = ReportProfilingData(!profiling_manager.IsAclApiMode()); + Status p_ret = ReportProfilingData(); if (p_ret != SUCCESS) { GELOGE(p_ret, "Report profiling data failed."); return p_ret; @@ -724,14 +723,14 @@ Status DavinciModel::Init(void *dev_ptr, size_t mem_size, void *weight_ptr, size return ret; } -Status DavinciModel::ReportProfilingData(bool check_device) { +Status DavinciModel::ReportProfilingData() { std::vector compute_graph_desc_info; Status ret = GetComputeGraphInfo(compute_graph_desc_info); if (ret != SUCCESS) { GELOGE(ret, "GetComputeGraphInfo failed."); return ret; } - ProfilingManager::Instance().ReportProfilingData(model_id_, GetTaskDescInfo(), compute_graph_desc_info, check_device); + ProfilingManager::Instance().ReportProfilingData(model_id_, GetTaskDescInfo(), compute_graph_desc_info); GE_CHK_STATUS(SinkModelProfile(), "Sink model profiler failed."); op_list_.clear(); @@ -1544,7 +1543,8 @@ Status DavinciModel::LoadWithQueue() { } if (output_queue_ids_.size() != new_output_data_info_.size()) { - GELOGE(ACL_ERROR_GE_EXEC_MODEL_QUEUE_ID_INVALID, "Output queue ids not match model: output_queue=%zu output_data=%zu", + GELOGE(ACL_ERROR_GE_EXEC_MODEL_QUEUE_ID_INVALID, + "Output queue ids not match model: output_queue=%zu output_data=%zu", output_queue_ids_.size(), 
new_output_data_info_.size()); return ACL_ERROR_GE_EXEC_MODEL_QUEUE_ID_INVALID; } @@ -2186,8 +2186,9 @@ Status DavinciModel::CopyInputData(const InputData &input_data, bool device_data const std::vector &blobs = input_data.blobs; for (const auto &data : new_input_data_info_) { if (data.first >= blobs.size()) { - GELOGE(FAILED, "Blobs not match: blobs=%zu, tensor=%zu, index=%u, size=%ld", blobs.size(), - new_input_data_info_.size(), data.first, data.second.GetDataInfo().at(0).first); + GELOGE(FAILED, "Blobs not match: blobs=%zu, tensor=%zu, index=%u, size=%ld, op_name(%s)", blobs.size(), + new_input_data_info_.size(), data.first, data.second.GetDataInfo().at(0).first, + data.second.GetOpName().c_str()); return FAILED; } @@ -2198,13 +2199,14 @@ Status DavinciModel::CopyInputData(const InputData &input_data, bool device_data } uint64_t data_size = data.second.GetDataSize(); GE_CHK_BOOL_RET_STATUS(data_size >= data_buf.length, PARAM_INVALID, - "input data size(%lu) does not match model required size(%lu), ret failed.", data_buf.length, - data_size); + "input data size(%lu) does not match model required size(%lu), op_name(%s) ret failed.", + data_buf.length, data_size, data.second.GetOpName().c_str()); void *mem_addr = data.second.GetBasicAddr(); void *data_buf_addr = reinterpret_cast(reinterpret_cast(data_buf.data)); uint64_t data_buf_length = data_buf.length; - GELOGI("[IMAS]CopyPlainData memcpy graph_%u type[F] input[%u] dst[%p] src[%p] mem_size[%lu] datasize[%lu]", - runtime_param_.graph_id, data.first, mem_addr, data_buf_addr, data_size, data_buf_length); + GELOGI("CopyPlainData memcpy graph_%u type[F] input[%s] rank[%u] dst[%p] src[%p] mem_size[%lu] datasize[%lu]", + runtime_param_.graph_id, data.second.GetOpName().c_str(), data.first, mem_addr, data_buf_addr, data_size, + data_buf_length); GE_CHK_RT_RET(rtMemcpy(mem_addr, data_size, data_buf_addr, data_buf_length, kind)); } @@ -2248,10 +2250,8 @@ inline int64_t SumSize(const vector &size_list) { Status 
DavinciModel::SinkModelProfile() { // profiling plugin must be registered - Msprof::Engine::Reporter *reporter = PluginImpl::GetPluginReporter(); - GE_IF_BOOL_EXEC(reporter == nullptr, GELOGI("Profiling report is nullptr!"); return SUCCESS); - - Msprof::Engine::ReporterData reporter_data{}; + auto &prof_mgr = ProfilingManager::Instance(); + ReporterData reporter_data{}; // report model data tag name std::string tag_name; tag_name.append("model_load_info_").append(std::to_string(this->Id())); @@ -2269,32 +2269,32 @@ Status DavinciModel::SinkModelProfile() { reporter_data.deviceId = device_id_; reporter_data.data = (unsigned char *)&name_len; reporter_data.dataLen = sizeof(int32_t); - GE_CHK_BOOL_EXEC(reporter->Report(&reporter_data) == SUCCESS, return FAILED, "Reporter data fail, model id:%u.", - this->Id()); + GE_CHK_BOOL_EXEC(prof_mgr.CallMsprofReport(reporter_data) == 0, return FAILED, + "Reporter data fail, model id:%u.", this->Id()); reporter_data.data = (unsigned char *)name.c_str(); reporter_data.dataLen = name.size(); - GE_CHK_BOOL_EXEC(reporter->Report(&reporter_data) == SUCCESS, return FAILED, "Reporter data fail, model id:%u.", - this->Id()); + GE_CHK_BOOL_EXEC(prof_mgr.CallMsprofReport(reporter_data) == 0, return FAILED, + "Reporter data fail, model id:%u.", this->Id()); uint32_t model_id = this->Id(); reporter_data.data = (unsigned char *)&model_id; reporter_data.dataLen = sizeof(uint32_t); - GE_CHK_BOOL_EXEC(reporter->Report(&reporter_data) == SUCCESS, return FAILED, "Reporter data fail, model id:%u.", - this->Id()); + GE_CHK_BOOL_EXEC(prof_mgr.CallMsprofReport(reporter_data) == 0, return FAILED, + "Reporter data fail, model id:%u.", this->Id()); // Load Start/End Time int64_t start_time = this->GetLoadBeginTime(); reporter_data.data = (unsigned char *)&start_time; reporter_data.dataLen = sizeof(int64_t); - GE_CHK_BOOL_EXEC(reporter->Report(&reporter_data) == SUCCESS, return FAILED, "Reporter data fail, model id:%u.", - this->Id()); + 
GE_CHK_BOOL_EXEC(prof_mgr.CallMsprofReport(reporter_data) == 0, return FAILED, + "Reporter data fail, model id:%u.", this->Id()); int64_t end_time = this->GetLoadEndTime(); reporter_data.data = (unsigned char *)&end_time; reporter_data.dataLen = sizeof(int64_t); - GE_CHK_BOOL_EXEC(reporter->Report(&reporter_data) == SUCCESS, return FAILED, "Reporter data fail, model id:%u.", - this->Id()); + GE_CHK_BOOL_EXEC(prof_mgr.CallMsprofReport(reporter_data) == 0, return FAILED, + "Reporter data fail, model id:%u.", this->Id()); int32_t task_num = task_list_.size(); std::multimap op_id_map; @@ -2308,6 +2308,7 @@ Status DavinciModel::SinkModelProfile() { uint32_t op_num = fusion_op_info->original_op_names.size(); uint32_t task_id = task->GetTaskID(); if (op_num > 0) { + GELOGI("task.id = %u, opNum = %u", task_id, op_num); op_id_map.insert(std::make_pair(fusion_op_info->op_index, task_id)); } } @@ -2350,39 +2351,39 @@ Status DavinciModel::SinkModelProfile() { int32_t fusion_op_name_len = fusion_op_name.size() == 0 ? 
1 : fusion_op_name.size(); reporter_data.data = (unsigned char *)&fusion_op_name_len; reporter_data.dataLen = sizeof(int32_t); - GE_CHK_BOOL_EXEC(reporter->Report(&reporter_data) == SUCCESS, return FAILED, "Reporter data fail, model id:%u.", - this->Id()); + GE_CHK_BOOL_EXEC(prof_mgr.CallMsprofReport(reporter_data) == 0, return FAILED, + "Reporter data fail, model id:%u.", this->Id()); reporter_data.data = (unsigned char *)fusion_op_name.c_str(); reporter_data.dataLen = fusion_op_name_len; - GE_CHK_BOOL_EXEC(reporter->Report(&reporter_data) == SUCCESS, return FAILED, "Reporter data fail, model id:%u.", - this->Id()); + GE_CHK_BOOL_EXEC(prof_mgr.CallMsprofReport(reporter_data) == 0, return FAILED, + "Reporter data fail, model id:%u.", this->Id()); // original op name before fusion reporter_data.data = (unsigned char *)&op_num; reporter_data.dataLen = sizeof(int32_t); - GE_CHK_BOOL_EXEC(reporter->Report(&reporter_data) == SUCCESS, return FAILED, "Reporter data fail, model id:%u.", - this->Id()); + GE_CHK_BOOL_EXEC(prof_mgr.CallMsprofReport(reporter_data) == 0, return FAILED, + "Reporter data fail, model id:%u.", this->Id()); for (uint32_t k = 0; k < op_num; k++) { std::string op_name = fusion_op_info->original_op_names[k]; int32_t op_name_len = op_name.size() == 0 ? 
1 : op_name.size(); reporter_data.data = (unsigned char *)&op_name_len; reporter_data.dataLen = sizeof(int32_t); - GE_CHK_BOOL_EXEC(reporter->Report(&reporter_data) == SUCCESS, return FAILED, "Reporter data fail, model id:%u.", - this->Id()); + GE_CHK_BOOL_EXEC(prof_mgr.CallMsprofReport(reporter_data) == 0, return FAILED, + "Reporter data fail, model id:%u.", this->Id()); reporter_data.data = (unsigned char *)op_name.c_str(); reporter_data.dataLen = op_name_len; - GE_CHK_BOOL_EXEC(reporter->Report(&reporter_data) == SUCCESS, return FAILED, "Reporter data fail, model id:%u.", - this->Id()); + GE_CHK_BOOL_EXEC(prof_mgr.CallMsprofReport(reporter_data) == 0, return FAILED, + "Reporter data fail, model id:%u.", this->Id()); } // stream id info uint32_t streamId = task->GetStreamId(); reporter_data.data = (unsigned char *)&streamId; reporter_data.dataLen = sizeof(int32_t); - GE_CHK_BOOL_EXEC(reporter->Report(&reporter_data) == SUCCESS, return FAILED, "Reporter data fail, model id:%u.", - this->Id()); + GE_CHK_BOOL_EXEC(prof_mgr.CallMsprofReport(reporter_data) == 0, return FAILED, + "Reporter data fail, model id:%u.", this->Id()); // memory info struct memoryInfo memory_info; @@ -2398,22 +2399,22 @@ Status DavinciModel::SinkModelProfile() { memory_info.weight_size + memory_info.input_size + memory_info.output_size + memory_info.workspace_size; reporter_data.data = (unsigned char *)&memory_info; reporter_data.dataLen = sizeof(struct memoryInfo); - GE_CHK_BOOL_EXEC(reporter->Report(&reporter_data) == SUCCESS, return FAILED, "Reporter data fail, model id:%u.", - this->Id()); + GE_CHK_BOOL_EXEC(prof_mgr.CallMsprofReport(reporter_data) == 0, return FAILED, + "Reporter data fail, model id:%u.", this->Id()); // task info reporter_data.data = (unsigned char *)&task_count; reporter_data.dataLen = sizeof(uint32_t); - GE_CHK_BOOL_EXEC(reporter->Report(&reporter_data) == SUCCESS, return FAILED, "Reporter data fail, model id:%u.", - this->Id()); + 
GE_CHK_BOOL_EXEC(prof_mgr.CallMsprofReport(reporter_data) == 0, return FAILED, + "Reporter data fail, model id:%u.", this->Id()); Range task_range = op_id_map.equal_range(op_id); for (CIT idx = task_range.first; idx != task_range.second; ++idx) { uint32_t task_id = idx->second; reporter_data.data = (unsigned char *)&task_id; reporter_data.dataLen = sizeof(uint32_t); - GE_CHK_BOOL_EXEC(reporter->Report(&reporter_data) == SUCCESS, return FAILED, "Reporter data fail, model id:%u.", - this->Id()); + GE_CHK_BOOL_EXEC(prof_mgr.CallMsprofReport(reporter_data) == 0, return FAILED, + "Reporter data fail, model id:%u.", this->Id()); } } } @@ -2422,10 +2423,8 @@ Status DavinciModel::SinkModelProfile() { Status DavinciModel::SinkTimeProfile(const InputData ¤t_data) { // profiling plugin must be registered - Msprof::Engine::Reporter *reporter = PluginImpl::GetPluginReporter(); - GE_IF_BOOL_EXEC(reporter == nullptr, GELOGI("Profiling report is nullptr!"); return SUCCESS); - - Msprof::Engine::ReporterData reporter_data{}; + auto &prof_mgr = ProfilingManager::Instance(); + ReporterData reporter_data{}; // report model data tag name std::string tag_name; tag_name.append("model_time_info_") @@ -2448,33 +2447,33 @@ Status DavinciModel::SinkTimeProfile(const InputData ¤t_data) { size_t name_len = name.size(); reporter_data.data = (unsigned char *)&name_len; reporter_data.dataLen = sizeof(int32_t); - GE_CHK_BOOL_EXEC(reporter->Report(&reporter_data) == SUCCESS, return FAILED, "Reporter data fail, model id:%u.", - this->Id()); + GE_CHK_BOOL_EXEC(prof_mgr.CallMsprofReport(reporter_data) == 0, return FAILED, + "Reporter data fail, model id:%u.", this->Id()); reporter_data.data = (unsigned char *)name.c_str(); reporter_data.dataLen = name.size(); - GE_CHK_BOOL_EXEC(reporter->Report(&reporter_data) == SUCCESS, return FAILED, "Reporter data fail, model id:%u.", - this->Id()); + GE_CHK_BOOL_EXEC(prof_mgr.CallMsprofReport(reporter_data) == 0, return FAILED, + "Reporter data fail, model 
id:%u.", this->Id()); // request id uint64_t request_id = current_data.request_id; reporter_data.data = (unsigned char *)&request_id; reporter_data.dataLen = sizeof(uint32_t); - GE_CHK_BOOL_EXEC(reporter->Report(&reporter_data) == SUCCESS, return FAILED, + GE_CHK_BOOL_EXEC(prof_mgr.CallMsprofReport(reporter_data) == 0, return FAILED, "Reporter data fail, model id:%u, data index:%u.", this->Id(), current_data.index); // thread id int32_t thread_id = GetDataInputTid(); reporter_data.data = (unsigned char *)&thread_id; reporter_data.dataLen = sizeof(int32_t); - GE_CHK_BOOL_EXEC(reporter->Report(&reporter_data) == SUCCESS, return FAILED, + GE_CHK_BOOL_EXEC(prof_mgr.CallMsprofReport(reporter_data) == 0, return FAILED, "Reporter data fail, model id:%u, data index:%u.", this->Id(), current_data.index); // time info time_info_.modelId = this->Id(); reporter_data.data = (unsigned char *)&time_info_; reporter_data.dataLen = sizeof(struct timeInfo); - GE_CHK_BOOL_EXEC(reporter->Report(&reporter_data) == SUCCESS, return FAILED, + GE_CHK_BOOL_EXEC(prof_mgr.CallMsprofReport(reporter_data) == 0, return FAILED, "Reporter data fail, model id:%u, data index:%u.", this->Id(), current_data.index); return SUCCESS; @@ -2696,8 +2695,9 @@ Status DavinciModel::ReturnResult(uint32_t data_id, const bool rslt_flg, const b is_getnext_sink_dynamic_ = true; cur_dynamic_dims_.clear(); cur_dynamic_dims_.resize(shape_of_cur_dynamic_dims_); - GE_CHK_RT_RET(rtMemcpy(cur_dynamic_dims_.data(), shape_of_cur_dynamic_dims_ * sizeof(int64_t), - netoutput_last_input_addr_, netoutput_last_input_size_, RT_MEMCPY_DEVICE_TO_HOST)); + auto ret = rtMemcpy(cur_dynamic_dims_.data(), shape_of_cur_dynamic_dims_ * sizeof(int64_t), + netoutput_last_input_addr_, netoutput_last_input_size_, RT_MEMCPY_DEVICE_TO_HOST); + GE_CHK_RT_RET(ret); } GELOGD("Cur dynamic dims is %s.", formats::JoinToString(cur_dynamic_dims_).c_str()); if (GenOutputTensorInfo(op_desc, data_index, output_data, outputs) != SUCCESS) { @@ -2801,76 
+2801,42 @@ void *DavinciModel::Run(DavinciModel *model) { reinterpret_cast(shape_data_buffer_data) + shape_data_buffer_length / sizeof(int64_t)); GELOGD("Data: cur dynamic dims is %s", formats::JoinToString(model->cur_dynamic_dims_).c_str()); - delete[] (int64_t *)current_data.blobs.back().data; + delete[] reinterpret_cast(current_data.blobs.back().data); current_data.blobs.pop_back(); } GE_IF_BOOL_EXEC(ProfilingManager::Instance().ProfilingModelExecuteOn(), model->SetProfileTime(MODEL_PRE_PROC_END)); GE_IF_BOOL_EXEC(ProfilingManager::Instance().ProfilingModelExecuteOn(), model->SetProfileTime(MODEL_INFER_START)); - if (ProfilingManager::Instance().ProfilingOpTraceOn()) { - GELOGI("GetOpTraceIterNum:%d", ProfilingManager::Instance().GetOpTraceIterNum()); - for (int32_t i = 0; i < ProfilingManager::Instance().GetOpTraceIterNum(); i++) { - if (!ProfilingManager::Instance().ProfilingLoadFlag()) { - vector prof_device_id_vec = ProfilingManager::Instance().GetProfilingDeviceId(); - for (size_t j = 0; j < prof_device_id_vec.size(); ++j) { - // just profiling, no need to check value - (void)ProfilingManager::Instance().StartProfiling(i, prof_device_id_vec[j]); - } - } - - GELOGI("rtModelExecute start."); - rt_ret = rtModelExecute(model->rt_model_handle_, model->rt_model_stream_, 0); - GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, rslt_flg = false; - (void)model->ReturnResult(current_data.index, false, false, data_wrapper->GetOutput()); - continue); // [No need to check value] - GELOGI("rtModelExecute end"); - - GELOGI("rtStreamSynchronize start."); - rt_ret = rtStreamSynchronize(model->rt_model_stream_); - if (rt_ret == kModelAbortNormal || rt_ret == kModelAbortNormalNew) { - GELOGI("The model with multiple datasets aborts normally."); - } else { - GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, rslt_flg = false; - (void)model->ReturnResult(current_data.index, false, seq_end_flag, data_wrapper->GetOutput()); - continue); // [No need to check value] - } - - 
GELOGI("rtStreamSynchronize end."); - (void)ProfilingManager::Instance().StopProfiling(); // just profiling, no need to check value - } + GE_TIMESTAMP_START(rtModelExecute); + GELOGI("rtModelExecute start."); + rt_ret = rtModelExecute(model->rt_model_handle_, model->rt_model_stream_, 0); + GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, rslt_flg = false; + (void)model->ReturnResult(current_data.index, false, false, data_wrapper->GetOutput()); + CsaInteract::GetInstance().WriteErrorCode(rt_ret, ERROR_MODULE_RUNTIME, JOBSUBSTATE_GRAPH_EXEC); + continue); + GELOGI("rtModelExecute end"); + GE_IF_BOOL_EXEC(model->is_first_execute_, GE_TIMESTAMP_EVENT_END(rtModelExecute, "GraphExcute::rtModelExecute")); + + GE_TIMESTAMP_START(rtStreamSynchronize); + GELOGI("rtStreamSynchronize start."); + rt_ret = rtStreamSynchronize(model->rt_model_stream_); + if (rt_ret == kEndOfSequence || rt_ret == kEndOfSequenceNew) { + seq_end_flag = true; + } + if (rt_ret == kModelAbortNormal || rt_ret == kModelAbortNormalNew) { + GELOGI("The model with multiple datasets aborts normally."); } else { - GE_TIMESTAMP_START(rtModelExecute); - GELOGI("rtModelExecute start."); - rt_ret = rtModelExecute(model->rt_model_handle_, model->rt_model_stream_, 0); - GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, rslt_flg = false; - (void)model->ReturnResult(current_data.index, false, false, data_wrapper->GetOutput()); - CsaInteract::GetInstance().WriteErrorCode(rt_ret, ERROR_MODULE_RUNTIME, JOBSUBSTATE_GRAPH_EXEC); - continue); - GELOGI("rtModelExecute end"); - GE_IF_BOOL_EXEC(model->is_first_execute_, GE_TIMESTAMP_EVENT_END(rtModelExecute, "GraphExcute::rtModelExecute")); - - GE_TIMESTAMP_START(rtStreamSynchronize); - GELOGI("rtStreamSynchronize start."); - rt_ret = rtStreamSynchronize(model->rt_model_stream_); - if (rt_ret == kEndOfSequence || rt_ret == kEndOfSequenceNew) { - seq_end_flag = true; - } - if (rt_ret == kModelAbortNormal || rt_ret == kModelAbortNormalNew) { - GELOGI("The model with multiple datasets aborts 
normally."); - } else { - GE_IF_BOOL_EXEC( - rt_ret != RT_ERROR_NONE, rslt_flg = false; GELOGI("seq_end_flg: %d", seq_end_flag); - (void)model->ReturnResult(current_data.index, false, seq_end_flag, - data_wrapper->GetOutput()); // [No need to check value] - CsaInteract::GetInstance().StoreInternalErrorCode(rt_ret, ERROR_MODULE_RUNTIME, JOBSUBSTATE_GRAPH_EXEC); - continue); - } - - GELOGI("rtStreamSynchronize end."); - GE_IF_BOOL_EXEC(model->is_first_execute_, - GE_TIMESTAMP_EVENT_END(rtStreamSynchronize, "GraphExcute::Wait for rtStreamSynchronize")); - GE_IF_BOOL_EXEC(ProfilingManager::Instance().ProfilingModelExecuteOn(), model->SetProfileTime(MODEL_INFER_END)); + GE_IF_BOOL_EXEC( + rt_ret != RT_ERROR_NONE, rslt_flg = false; GELOGI("seq_end_flg: %d", seq_end_flag); + (void)model->ReturnResult(current_data.index, false, seq_end_flag, + data_wrapper->GetOutput()); // [No need to check value] + CsaInteract::GetInstance().StoreInternalErrorCode(rt_ret, ERROR_MODULE_RUNTIME, JOBSUBSTATE_GRAPH_EXEC); + continue); } + GELOGI("rtStreamSynchronize end."); + GE_IF_BOOL_EXEC(model->is_first_execute_, + GE_TIMESTAMP_EVENT_END(rtStreamSynchronize, "GraphExcute::Wait for rtStreamSynchronize")); + GE_IF_BOOL_EXEC(ProfilingManager::Instance().ProfilingModelExecuteOn(), model->SetProfileTime(MODEL_INFER_END)); GE_IF_BOOL_EXEC(ProfilingManager::Instance().ProfilingModelExecuteOn(), model->SetProfileTime(MODEL_AFTER_PROC_START)); GE_TIMESTAMP_START(ReturnResult3); @@ -3170,21 +3136,29 @@ Status DavinciModel::DistributeTask() { const auto &model_task_def = ge_model_->GetModelTaskDefPtr(); for (size_t task_index = 0; task_index < task_list_.size(); ++task_index) { + auto &task_def = model_task_def->task(task_index); auto &task = task_list_.at(task_index); GE_CHK_STATUS_RET(task->Distribute(), "Task[%zu] distribute fail", task_index); // for data dump - auto op_index = std::max(model_task_def->task(task_index).kernel().context().op_index(), - 
model_task_def->task(task_index).kernel_ex().op_index()); + auto op_index = std::max(task_def.kernel().context().op_index(), + task_def.kernel_ex().op_index()); OpDescPtr op = GetOpByIndex(op_index); GE_CHECK_NOTNULL(op); - SaveDumpOpInfo(runtime_param_, op, task->GetTaskID(), task->GetStreamId()); if (reinterpret_cast(task->GetDumpArgs()) != nullptr) { bool call_dump = GetDumpProperties().IsLayerNeedDump(name_, om_name_, op->GetName()) && task->CallSaveDumpInfo(); if (call_dump || is_op_debug_reg_) { SaveDumpTask(task->GetTaskID(), task->GetStreamId(), op, task->GetDumpArgs()); } } + + auto task_type = static_cast(task_def.type()); + bool no_need_profiling = (task_type != RT_MODEL_TASK_KERNEL) + && (task_type != RT_MODEL_TASK_KERNEL_EX) + && (task_type != RT_MODEL_TASK_HCCL); + GE_IF_BOOL_EXEC(no_need_profiling, continue); + + SaveDumpOpInfo(runtime_param_, op, task->GetTaskID(), task->GetStreamId()); // Load task info for profiling TaskDescInfo task_desc_info; if (!om_name_.empty()) { @@ -3193,7 +3167,7 @@ Status DavinciModel::DistributeTask() { task_desc_info.model_name = name_; } task_desc_info.op_name = op->GetName(); - task_desc_info.block_dim = model_task_def->task(task_index).kernel().block_dim(); + task_desc_info.block_dim = task_def.kernel().block_dim(); task_desc_info.task_id = task->GetTaskID(); task_desc_info.stream_id = task->GetStreamId(); task_desc_info_.emplace_back(task_desc_info); @@ -3391,14 +3365,14 @@ bool DavinciModel::CheckInputAndModelSize(const int64_t &input_size, const int64 /// Status DavinciModel::CopyModelData(const InputData &input_data, OutputData &output_data, bool is_dynamic) { if (UpdateIoTaskArgs(new_input_data_info_, true, input_data.blobs, is_dynamic, input_data.batch_label) != SUCCESS) { - GELOGE(PARAM_INVALID, "[ZCPY] Update input data to model failed."); - return PARAM_INVALID; + GELOGE(ACL_ERROR_GE_PARAM_INVALID, "[ZCPY] Update input data to model failed."); + return ACL_ERROR_GE_PARAM_INVALID; } if 
(UpdateIoTaskArgs(new_output_data_info_, false, output_data.blobs, is_dynamic, input_data.batch_label) != SUCCESS) { - GELOGE(PARAM_INVALID, "[ZCPY] Update output data to model failed."); - return PARAM_INVALID; + GELOGE(ACL_ERROR_GE_PARAM_INVALID, "[ZCPY] Update output data to model failed."); + return ACL_ERROR_GE_PARAM_INVALID; } for (ZeroCopyTask &task : zero_copy_tasks_) { @@ -3444,7 +3418,7 @@ Status DavinciModel::UpdateIoTaskArgs(const std::map & } if (!CheckInputAndModelSize(buffer.length, data.second.GetDataSize(), is_dynamic)) { - GELOGE(FAILED, "Check input size and model size failed"); + GELOGE(FAILED, "Check input size and model size failed, op[%s]", data.second.GetOpName().c_str()); return FAILED; } @@ -3861,7 +3835,8 @@ Status DavinciModel::NnExecute(rtStream_t stream, bool async_mode, const InputDa if (!is_async_mode_) { GE_IF_BOOL_EXEC(ProfilingManager::Instance().ProfilingModelExecuteOn(), SetProfileTime(MODEL_AFTER_PROC_START)); ret = CopyOutputData(input_data.index, output_data, RT_MEMCPY_DEVICE_TO_DEVICE); - GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(ret != SUCCESS, return ret, "Copy Output data to user failed."); + GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(ret != SUCCESS, return ACL_ERROR_GE_INTERNAL_ERROR, + "Copy Output data to user failed."); GE_IF_BOOL_EXEC(ProfilingManager::Instance().ProfilingModelExecuteOn(), SetProfileTime(MODEL_AFTER_PROC_END)); } @@ -4061,7 +4036,7 @@ void DavinciModel::SetDataDumperArgs(const ComputeGraphPtr &compute_graph) { data_dumper_.SetDeviceId(device_id); // set loop count addr - auto get_var_addr = [](const OpDescPtr &op, const RuntimeParam &runtime_param) -> void * { + auto get_var_addr = [](const OpDescPtr &op, const RuntimeParam &runtime_param) -> void *{ if (op != nullptr) { auto v_output_size = ModelUtils::GetOutputSize(op); auto v_output_addr = ModelUtils::GetOutputDataAddrs(runtime_param, op); diff --git a/ge/graph/load/new_model_manager/davinci_model.h b/ge/graph/load/new_model_manager/davinci_model.h index 
650f19eb..19888e1f 100755 --- a/ge/graph/load/new_model_manager/davinci_model.h +++ b/ge/graph/load/new_model_manager/davinci_model.h @@ -440,7 +440,7 @@ class DavinciModel { Status SinkTimeProfile(const InputData ¤t_data); - Status ReportProfilingData(bool check_device = true); + Status ReportProfilingData(); void SaveDumpOpInfo(const RuntimeParam &model_param, const OpDescPtr &op, uint32_t task_id, uint32_t stream_id) { data_dumper_.SaveDumpOpInfo(model_param, op, task_id, stream_id); diff --git a/ge/graph/load/new_model_manager/model_manager.cc b/ge/graph/load/new_model_manager/model_manager.cc index d6cdf42d..b595ac39 100755 --- a/ge/graph/load/new_model_manager/model_manager.cc +++ b/ge/graph/load/new_model_manager/model_manager.cc @@ -40,9 +40,7 @@ const int kCmdParSize = 2; const int kDumpCmdPairSize = 2; const std::size_t kProfCmdParaMaxSize = 1000; const std::size_t kProfStartCmdParaSize = 2; -const std::string kCmdTypeProfile = "profile"; const std::string kCmdTypeDump = "dump"; -const std::string kCmdTypeProfiling = "profiling"; const std::string kCmdTypeProfInit = "prof_init"; const std::string kCmdTypeProfFinalize = "prof_finalize"; const std::string kCmdTypeProfStart = "prof_start"; @@ -51,6 +49,9 @@ const std::string kCmdTypeProfModelSubscribe = "prof_model_subscribe"; const std::string kCmdTypeProfModelUnsubscribe = "prof_model_cancel_subscribe"; const char *const kBatchLoadBuf = "batchLoadsoFrombuf"; const char *const kDeleteCustOp = "deleteCustOp"; +const int kTimeSpecNano = 1000000000; +const int kTimeSpecMiro = 1000000; +const int kSessionMaxBias = 100; struct CustAicpuSoBuf { uint64_t kernelSoBuf; uint32_t kernelSoBufLen; @@ -224,7 +225,7 @@ ge::Status ModelManager::DestroyAicpuSessionForInfer(uint32_t model_id) { ge::Status ModelManager::DestroyAicpuKernel(uint64_t session_id, uint32_t model_id) { GELOGD("destroy aicpu kernel in session_id %lu, model_id %u.", session_id, model_id); - std::lock_guard lock(sess_ids_mutex_); + std::lock_guard 
lock(map_mutex_); std::string model_key = std::to_string(session_id) + "_" + std::to_string(model_id); if (model_aicpu_kernel_.find(model_key) != model_aicpu_kernel_.end()) { Status ret = KernelLaunchEx(aicpu::FWKAdapter::FWKOperateType::FWK_ADPT_KERNEL_DESTROY, session_id, model_id); @@ -237,7 +238,7 @@ ge::Status ModelManager::DestroyAicpuKernel(uint64_t session_id, uint32_t model_ } ge::Status ModelManager::CreateAicpuKernel(uint64_t session_id, uint32_t model_id, uint64_t kernel_id) { - std::lock_guard lock(sess_ids_mutex_); + std::lock_guard lock(map_mutex_); std::vector v_aicpu_kernel; std::string model_key = std::to_string(session_id) + "_" + std::to_string(model_id); if (model_aicpu_kernel_.find(model_key) != model_aicpu_kernel_.end()) { @@ -345,7 +346,7 @@ Status ModelManager::LoadModelOnline(uint32_t &model_id, const shared_ptrSetProfileTime(MODEL_LOAD_START, (timespec.tv_sec * 1000 * 1000 * 1000 + + davinci_model->SetProfileTime(MODEL_LOAD_START, (timespec.tv_sec * kTimeSpecNano + timespec.tv_nsec)); // 1000 ^ 3 converts second to nanosecond davinci_model->SetProfileTime(MODEL_LOAD_END); } while (0); @@ -629,8 +630,7 @@ Status ModelManager::Stop(uint32_t model_id) { /// Status ModelManager::HandleCommand(const Command &command) { static const std::map> cmds = { - {kCmdTypeProfile, HandleProfileCommand}, {kCmdTypeDump, HandleDumpCommand}, - {kCmdTypeProfiling, HandleAclProfilingCommand}, {kCmdTypeProfInit, HandleProfInitCommand}, + {kCmdTypeDump, HandleDumpCommand}, {kCmdTypeProfInit, HandleProfInitCommand}, {kCmdTypeProfFinalize, HandleProfFinalizeCommand}, {kCmdTypeProfStart, HandleProfStartCommand}, {kCmdTypeProfStop, HandleProfStopCommand}, {kCmdTypeProfModelSubscribe, HandleProfModelSubscribeCommand}, @@ -645,21 +645,6 @@ Status ModelManager::HandleCommand(const Command &command) { } } -Status ModelManager::HandleAclProfilingCommand(const Command &command) { - if (command.cmd_params.size() < kCmdParSize) { - GELOGE(PARAM_INVALID, "When the cmd_type 
is 'profiling', the size of cmd_params must larger than 2."); - return PARAM_INVALID; - } - - std::string map_key = command.cmd_params[0]; - std::string value = command.cmd_params[1]; - if (map_key == PROFILE_CONFIG) { - ProfilingManager::Instance().SetProfilingConfig(value); - } - - return SUCCESS; -} - Status ModelManager::GetModelByCmd(const Command &command, std::shared_ptr &davinci_model) { if (command.cmd_params.size() < kCmdParSize) { @@ -806,29 +791,6 @@ Status ModelManager::HandleProfStopCommand(const Command &command) { return SUCCESS; } -Status ModelManager::HandleProfileCommand(const Command &command) { - if (command.cmd_params.size() < kCmdParSize) { - GELOGE(PARAM_INVALID, "When the cmd_type is 'profile', the size of cmd_params must larger than 2."); - return PARAM_INVALID; - } - - std::string map_key = command.cmd_params[0]; - std::string value = command.cmd_params[1]; - - GELOGI("Profiling mode, Command key:%s , value:%s ", map_key.c_str(), value.c_str()); - - auto iter = PROFILE_COMPONENT_MAP.find(map_key); - if (iter != PROFILE_COMPONENT_MAP.end()) { - std::string property_value = (value == "on") ? 
"1" : "0"; - PropertiesManager::Instance().SetPropertyValue(iter->second, property_value); - } - - if ((map_key == PROFILER_JOBCTX || map_key == PROFILER_TARGET_PATH || map_key == RTS_PROFILE_PATH)) { - PropertiesManager::Instance().SetPropertyValue(map_key, value); - } - return SUCCESS; -} - static Status ParserPara(const Command &command, const string &dump_key, string &dump_value) { auto iter = std::find(command.cmd_params.begin(), command.cmd_params.end(), dump_key); if (iter != command.cmd_params.end()) { @@ -1072,12 +1034,12 @@ Status ModelManager::GenSessionId(uint64_t &session_id) { GELOGE(INTERNAL_ERROR, "Failed to get current time."); return INTERNAL_ERROR; } - session_id = static_cast(tv.tv_sec * 1000000 + tv.tv_usec); // 1000000us + session_id = static_cast(tv.tv_sec * kTimeSpecMiro + tv.tv_usec); // 1000000us session_id_bias_++; // max bais 100. - session_id_bias_ = session_id_bias_ % 100; - session_id = session_id * 100 + session_id_bias_; + session_id_bias_ = session_id_bias_ % kSessionMaxBias; + session_id = session_id * kSessionMaxBias + session_id_bias_; GELOGD("Generate new session id: %lu.", session_id); return SUCCESS; @@ -1086,8 +1048,7 @@ Status ModelManager::GenSessionId(uint64_t &session_id) { Status ModelManager::LoadModelOffline(uint32_t &model_id, const ModelData &model, shared_ptr listener, void *dev_ptr, size_t mem_size, void *weight_ptr, size_t weight_size) { GE_CHK_BOOL_RET_STATUS(model.key.empty() || mmAccess2(model.key.c_str(), M_F_OK) == EN_OK, - ACL_ERROR_GE_PARAM_INVALID, - "input key file path %s is invalid, %s", model.key.c_str(), strerror(errno)); + ACL_ERROR_GE_PARAM_INVALID, "input key file path %s is invalid, %s", model.key.c_str(), strerror(errno)); GenModelId(&model_id); shared_ptr davinci_model = nullptr; @@ -1148,7 +1109,7 @@ Status ModelManager::LoadModelOffline(uint32_t &model_id, const ModelData &model GELOGI("Parse model %u success.", model_id); - davinci_model->SetProfileTime(MODEL_LOAD_START, (timespec.tv_sec * 
1000 * 1000 * 1000 + + davinci_model->SetProfileTime(MODEL_LOAD_START, (timespec.tv_sec * kTimeSpecNano + timespec.tv_nsec)); // 1000 ^ 3 converts second to nanosecond davinci_model->SetProfileTime(MODEL_LOAD_END); @@ -1252,7 +1213,8 @@ Status ModelManager::ExecuteModel(uint32_t model_id, rtStream_t stream, bool asy } std::shared_ptr davinci_model = GetModel(model_id); - GE_CHK_BOOL_RET_STATUS(davinci_model != nullptr, PARAM_INVALID, "Invalid model id %u.", model_id); + GE_CHK_BOOL_RET_STATUS(davinci_model != nullptr, ACL_ERROR_GE_EXEC_MODEL_ID_INVALID, + "Invalid model id %u, check weather model has been loaded or not.", model_id); if (davinci_model->NeedDestroyAicpuKernel()) { GELOGI("Start to destroy specified aicpu kernel."); @@ -1289,8 +1251,8 @@ Status ModelManager::CreateAicpuSession(uint64_t session_id) { return SUCCESS; } -Status ModelManager::LoadCustAicpuSo(const OpDescPtr &op_desc, const string &so_name) { - GELOGI("LoadCustAicpuSo in, op name %s, so name %s", op_desc->GetName().c_str(), so_name.c_str()); +Status ModelManager::LoadCustAicpuSo(const OpDescPtr &op_desc, const string &so_name, bool &loaded) { + GELOGD("LoadCustAicpuSo in, op name %s, so name %s", op_desc->GetName().c_str(), so_name.c_str()); std::lock_guard lock(cust_aicpu_mutex_); CustAICPUKernelPtr aicpu_kernel = op_desc->TryGetExtAttr(OP_EXTATTR_CUSTAICPU_KERNEL, CustAICPUKernelPtr()); if (aicpu_kernel == nullptr) { @@ -1313,18 +1275,24 @@ Status ModelManager::LoadCustAicpuSo(const OpDescPtr &op_desc, const string &so_ std::map new_so_name; new_so_name.insert({so_name, aicpu_kernel}); cust_aicpu_so_[resource_id] = new_so_name; - GELOGI("LoadCustAicpuSo new aicpu so resource id %lu", resource_id); + loaded = false; + GELOGD("LoadCustAicpuSo new aicpu so name %s, resource id %lu", so_name.c_str(), resource_id); return SUCCESS; } auto it_so_name = it->second.find(so_name); if (it_so_name == it->second.end()) { it->second.insert({so_name, aicpu_kernel}); - GELOGI("LoadCustAicpuSo add aicpu 
so resource id %lu", resource_id); + loaded = false; + GELOGD("LoadCustAicpuSo add aicpu so name %s, resource id %lu", so_name.c_str(), resource_id); + return SUCCESS; } + loaded = true; + GELOGD("LoadCustAicpuSo so name %s has been loaded.", so_name.c_str()); return SUCCESS; } Status ModelManager::LaunchKernelCustAicpuSo(const string &kernel_name) { + GELOGD("Aicpu kernel launch task in, kernel name %s.", kernel_name.c_str()); std::lock_guard lock(cust_aicpu_mutex_); if (cust_aicpu_so_.size() == 0) return SUCCESS; // get current context diff --git a/ge/graph/load/new_model_manager/model_manager.h b/ge/graph/load/new_model_manager/model_manager.h index e3780d5b..fc98d9c2 100755 --- a/ge/graph/load/new_model_manager/model_manager.h +++ b/ge/graph/load/new_model_manager/model_manager.h @@ -169,8 +169,6 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ModelManager { /// @brief comment handle function /// ge::Status HandleCommand(const Command &command); - static ge::Status HandleAclProfilingCommand(const Command &command); - static ge::Status HandleProfileCommand(const Command &command); static ge::Status HandleDumpCommand(const Command &command); static ge::Status HandleProfModelSubscribeCommand(const Command &command); static ge::Status HandleProfModelUnsubscribeCommand(const Command &command); @@ -289,7 +287,7 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ModelManager { ge::Status DestroyAicpuSessionForInfer(uint32_t model_id); - ge::Status LoadCustAicpuSo(const OpDescPtr &op_desc, const string &so_name); + ge::Status LoadCustAicpuSo(const OpDescPtr &op_desc, const string &so_name, bool &loaded); ge::Status LaunchCustAicpuSo(); diff --git a/ge/graph/load/new_model_manager/model_utils.cc b/ge/graph/load/new_model_manager/model_utils.cc index 34fb7ff3..22a657ad 100755 --- a/ge/graph/load/new_model_manager/model_utils.cc +++ b/ge/graph/load/new_model_manager/model_utils.cc @@ -61,7 +61,7 @@ vector ModelUtils::GetInputSize(ConstOpDescPtr op_desc) { 
GELOGI("Get size from TensorDesc failed, op : %s, input index : %zu", op_desc->GetName().c_str(), i); continue); - GELOGI("[IMAS]GetInputSize op: %s, index: %zu, size:%ld", op_desc->GetName().c_str(), i, tensor_size); + GELOGI("GetInputSize op: %s, index: %zu, size:%ld", op_desc->GetName().c_str(), i, tensor_size); v_input_size.push_back(tensor_size); } @@ -96,7 +96,7 @@ vector ModelUtils::GetOutputSize(ConstOpDescPtr op_desc) { GELOGI("Get size from TensorDesc failed, op : %s, output index : %zu", op_desc->GetName().c_str(), i); continue); - GELOGI("[IMAS]GetOutputSize op: %s, index: %zu, size:%ld", op_desc->GetName().c_str(), i, tensor_size); + GELOGI("GetOutputSize op: %s, index: %zu, size:%ld", op_desc->GetName().c_str(), i, tensor_size); v_output_size.push_back(tensor_size); } diff --git a/ge/graph/load/new_model_manager/task_info/hccl_task_info.cc b/ge/graph/load/new_model_manager/task_info/hccl_task_info.cc index b09a4fce..4fb64aab 100644 --- a/ge/graph/load/new_model_manager/task_info/hccl_task_info.cc +++ b/ge/graph/load/new_model_manager/task_info/hccl_task_info.cc @@ -279,9 +279,10 @@ Status HcclTaskInfo::SetAddrs(const std::shared_ptr &op_desc, output_data_addr = output_data_addrs_.empty() ? 
nullptr : output_data_addrs_[i]; } kernel_hccl_infos[i].inputDataAddr = input_data_addr; - if (hccl_type == HCOMALLGATHER || hccl_type == HCOMRECEIVE || hccl_type == HVDCALLBACKALLGATHER || hccl_type == HCOMREDUCE) { + if (hccl_type == HCOMALLGATHER || hccl_type == HCOMRECEIVE || hccl_type == HVDCALLBACKALLGATHER) { kernel_hccl_infos[i].outputDataAddr = output_data_addr; - } else if (hccl_type == HCOMALLREDUCE || hccl_type == HCOMREDUCESCATTER || hccl_type == HVDCALLBACKALLREDUCE) { + } else if (hccl_type == HCOMALLREDUCE || + hccl_type == HCOMREDUCESCATTER || hccl_type == HVDCALLBACKALLREDUCE || hccl_type == HCOMREDUCE) { GE_CHK_STATUS_RET(HcomOmeUtil::GetHcclOperationType(op_desc, op_type), "davinci_model: GetHcomOperationType fail!"); kernel_hccl_infos[i].outputDataAddr = output_data_addr; diff --git a/ge/graph/load/new_model_manager/task_info/kernel_task_info.cc b/ge/graph/load/new_model_manager/task_info/kernel_task_info.cc index 04607c02..74faeb24 100755 --- a/ge/graph/load/new_model_manager/task_info/kernel_task_info.cc +++ b/ge/graph/load/new_model_manager/task_info/kernel_task_info.cc @@ -43,6 +43,13 @@ const char *kIsLastNode = "is_last_node"; const char *kIsFirstNode = "is_first_node"; const int64_t kCloseSkt = 100; const uint32_t kAddrLen = sizeof(void *); +const int kBaseInt = 10; +const int kStrtolFail = 0; +const int kArgsInputDesc = 0; +const int kArgsInputAddr = 1; +const int kArgsOutputDesc = 2; +const int kArgsOutputAddr = 3; +const int kArgsAttrHandle = 4; } // namespace namespace ge { @@ -66,7 +73,7 @@ Status KernelTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davinci // get opcontext stored in model const domi::KernelContext &context = kernel_def.context(); // get kernel_type - kernel_type_ = static_cast(context.kernel_type()); + kernel_type_ = static_cast(context.kernel_type()); // get opdesc op_desc_ = davinci_model_->GetOpByIndex(context.op_index()); GE_CHECK_NOTNULL(op_desc_); @@ -88,13 +95,13 @@ Status 
KernelTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davinci // get bin_file_key const char *bin_file_key = davinci_model_->GetRegisterStub(op_desc_->GetName(), session_graph_model_id); // new aicpu kernel(rtCpuKernelLaunch) no need to check function - if (kernel_type_ == cce::ccKernelType::CCE_AI_CORE) { + if (kernel_type_ == ccKernelType::CCE_AI_CORE) { rtError_t rt_ret; rt_ret = rtGetFunctionByName(const_cast(kernel_def.stub_func().c_str()), &stub_func_); GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(RT_FAILED, "execute rtGetFunctionByName failed. stub_func: %s", kernel_def.stub_func().c_str()); return RT_ERROR_TO_GE_STATUS(rt_ret);); - } else if (kernel_type_ == cce::ccKernelType::TE) { + } else if (kernel_type_ == ccKernelType::TE) { rtError_t rt_ret; rt_ret = rtGetFunctionByName(bin_file_key, &stub_func_); GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, @@ -111,7 +118,7 @@ Status KernelTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davinci ctx_.opIndex2[i] = context.origin_op_index(i); } ctx_.opCount = context.origin_op_index_size(); - if (kernel_type_ == cce::ccKernelType::TE) { + if (kernel_type_ == ccKernelType::TE) { ctx_.opIndex = context.op_index(); uint16_t *args_offset_tmp = reinterpret_cast(const_cast(context.args_offset().data())); if (context.args_offset().size() / sizeof(uint16_t) < 1) { @@ -120,9 +127,9 @@ Status KernelTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davinci } ret = InitTVMTask(args_offset_tmp[0], kernel_def); - } else if (kernel_type_ == cce::ccKernelType::CUSTOMIZED) { + } else if (kernel_type_ == ccKernelType::CUSTOMIZED) { ret = InitAICPUCustomTask(context.op_index(), kernel_def); - } else if (kernel_type_ == cce::ccKernelType::AI_CPU || kernel_type_ == cce::ccKernelType::CUST_AI_CPU) { + } else if (kernel_type_ == ccKernelType::AI_CPU || kernel_type_ == ccKernelType::CUST_AI_CPU) { ret = InitAicpuTask(context.op_index(), kernel_def); } else { if (kernel_def.args().empty() || args_size_ == 0) { 
@@ -371,9 +378,9 @@ Status KernelTaskInfo::Distribute() { rtError_t rt_ret = RT_ERROR_NONE; char skt_enable_env[MMPA_MAX_PATH] = { 0x00 }; INT32 res = mmGetEnv("SKT_ENABLE", skt_enable_env, MMPA_MAX_PATH); - int64_t env_flag = (res == EN_OK) ? strtol(skt_enable_env, nullptr, 10) : 0; + int64_t env_flag = (res == EN_OK) ? strtol(skt_enable_env, nullptr, kBaseInt) : kStrtolFail; bool call_skt = ((env_flag != 0) || is_l1_fusion_enable_); - if (kernel_type_ == cce::ccKernelType::AI_CPU || kernel_type_ == cce::ccKernelType::CUST_AI_CPU) { + if (kernel_type_ == ccKernelType::AI_CPU || kernel_type_ == ccKernelType::CUST_AI_CPU) { GELOGI("distribute task info kernel_type %d, flag %d", kernel_type_, dump_flag_); // blockDim is reserved parameter, set to 1 rt_ret = rtCpuKernelLaunchWithFlag(reinterpret_cast(so_name_.c_str()), @@ -749,15 +756,15 @@ Status KernelTaskInfo::InitAICPUCustomTask(uint32_t op_index, const domi::Kernel return FAILED; } } - *(reinterpret_cast(args + ctx_.argsOffset[0])) = + *(reinterpret_cast(args + ctx_.argsOffset[kArgsInputDesc])) = static_cast(reinterpret_cast(custom_info_.input_descs)); // arg 0 - *(reinterpret_cast(args + ctx_.argsOffset[1])) = + *(reinterpret_cast(args + ctx_.argsOffset[kArgsInputAddr])) = static_cast(reinterpret_cast(custom_info_.input_addrs)); // arg 1 - *(reinterpret_cast(args + ctx_.argsOffset[2])) = + *(reinterpret_cast(args + ctx_.argsOffset[kArgsOutputDesc])) = static_cast(reinterpret_cast(custom_info_.output_descs)); // arg 2 - *(reinterpret_cast(args + ctx_.argsOffset[3])) = + *(reinterpret_cast(args + ctx_.argsOffset[kArgsOutputAddr])) = static_cast(reinterpret_cast(custom_info_.output_addrs)); // arg 3 - *(reinterpret_cast(args + ctx_.argsOffset[4])) = + *(reinterpret_cast(args + ctx_.argsOffset[kArgsAttrHandle])) = static_cast(reinterpret_cast(custom_info_.attr_handle)); // arg 4 rt_ret = rtMalloc(&args_, args_size_, RT_MEMORY_HBM); @@ -874,8 +881,10 @@ Status KernelTaskInfo::InitAicpuTask(uint32_t op_index, const 
domi::KernelDef &k return INTERNAL_ERROR; } - if (kernel_type_ == cce::ccKernelType::CUST_AI_CPU) { - GE_CHK_STATUS_RET(ModelManager::GetInstance()->LoadCustAicpuSo(op_desc, so_name_), "launch cust aicpu so failed"); + if (kernel_type_ == ccKernelType::CUST_AI_CPU) { + bool loaded = false; + GE_CHK_STATUS_RET(ModelManager::GetInstance()->LoadCustAicpuSo(op_desc, so_name_, loaded), + "launch cust aicpu so failed"); } // copy args to new host memory @@ -946,7 +955,7 @@ Status KernelTaskInfo::InitAicpuTask(uint32_t op_index, const domi::KernelDef &k GELOGI("Op debug is open in aicpu task info"); dump_args_ = static_cast(args_) + sizeof(aicpu::AicpuParamHead); } - if (kernel_type_ == cce::ccKernelType::CUST_AI_CPU) { + if (kernel_type_ == ccKernelType::CUST_AI_CPU) { dump_flag_ |= RT_KERNEL_CUSTOM_AICPU; } @@ -1076,7 +1085,7 @@ Status KernelTaskInfo::StoreInputOutputTensor(const std::vector &input_d Status KernelTaskInfo::SetContext(const domi::KernelDef &kernel_def) { const domi::KernelContext &context = kernel_def.context(); - ctx_.kernelType = static_cast(context.kernel_type()); + ctx_.kernelType = static_cast(context.kernel_type()); ctx_.opId = context.op_id(); ctx_.kernelFuncId = context.kernel_func_id(); ctx_.isFlowtable = context.is_flowtable(); @@ -1161,10 +1170,10 @@ Status KernelTaskInfo::CceUpdateKernelArgs(const domi::KernelContext &context, u GELOGE(GE_PLGMGR_SO_NOT_EXIST, "Failed in dlopen %s! 
", error); return FAILED; } - cce::ccStatus_t cc_ret; + ccStatus_t cc_ret; std::string update_kernel_args = "ccUpdateKernelArgs"; - auto cceUpdateKernelArgs = (cce::ccStatus_t(*)(cce::ccOpContext &, uint64_t, uint64_t, uint64_t, void *, uint64_t, - void *))mmDlsym(handle, const_cast(update_kernel_args.c_str())); + auto cceUpdateKernelArgs = (ccStatus_t(*)(ccOpContext &, uint64_t, uint64_t, + uint64_t, void *, uint64_t, void *))mmDlsym(handle, const_cast(update_kernel_args.c_str())); if (cceUpdateKernelArgs == nullptr) { GELOGE(FAILED, "Failed to invoke function ccUpdateKernelArgs"); if (mmDlclose(handle) != 0) { @@ -1189,7 +1198,7 @@ Status KernelTaskInfo::CceUpdateKernelArgs(const domi::KernelContext &context, u GELOGW("Failed to close handle %s", error); return FAILED; } - if (cc_ret != cce::CC_STATUS_SUCCESS) { + if (cc_ret != CC_STATUS_SUCCESS) { GELOGE(CCE_FAILED, "Call cce api failed, ret: 0x%X", cc_ret); return CCE_FAILED; } diff --git a/ge/graph/load/new_model_manager/task_info/kernel_task_info.h b/ge/graph/load/new_model_manager/task_info/kernel_task_info.h index f2945b0b..1f90ede1 100644 --- a/ge/graph/load/new_model_manager/task_info/kernel_task_info.h +++ b/ge/graph/load/new_model_manager/task_info/kernel_task_info.h @@ -43,7 +43,7 @@ class KernelTaskInfo : public TaskInfo { stream_id_(0), so_name_(""), kernel_name_(""), - kernel_type_(cce::ccKernelType::CCE_AI_CORE), + kernel_type_(ccKernelType::CCE_AI_CORE), dump_flag_(RT_KERNEL_DEFAULT), dump_args_(nullptr), op_desc_(nullptr), @@ -75,7 +75,7 @@ class KernelTaskInfo : public TaskInfo { Status Release() override; - cce::ccOpContext *GetCtx() override { return &ctx_; } + ccOpContext *GetCtx() override { return &ctx_; } FusionOpInfo *GetFusionOpInfo() override { return &fusion_op_info_; } @@ -92,7 +92,7 @@ class KernelTaskInfo : public TaskInfo { bool CallSaveDumpInfo() override { return call_save_dump_; }; - cce::ccOpContext ctx_; + ccOpContext ctx_; FusionOpInfo fusion_op_info_; private: @@ -153,7 
+153,7 @@ class KernelTaskInfo : public TaskInfo { uint32_t stream_id_; std::string so_name_; std::string kernel_name_; - cce::ccKernelType kernel_type_; + ccKernelType kernel_type_; uint32_t dump_flag_; void *dump_args_; OpDescPtr op_desc_; diff --git a/ge/graph/load/new_model_manager/task_info/stream_switch_task_info.h b/ge/graph/load/new_model_manager/task_info/stream_switch_task_info.h index 89642cf8..a72d7de2 100755 --- a/ge/graph/load/new_model_manager/task_info/stream_switch_task_info.h +++ b/ge/graph/load/new_model_manager/task_info/stream_switch_task_info.h @@ -41,7 +41,7 @@ class StreamSwitchTaskInfo : public TaskInfo { Status CalculateArgs(const domi::TaskDef &task_def, DavinciModel *davinci_model) override; private: - void SetInputAndValuePtr(DavinciModel *davinci_model, const vector &input_data_addrs); + void SetInputAndValuePtr(DavinciModel *davinci_model, const std::vector &input_data_addrs); void *input_ptr_; rtCondition_t cond_; void *value_ptr_; @@ -49,7 +49,7 @@ class StreamSwitchTaskInfo : public TaskInfo { uint32_t true_stream_id_; rtSwitchDataType_t data_type_; static const uint32_t kInputNum = 2; - vector fixed_addr_offset_; + std::vector fixed_addr_offset_; }; } // namespace ge #endif // GE_GRAPH_LOAD_NEW_MODEL_MANAGER_TASK_INFO_STREAM_SWITCH_TASK_INFO_H_ diff --git a/ge/graph/load/new_model_manager/task_info/super_kernel/super_kernel.cc b/ge/graph/load/new_model_manager/task_info/super_kernel/super_kernel.cc index 63f29f84..65dca3b3 100644 --- a/ge/graph/load/new_model_manager/task_info/super_kernel/super_kernel.cc +++ b/ge/graph/load/new_model_manager/task_info/super_kernel/super_kernel.cc @@ -25,10 +25,11 @@ Status SuperKernel::Launch(rtStream_t stream, uint32_t dump_flag) { const void *args[] = {this->GetNavTablePtr(), reinterpret_cast(static_cast(this->GetNavTableSize()))}; - rtError_t rt_ret = rtMalloc((void **)&(device_args_addr_), sizeof(args), RT_MEMORY_HBM); - GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(RT_FAILED, "rtMalloc 
failied. error: 0x%X", rt_ret); return - RT_ERROR_TO_GE_STATUS(rt_ret);) - rt_ret = rtMemcpy((void *)device_args_addr_, sizeof(args), (void *)args, sizeof(args), RT_MEMCPY_HOST_TO_DEVICE); + rtError_t rt_ret = rtMalloc(reinterpret_cast(&device_args_addr_), sizeof(args), RT_MEMORY_HBM); + GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(RT_FAILED, "rtMalloc failied. error: 0x%X", rt_ret); + return RT_ERROR_TO_GE_STATUS(rt_ret);) + rt_ret = rtMemcpy(reinterpret_cast(device_args_addr_), sizeof(args), reinterpret_cast(args), + sizeof(args), RT_MEMCPY_HOST_TO_DEVICE); GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(RT_FAILED, "rtMemcpy failied. error: 0x%X", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret);) rt_ret = rtKernelLaunchWithFlag((void *const)func_stub_, block_dim_, device_args_addr_, sizeof(args), NULL, stream, diff --git a/ge/graph/load/new_model_manager/task_info/super_kernel/super_kernel_factory.cc b/ge/graph/load/new_model_manager/task_info/super_kernel/super_kernel_factory.cc index 69f7b159..4e22cd7c 100644 --- a/ge/graph/load/new_model_manager/task_info/super_kernel/super_kernel_factory.cc +++ b/ge/graph/load/new_model_manager/task_info/super_kernel/super_kernel_factory.cc @@ -19,6 +19,8 @@ namespace ge { namespace skt { +const size_t kFusedKernelMinimumSize = 2; +const size_t kFusedKernelSizeUnit = 2; SuperKernelFactory &SuperKernelFactory::GetInstance() { static SuperKernelFactory factory; return factory; @@ -79,17 +81,17 @@ Status SuperKernelFactory::FuseKernels(const std::vector &stub_func_list return FAILED; } - if (super_kernel_size < 2) { + if (super_kernel_size < kFusedKernelMinimumSize) { GELOGW( "SKT: the number of kernels being fused must be greater than or " "equal to 2"); return FAILED; } GELOGI("SKT: superkernel start fuse, superkernel size %zu.", stub_func_list.size()); - const size_t nav_table_len = 2 * stub_func_list.size(); + const size_t nav_table_len = kFusedKernelSizeUnit * stub_func_list.size(); std::unique_ptr 
nav_table(new(std::nothrow) uint64_t[nav_table_len]); GE_CHECK_NOTNULL(nav_table); - uint64_t nav_table_size = 2 * stub_func_list.size() * sizeof(int64_t); + uint64_t nav_table_size = kFusedKernelSizeUnit * stub_func_list.size() * sizeof(int64_t); rtError_t rt_ret; void *hbm_nav_table_addr = nullptr; @@ -101,21 +103,21 @@ Status SuperKernelFactory::FuseKernels(const std::vector &stub_func_list GELOGD("SKT: fuseKernels subFunc %p, device func address %p", stub_func_list[i], sub_device_func); // store two uint64_t address // address divided by 4 because of 32bits encoding, call offset will *4 when calculating - nav_table[i * 2] = static_cast(reinterpret_cast(sub_device_func)) / 4; - GELOGD("SKT: CALL offet %lu", nav_table[i * 2]); - nav_table[i * 2 + 1] = static_cast(reinterpret_cast(args_addr_list[i])); - GELOGD("SKT: fuseKernels args base address %lu", nav_table[i * 2 + 1]); + nav_table[i * kFusedKernelSizeUnit] = static_cast(reinterpret_cast(sub_device_func)) / 4; + GELOGD("SKT: CALL offet %lu", nav_table[i * kFusedKernelSizeUnit]); + nav_table[i * kFusedKernelSizeUnit + 1] = static_cast(reinterpret_cast(args_addr_list[i])); + GELOGD("SKT: fuseKernels args base address %lu", nav_table[i * kFusedKernelSizeUnit + 1]); } - rt_ret = rtMalloc((void **)&hbm_nav_table_addr, nav_table_size, RT_MEMORY_HBM); + rt_ret = rtMalloc(reinterpret_cast(&hbm_nav_table_addr), nav_table_size, RT_MEMORY_HBM); GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(RT_FAILED, "rtMalloc failed. error: 0x%X", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret);) - rt_ret = - rtMemcpy((void *)hbm_nav_table_addr, nav_table_size, (void *)nav_table.get(), nav_table_size, RT_MEMCPY_HOST_TO_DEVICE); + rt_ret = rtMemcpy(reinterpret_cast(hbm_nav_table_addr), nav_table_size, + reinterpret_cast(nav_table.get()), nav_table_size, RT_MEMCPY_HOST_TO_DEVICE); GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(RT_FAILED, "rtMemcpy failed. 
error: 0x%X", rt_ret); GE_CHK_RT(rtFree(hbm_nav_table_addr)); return RT_ERROR_TO_GE_STATUS(rt_ret);) // Create the necessary metadata for the super kernel - h = std::unique_ptr( - new SuperKernel(this->func_stub_, hbm_nav_table_addr, nav_table_size, block_dim)); + h = + std::unique_ptr(new SuperKernel(this->func_stub_, hbm_nav_table_addr, nav_table_size, block_dim)); return SUCCESS; } } // namespace skt diff --git a/ge/graph/load/new_model_manager/task_info/task_info.h b/ge/graph/load/new_model_manager/task_info/task_info.h index d296d29e..26f22564 100644 --- a/ge/graph/load/new_model_manager/task_info/task_info.h +++ b/ge/graph/load/new_model_manager/task_info/task_info.h @@ -20,7 +20,7 @@ #include #include "cce/customize.h" -#include "cce/taskdown_common.hpp" +#include "framework/common/taskdown_common.h" #include "framework/common/ge_inner_error_codes.h" #include "graph/load/new_model_manager/ts_mem_mall.h" #include "graph/load/new_model_manager/task_info/task_info_factory.h" @@ -63,8 +63,8 @@ struct RuntimeParam { }; typedef struct FusionOpInfo { - vector original_op_names; - string op_name; + std::vector original_op_names; + std::string op_name; uint32_t op_index; uint32_t stream_id; } FusionOpInfo; @@ -87,7 +87,7 @@ class TaskInfo { virtual Status Release() { return SUCCESS; } - virtual cce::ccOpContext *GetCtx() { return nullptr; } + virtual ccOpContext *GetCtx() { return nullptr; } virtual uint32_t GetTaskID() { return 0xFFFFFFFF; } diff --git a/ge/graph/load/new_model_manager/ts_mem_mall.h b/ge/graph/load/new_model_manager/ts_mem_mall.h index 42ad3957..64a64930 100644 --- a/ge/graph/load/new_model_manager/ts_mem_mall.h +++ b/ge/graph/load/new_model_manager/ts_mem_mall.h @@ -25,7 +25,7 @@ #include "framework/common/debug/ge_log.h" namespace { -constexpr uint32_t kMaxTsMemBlock = 2 * 1024 * 1024; // Max block 2M +constexpr uint32_t kMaxTsMemBlock = 2097152; // Max block 2M 2 * 1024 * 1024 constexpr uint32_t kTsMemAligment = 64; // Malloc for 64 bits align 
constexpr uint32_t kTsMemAlignMask = kTsMemAligment - 1; } diff --git a/ge/graph/load/new_model_manager/zero_copy_offset.cc b/ge/graph/load/new_model_manager/zero_copy_offset.cc index 970b292c..9cd3f30b 100644 --- a/ge/graph/load/new_model_manager/zero_copy_offset.cc +++ b/ge/graph/load/new_model_manager/zero_copy_offset.cc @@ -35,6 +35,7 @@ Status ZeroCopyOffset::InitInputDataInfo(int64_t output_size, void *virtual_addr GELOGI("[ZCPY] Start to InitInputDataInfo of %s, total_data_size is %ld, virtual_addr is %p", op_desc->GetName().c_str(), output_size, virtual_addr); basic_addr_ = virtual_addr; + op_name_ = op_desc->GetName(); (void)ge::AttrUtils::GetListInt(op_desc, ATTR_ZERO_COPY_BASIC_OFFSET, zero_copy_basic_offset_); (void)ge::AttrUtils::GetListInt(op_desc, ATTR_ZERO_COPY_RELATIVE_OFFSET, zero_copy_relative_offset_); GE_CHK_BOOL_EXEC(zero_copy_basic_offset_.size() == zero_copy_relative_offset_.size(), return PARAM_INVALID, @@ -82,6 +83,7 @@ Status ZeroCopyOffset::InitOutputDataInfo(const vector &input_size_list GELOGD("Tensor data size: GetSize=%ld, GetTensorSizeInBytes=%ld", input_size_list[idx], size); basic_addr_ = virtual_addr_list[idx]; + op_name_ = op_desc->GetName(); (void)ge::AttrUtils::GetListInt(op_desc, ATTR_ZERO_COPY_BASIC_OFFSET, zero_copy_basic_offset_); (void)ge::AttrUtils::GetListInt(op_desc, ATTR_ZERO_COPY_RELATIVE_OFFSET, zero_copy_relative_offset_); GE_CHK_BOOL_EXEC(zero_copy_basic_offset_.size() == zero_copy_relative_offset_.size(), return PARAM_INVALID, diff --git a/ge/graph/load/new_model_manager/zero_copy_offset.h b/ge/graph/load/new_model_manager/zero_copy_offset.h index 025d1b14..fa80f28b 100644 --- a/ge/graph/load/new_model_manager/zero_copy_offset.h +++ b/ge/graph/load/new_model_manager/zero_copy_offset.h @@ -66,9 +66,12 @@ class ZeroCopyOffset { int64_t GetDataSize() const { return data_size_; } // value of *outside_addrs_ from davinci_model std::vector>> &GetOutsideAddrs() { return outside_addrs_; } + // name of op + std::string 
GetOpName() const { return op_name_; } private: void *basic_addr_ = nullptr; + std::string op_name_; uint32_t data_count_ = 0; std::vector> data_info_; vector relative_offset_; @@ -80,4 +83,4 @@ class ZeroCopyOffset { std::vector zero_copy_relative_offset_; }; } // namespace ge -#endif // GE_GRAPH_LOAD_NEW_MODEL_MANAGER_ZERO_COPY_OFFSET_H_ \ No newline at end of file +#endif // GE_GRAPH_LOAD_NEW_MODEL_MANAGER_ZERO_COPY_OFFSET_H_ diff --git a/ge/graph/load/new_model_manager/zero_copy_task.cc b/ge/graph/load/new_model_manager/zero_copy_task.cc index 9b42d563..2609cb4b 100755 --- a/ge/graph/load/new_model_manager/zero_copy_task.cc +++ b/ge/graph/load/new_model_manager/zero_copy_task.cc @@ -131,7 +131,7 @@ Status ZeroCopyTask::UpdateTaskParam(uintptr_t addr, void *buffer_addr, const ma auto dst_addr = static_cast(buffer_addr); GELOGI("[ZCPY] %s update task, args_addr: %p, size: %zu, offset: %zu, virtual_addr: 0x%lx, user_data_addr: %p", name_.c_str(), args_addr_, args_size_, offset, addr, buffer_addr); - *(uintptr_t *)(args_info + offset) = reinterpret_cast(dst_addr); + *reinterpret_cast(args_info + offset)= reinterpret_cast(dst_addr); is_updated_ = true; } } diff --git a/ge/graph/manager/graph_caching_allocator.cc b/ge/graph/manager/graph_caching_allocator.cc index 4ba39ca8..d6027a08 100644 --- a/ge/graph/manager/graph_caching_allocator.cc +++ b/ge/graph/manager/graph_caching_allocator.cc @@ -25,13 +25,13 @@ namespace ge { const size_t bin_ranges[kNumBins] = {kRoundBlockSize * kKByteSize, - 8 * kMByteSize, - 32 * kMByteSize, - 128 * kMByteSize, + kBinSizeUnit8 * kMByteSize, + kBinSizeUnit32 * kMByteSize, + kBinSizeUnit128 * kMByteSize, kGByteSize, - 4 * kGByteSize, - 16 * kGByteSize, - 26 * kGByteSize}; + kBinSizeUnit4 * kGByteSize, + kBinSizeUnit16 * kGByteSize, + kBinSizeUnit26 * kGByteSize}; static bool BlockComparator(const Block *left, const Block *right) { if (left->size != right->size) { diff --git a/ge/graph/manager/graph_caching_allocator.h 
b/ge/graph/manager/graph_caching_allocator.h index dc4af753..e024d5cd 100644 --- a/ge/graph/manager/graph_caching_allocator.h +++ b/ge/graph/manager/graph_caching_allocator.h @@ -34,10 +34,17 @@ namespace ge { constexpr size_t kRoundBlockSize = 512; // all block sizes are rounded to at least 512 bytes +constexpr size_t kBinSizeUnit4 = 4; +constexpr size_t kBinSizeUnit8 = 8; +constexpr size_t kBinSizeUnit16 = 16; +constexpr size_t kBinSizeUnit26 = 26; +constexpr size_t kBinSizeUnit32 = 32; +constexpr size_t kBinSizeUnit128 = 128; + constexpr double kSplitThreshold = 0.75; // split when malloc size <= small block size * kSpliThreshold constexpr size_t kKByteSize = 1024; -constexpr size_t kMByteSize = 1024 * 1024; -constexpr size_t kGByteSize = 1024 * 1024 * 1024; +constexpr size_t kMByteSize = 1048576; // 1024 * 1024 +constexpr size_t kGByteSize = 1073741824; // 1024 * 1024 * 1024 static const uint32_t kNumBins = 8; diff --git a/ge/graph/manager/graph_manager.cc b/ge/graph/manager/graph_manager.cc index 87070e79..2c2495b4 100755 --- a/ge/graph/manager/graph_manager.cc +++ b/ge/graph/manager/graph_manager.cc @@ -533,9 +533,8 @@ Status GraphManager::CopySubGraphAndMarkFusion(const ComputeGraphPtr &compute_gr return SUCCESS; } -Status GraphManager::OptimizeSubGraphWithMultiThreads(ComputeGraphPtr compute_graph, - Graph2SubGraphInfoList &sub_graph_map, - uint64_t session_id) { +Status GraphManager::OptimizeSubGraphWithMultiThreads(ComputeGraphPtr compute_graph, + Graph2SubGraphInfoList &sub_graph_map, uint64_t session_id) { GE_CHECK_NOTNULL(compute_graph); // use default 16 multi thread const uint32_t thread_num = 16; @@ -550,14 +549,14 @@ Status GraphManager::OptimizeSubGraphWithMultiThreads(ComputeGraphPtr compute_gr (void) AttrUtils::SetStr(subgraph->GetSubGraph(), ATTR_NAME_OP_COMPILE_STRATEGY, op_compile_strategy); } std::future f = executor.commit(GraphManager::ProcessSubGraphWithMultiThreads, this, - compute_graph->GetGraphID(), subgraph, compute_graph, 
session_id, GetThreadLocalContext()); + compute_graph->GetGraphID(), subgraph, compute_graph, session_id, + GetThreadLocalContext()); if (!f.valid()) { GELOGE(FAILED, "Future is invalid"); return FAILED; } vector_future.emplace_back(std::move(f)); } - for (auto &function_graph : compute_graph->GetAllSubgraphs()) { auto subgraph_list = sub_graph_map[function_graph]; for (const auto &subgraph : subgraph_list) { @@ -651,62 +650,13 @@ Status GraphManager::ReplaceSubgraphWithOriGraph(const ComputeGraphPtr &compute_ Status GraphManager::SetSubgraph(uint64_t session_id, ComputeGraphPtr compute_graph, GraphPartitioner &partitioner) { GE_CHECK_NOTNULL(compute_graph); auto sub_graph_map = partitioner.GetSubGraphMap(); - std::string buffer_optimize; - graphStatus graph_status = ge::GetContext().GetOption(BUFFER_OPTIMIZE, buffer_optimize); - bool need_lx_fusion = (graph_status == GRAPH_SUCCESS) && (buffer_optimize != kOffOptimize); - if (options_.build_mode.empty() && need_lx_fusion) { - GELOGI("Enter normal mode with buffer_optimize:%s.", buffer_optimize.c_str()); - /// 1. Copy subgraph for buffer optimize while lx fusion failed. - /// 2. Set graph with attr "lx_fusion" for fusion optimize. 
- std::unordered_map copy_graphs; - GE_TIMESTAMP_START(CopySubGraphAndMarkFusion); - Status ret = CopySubGraphAndMarkFusion(compute_graph, sub_graph_map, copy_graphs); - GE_TIMESTAMP_EVENT_END(CopySubGraphAndMarkFusion, "SetSubgraph:CopySubGraphAndMarkFusion"); - if (ret != SUCCESS) { - GELOGE(ret, "CopySubGraphAndMarkFusion failed."); - return ret; - } - - // Multiply optimize subgraph with lx fusion - ret = OptimizeSubGraphWithMultiThreads(compute_graph, sub_graph_map, session_id); - if (ret != SUCCESS) { - GELOGE(ret, "Multiply optimize subgraph with lx fusion failed."); - return ret; - } - - // Check whether all subgraph lx fusion success - GE_TIMESTAMP_START(CheckAllFusionOptimizeSuccess); - if (CheckAllFusionOptimizeSuccess(compute_graph, sub_graph_map)) { - GE_TIMESTAMP_EVENT_END(CheckAllFusionOptimizeSuccess, "SetSubgraph:CheckAllFusionOptimizeSuccess"); - return SUCCESS; - } - - // Replace subgraph with original graph for lx buffer - ret = ReplaceSubgraphWithOriGraph(compute_graph, sub_graph_map, copy_graphs); - if (ret != SUCCESS) { - GELOGE(ret, "Replace subgraph with original graph failed."); - return ret; - } - - // Multiply optimize subgraph with lx buffer - ret = OptimizeSubGraphWithMultiThreads(compute_graph, sub_graph_map, session_id); - if (ret != SUCCESS) { - GELOGE(ret, "Multiply optimize subgraph with lx buffer failed."); - return ret; - } - } else { - /// Multiply optimize subgraph: - /// 1. run lx buffer while build_mode is normal and buffer_optimize is empty or "off_optimize"; - /// 2. run lx fusion or buffer according build_mode and build_step in fe. 
- GELOGD("Directly optimize subgraph with build mode:%s, and step:%s, buffer_optimize:%s.", - options_.build_mode.c_str(), - options_.build_step.c_str(), - buffer_optimize.c_str()); - Status ret = OptimizeSubGraphWithMultiThreads(compute_graph, sub_graph_map, session_id); - if (ret != SUCCESS) { - GELOGE(ret, "Multiply optimize subgraph with lx buffer"); - return ret; - } + GELOGD("Directly optimize subgraph with build mode:%s, and step:%s.", + options_.build_mode.c_str(), + options_.build_step.c_str()); + Status ret = OptimizeSubGraphWithMultiThreads(compute_graph, sub_graph_map, session_id); + if (ret != SUCCESS) { + GELOGE(ret, "Multiply optimize subgraph failed"); + return ret; } return SUCCESS; } @@ -2515,7 +2465,6 @@ Status GraphManager::ProcessSubGraphWithMultiThreads(GraphManager *graph_manager GetContext().SetSessionId(session_id); GetThreadLocalContext() = ge_context; graph_manager->UpdateLocalOmgContext(root_graph_id); - ComputeGraphPtr compute_graph_tmp = sub_graph_info_ptr->GetSubGraph(); const std::string &engine_name = sub_graph_info_ptr->GetEngineName(); GELOGD("ProcessSubGraphWithMultiThreads start, graph name is %s, engine_name is %s, thread id is %lu", @@ -2523,6 +2472,10 @@ Status GraphManager::ProcessSubGraphWithMultiThreads(GraphManager *graph_manager pthread_self()); GE_DUMP(compute_graph_tmp, "OptimizeSubGraphBefore"); GE_CHECK_NOTNULL(compute_graph_tmp); + if (!AttrUtils::SetInt(*compute_graph_tmp, ATTR_NAME_ROOT_GRAPH_ID, root_graph_id)) { + GELOGE(FAILED, "Failed to set attr ATTR_NAME_ROOT_GRAPH_ID for subgraph, graph_id: %u.", root_graph_id); + return FAILED; + } compute_graph_tmp->SetSessionID(session_id); Status ret = graph_manager->GetCompilerStages(root_graph_id).optimizer.OptimizeSubGraph(compute_graph_tmp, compute_graph, @@ -2688,9 +2641,7 @@ void GraphManager::PreRunThread(GraphManager *graph_manager) { } // it will not execute graph preprocess, optimize, parition, build if the graph has built successful. 
- GELOGI("Start for run graph async."); - GeRootModelPtr ge_root_model = nullptr; if (graph_manager->IsGraphNeedBuild(graph_node)) { if (graph_node->GetBuildFlag()) { diff --git a/ge/graph/manager/graph_var_manager.cc b/ge/graph/manager/graph_var_manager.cc index be7d4eb2..84a07069 100755 --- a/ge/graph/manager/graph_var_manager.cc +++ b/ge/graph/manager/graph_var_manager.cc @@ -280,9 +280,9 @@ Status MemResource::AssignVarMem(const std::string &var_name, uint64_t size, uin return PARAM_INVALID; } uint64_t free_size = total_size_ - var_mem_size_; - if (free_size < (size + kSessionMemAlignSize * 2)) { + if (free_size < (size + kSessionMemAlignSize * kSessionMemAlignUnit)) { GELOGE(PARAM_INVALID, "Out of memory : current var size[%lu] exceeds total var size[%lu]", - size + kSessionMemAlignSize * 2 + var_mem_size_, total_size_); + size + kSessionMemAlignSize * kSessionMemAlignUnit + var_mem_size_, total_size_); return PARAM_INVALID; } diff --git a/ge/graph/manager/graph_var_manager.h b/ge/graph/manager/graph_var_manager.h index b4f6aca3..fcbc92c5 100755 --- a/ge/graph/manager/graph_var_manager.h +++ b/ge/graph/manager/graph_var_manager.h @@ -42,6 +42,7 @@ const size_t kGraphMemoryBuffer = 4UL * 1024UL * 1024UL * 1024UL; const size_t kMaxMemorySize = 256UL * 1024UL * 1024UL * 1024UL; const char kEnvGeuseStaticMemory[] = "GE_USE_STATIC_MEMORY"; const uint64_t kSessionMemAlignSize = 512; +const size_t kSessionMemAlignUnit = 2; enum MemStatus { NORMAL = 0, diff --git a/ge/graph/manager/host_mem_manager.cc b/ge/graph/manager/host_mem_manager.cc index d4aceddd..c99c9e87 100644 --- a/ge/graph/manager/host_mem_manager.cc +++ b/ge/graph/manager/host_mem_manager.cc @@ -106,7 +106,7 @@ Status HostMemManager::QueryVarMemInfo(const string &op_name, uint64_t &base_add GELOGE(INTERNAL_ERROR, "Find host base base_addr failed,node name:%s!", op_name.c_str()); return INTERNAL_ERROR; } - base_addr = reinterpret_cast(reinterpret_cast(var_memory_base_map_[op_name].device_address)); + 
base_addr = static_cast(reinterpret_cast(var_memory_base_map_[op_name].device_address)); data_size = var_memory_base_map_[op_name].mem_size; return SUCCESS; } diff --git a/ge/graph/manager/util/debug.cc b/ge/graph/manager/util/debug.cc index 45c070c6..2c930d1f 100644 --- a/ge/graph/manager/util/debug.cc +++ b/ge/graph/manager/util/debug.cc @@ -32,7 +32,8 @@ Debug::~Debug() = default; void Debug::DumpProto(const Message &proto, const char *file) { std::string file_path = RealPath(file); - int fd = mmOpen2(file_path.c_str(), M_WRONLY | M_CREAT | O_TRUNC, M_IRUSR | M_IWUSR | M_UMASK_GRPREAD | M_UMASK_OTHREAD); + int fd = mmOpen2(file_path.c_str(), M_WRONLY | M_CREAT | O_TRUNC, M_IRUSR | M_IWUSR | M_UMASK_GRPREAD | + M_UMASK_OTHREAD); if (fd == -1) { GELOGW("Write %s failed", file_path.c_str()); return; diff --git a/ge/graph/manager/util/hcom_util.cc b/ge/graph/manager/util/hcom_util.cc index 487b24af..50fa9936 100644 --- a/ge/graph/manager/util/hcom_util.cc +++ b/ge/graph/manager/util/hcom_util.cc @@ -263,7 +263,8 @@ Status HcomOmeUtil::GetHcclRootId(const ge::ConstOpDescPtr &op_desc, int64_t &ro Status HcomOmeUtil::GetAllRootId(const ge::ConstOpDescPtr &op_desc, std::vector &kernel_hccl_infos) { GE_CHECK_NOTNULL(op_desc); - if (op_desc->GetType() == HCOMBROADCAST || op_desc->GetType() == HVDCALLBACKBROADCAST || op_desc->GetType() == HCOMREDUCE) { + if (op_desc->GetType() == HCOMBROADCAST || + op_desc->GetType() == HVDCALLBACKBROADCAST || op_desc->GetType() == HCOMREDUCE) { GELOGI("GetAllRootId Node[%s] opType[%s] get hccl rootId.", op_desc->GetName().c_str(), op_desc->GetType().c_str()); int64_t root_id = 0; Status dmrt = GetHcclRootId(op_desc, root_id); diff --git a/ge/graph/optimize/mem_rw_conflict_optimize.cc b/ge/graph/optimize/mem_rw_conflict_optimize.cc index 2fabc035..dfc6c9df 100644 --- a/ge/graph/optimize/mem_rw_conflict_optimize.cc +++ b/ge/graph/optimize/mem_rw_conflict_optimize.cc @@ -26,6 +26,13 @@ namespace { using namespace ge; const int 
kIdentityAnchorIndex = 0; +const size_t kSerialStringVecSize = 4; + +const int kCaseReadOnly = 0; +const int kCaseScopeWriteable = 2; +const int kCaseWriteable = 3; +const int kCaseInvalidRWType = 5; + // rw type of input. enum class InputRWType { kReadOnly, // Normal op input only read @@ -55,7 +62,7 @@ thread_local map node_rwtype_map_; /// @return rw_type_name /// static std::string InputRWTypeToSerialString(InputRWType rw_type) { - const static char *names[4] = {"ReadOnly", "Writeable", "ScopeWriteable", "InvalidRWType"}; + const static char *names[kSerialStringVecSize] = {"ReadOnly", "Writeable", "ScopeWriteable", "InvalidRWType"}; return names[static_cast(rw_type)]; } @@ -65,7 +72,7 @@ static std::string InputRWTypeToSerialString(InputRWType rw_type) { /// @return rw_type_name /// static std::string OutputRWTypeToSerialString(OutputRWType rw_type) { - const static char *names[4] = {"ReadOnly", "SoftRead", "Writeable", "InvalidRWType"}; + const static char *names[kSerialStringVecSize] = {"ReadOnly", "SoftRead", "Writeable", "InvalidRWType"}; return names[static_cast(rw_type)]; } @@ -118,13 +125,13 @@ InputRWType GetInputRwTypeInConflict(const std::set &rw_type_set) { } switch (total_rw_type) { - case 0: + case kCaseReadOnly: return InputRWType::kReadOnly; // all input rw type is readonly - case 2: + case kCaseScopeWriteable: return InputRWType::kScopeWriteable; // readonly 2 scope_writeable - case 3: + case kCaseWriteable: return InputRWType::kWriteable; // all input rw type is writeable or readonly 2 writeable - case 5: + case kCaseInvalidRWType: return InputRWType::kInvalidRWType; // writeable 2 scope_writeable default: return InputRWType::kInvalidRWType; @@ -643,7 +650,7 @@ Status HandleAllreduceDuplicateInput(ComputeGraphPtr &compute_graph) { auto ret = GraphUtils::InsertNodeBetweenDataAnchors(pre_out_anchor, in_data_anchor, identity_node); GE_CHK_STATUS_RET(ret, "Fail to insert identity."); GELOGI("InsertNode %s between %s and %s successfully.", 
identity_node->GetName().c_str(), - pre_node->GetName().c_str(), node->GetName().c_str()); + pre_node->GetName().c_str(), node->GetName().c_str()); } } } diff --git a/ge/graph/partition/graph_partition.cc b/ge/graph/partition/graph_partition.cc index 6a1fbb34..fbc13920 100755 --- a/ge/graph/partition/graph_partition.cc +++ b/ge/graph/partition/graph_partition.cc @@ -614,32 +614,32 @@ Status ge::GraphPartitioner::AddPartitionsToGraphNode(vectorSetParentNode(compute_graph->GetParentNode()); - (void) AttrUtils::SetStr(*sub_graph, ATTR_NAME_PARENT_GRAPH_NAME, compute_graph->GetName()); - auto sgi = MakeShared(); - if (sgi == nullptr) { - GELOGE(GE_GRAPH_PARAM_NULLPTR, "[GraphPartitioner]: MakeShared sub graph info failed."); - return FAILED; - } - // set engine name - sgi->SetEngineName(engine_name); - // set stream label - string sub_graph_stream; - if (AttrUtils::GetStr(sub_graph->GetDirectNode().at(0)->GetOpDesc(), ATTR_NAME_STREAM_LABEL, sub_graph_stream)) { - sgi->SetStreamLabel(sub_graph_stream); - } - /// for now inputFlag is the same before and after partition. It should - /// be changed according to the real partition - std::vector sub_graph_input(graph_info_.input_size_, true); - std::vector sub_graph_output(graph_info_.output_size_, true); - sgi->SetSubGraph(sub_graph); - sgi->SetOutputFlag(sub_graph_output); - sgi->SetInputFlag(sub_graph_input); - sgi->SetOutputContext(graph_info_.output_name_); - AddEndPldInformationToSubGraphInfo(sgi); - GELOGI("[GraphPartitioner]: subGraph engine name is %s, graph name is %s, stream label is %s", - engine_name.c_str(), - sub_graph->GetName().c_str(), - sgi->GetStreamLabel().empty() ? "null" : sgi->GetStreamLabel().c_str()); + (void)AttrUtils::SetStr(*sub_graph, ATTR_NAME_PARENT_GRAPH_NAME, compute_graph->GetName()); + GELOGD("set attr success. 
subgraph(%s) with parent graph(%s)", sub_graph->GetName().c_str(), + compute_graph->GetName().c_str()); + auto sgi = MakeShared(); + if (sgi == nullptr) { + GELOGE(GE_GRAPH_PARAM_NULLPTR, "[GraphPartitioner]: MakeShared sub graph info failed."); + return FAILED; + } + // set engine name + sgi->SetEngineName(engine_name); + // set stream label + string sub_graph_stream; + if (AttrUtils::GetStr(sub_graph->GetDirectNode().at(0)->GetOpDesc(), ATTR_NAME_STREAM_LABEL, sub_graph_stream)) { + sgi->SetStreamLabel(sub_graph_stream); + } + /// for now inputFlag is the same before and after partition. It should + /// be changed according to the real partition + std::vector sub_graph_input(graph_info_.input_size_, true); + std::vector sub_graph_output(graph_info_.output_size_, true); + sgi->SetSubGraph(sub_graph); + sgi->SetOutputFlag(sub_graph_output); + sgi->SetInputFlag(sub_graph_input); + sgi->SetOutputContext(graph_info_.output_name_); + AddEndPldInformationToSubGraphInfo(sgi); + GELOGI("[GraphPartitioner]: subGraph engine name is %s, graph name is %s, stream label is %s", engine_name.c_str(), + sub_graph->GetName().c_str(), sgi->GetStreamLabel().empty() ? 
"null" : sgi->GetStreamLabel().c_str()); if (engine_name != input_subgraph_name) { // do not add Data subGraph into SubGraphInfo output_subgraphs.push_back(sgi); } else { diff --git a/ge/graph/passes/atomic_addr_clean_pass.cc b/ge/graph/passes/atomic_addr_clean_pass.cc index 60742eb1..7c6ed8ce 100755 --- a/ge/graph/passes/atomic_addr_clean_pass.cc +++ b/ge/graph/passes/atomic_addr_clean_pass.cc @@ -74,10 +74,88 @@ Status AtomicAddrCleanPass::Run(ComputeGraphPtr graph) { return SUCCESS; } +// just hccl may mark atomic from ops kernel now, and hccl's atomic if for all input +bool AtomicAddrCleanPass::CheckAtomicFromOpsKernel(const NodePtr &node) { + // 1.Check if isAtomic attrs exist for HCOM + std::shared_ptr instance_ptr = GELib::GetInstance(); + if ((instance_ptr == nullptr) || (!instance_ptr->InitFlag())) { + GELOGW("GELib not initialized, atomic from ops kernel judge false, node_name: %s", node->GetName().c_str()); + return false; + } + + OpsKernelManager &ops_kernel_manager = instance_ptr->OpsKernelManagerObj(); + vector op_info_vec = ops_kernel_manager.GetOpsKernelInfo(node->GetType()); + for (const auto &op_info : op_info_vec) { + if (op_info.isAtomic) { + // check peer input is DATA + for (const auto &in_data_anchor : node->GetAllInDataAnchors()) { + if (in_data_anchor->GetPeerOutAnchor() != nullptr && + in_data_anchor->GetPeerOutAnchor()->GetOwnerNode() != nullptr) { + auto peer_in_node = in_data_anchor->GetPeerOutAnchor()->GetOwnerNode(); + if (peer_in_node->GetType() == DATA) { + GELOGI("Recognized atomic op %s from %s engine and input is DATA.", node->GetName().c_str(), + op_info.engine.c_str()); + return false; + } + } + } + GELOGI("Recognized atomic op %s from %s engine.", node->GetName().c_str(), op_info.engine.c_str()); + hcom_node_vec_.push_back(node); + return true; + } + } + return false; +} + +bool AtomicAddrCleanPass::IsOutputIndexPeerInputAtomic(const NodePtr &node, int64_t output_index) { + auto out_data_anchor = 
node->GetAllOutDataAnchors().at(output_index); + if (out_data_anchor == nullptr) { + return false; + } + + for (auto input_anchor : out_data_anchor->GetPeerInDataAnchors()) { + auto output_node = input_anchor->GetOwnerNode(); + // just hccl may mark atomic from ops kernel now, and hccl's atomic if for all input + // hccl's attr ATOMIC_ATTR_INPUT_INDEX mark on CalcOpRunningParam, can't be get here + if (CheckAtomicFromOpsKernel(output_node)) { + return true; + } + } + return false; +} + +bool AtomicAddrCleanPass::CheckSkipInsertInLoopGraph(const NodePtr &node) { + OpDescPtr op_desc = node->GetOpDesc(); + std::map> node_workspace_offset; + bool has_atomic_input = op_desc->HasAttr(ATOMIC_ATTR_INPUT_INDEX); + bool has_atomic_output = op_desc->HasAttr(ATOMIC_ATTR_OUTPUT_INDEX); + node_workspace_offset = op_desc->TryGetExtAttr(EXT_ATTR_ATOMIC_WORKSPACE_OFFSET, node_workspace_offset); + if (!has_atomic_input && has_atomic_output && node_workspace_offset.empty()) { + std::vector atomic_output_index; + (void) ge::AttrUtils::GetListInt(op_desc, ATOMIC_ATTR_OUTPUT_INDEX, atomic_output_index); + bool is_all_output_peer_also_atomic = true; + for (const auto &output_index : atomic_output_index) { + if (!IsOutputIndexPeerInputAtomic(node, output_index)) { + is_all_output_peer_also_atomic = false; + break; + } + } + if (is_all_output_peer_also_atomic) { + GELOGI("all out peer node input atomic, skip this out atomic process, node name: %s", node->GetName().c_str()); + return true; + } + } + return false; +} + Status AtomicAddrCleanPass::HandleLoopGraph(ComputeGraphPtr &graph, const vector &atomic_node_vec) { // Loop graph , insert clean node follow atomic node int index = 0; for (const auto &node : atomic_node_vec) { + if (CheckSkipInsertInLoopGraph(node)) { + continue; + } + // Insert atomic clean op NodePtr clean_addr_node = InsertAtomicAddrCleanNode(graph); if (clean_addr_node == nullptr) { @@ -249,32 +327,10 @@ bool AtomicAddrCleanPass::IsAtomicOp(const NodePtr &node) { return 
false; } // 1.Check if isAtomic attrs exist for HCOM - std::shared_ptr instance_ptr = GELib::GetInstance(); - if ((instance_ptr == nullptr) || (!instance_ptr->InitFlag())) { - GELOGW("GELib not initialized"); - return false; + if (CheckAtomicFromOpsKernel(node)) { + return true; } - OpsKernelManager &ops_kernel_manager = instance_ptr->OpsKernelManagerObj(); - vector op_info_vec = ops_kernel_manager.GetOpsKernelInfo(op_desc->GetType()); - for (const auto &op_info : op_info_vec) { - if (op_info.isAtomic) { - GELOGI("Recognized atomic op %s from DNN_HCCL engine.", op_desc->GetName().c_str()); - // check peer input is DATA - for (auto &in_data_anchor : node->GetAllInDataAnchors()) { - if (in_data_anchor->GetPeerOutAnchor() != nullptr && - in_data_anchor->GetPeerOutAnchor()->GetOwnerNode() != nullptr) { - auto peer_in_node = in_data_anchor->GetPeerOutAnchor()->GetOwnerNode(); - if (peer_in_node->GetType() == DATA) { - GELOGI("Recognized atomic op %s from DNN_HCCL engine and input is DATA.", op_desc->GetName().c_str()); - return false; - } - } - } - hcom_node_vec_.push_back(node); - return true; - } - } // 2.Check atomic attr in node std::map> node_workspace_offset; bool has_atomic_input = op_desc->HasAttr(ATOMIC_ATTR_INPUT_INDEX); diff --git a/ge/graph/passes/atomic_addr_clean_pass.h b/ge/graph/passes/atomic_addr_clean_pass.h index ad60b7b5..8138d511 100755 --- a/ge/graph/passes/atomic_addr_clean_pass.h +++ b/ge/graph/passes/atomic_addr_clean_pass.h @@ -84,6 +84,11 @@ class AtomicAddrCleanPass : public GraphPass { Status HandleDispersedAtomicNodes(ComputeGraphPtr &graph, const std::vector &atomic_node_vec, std::vector &common_atomic_nodes); + bool CheckAtomicFromOpsKernel(const NodePtr &node); + + bool IsOutputIndexPeerInputAtomic(const NodePtr &node, int64_t output_index); + + bool CheckSkipInsertInLoopGraph(const NodePtr &node); vector hcom_node_vec_; bool is_loop_graph_ = false; diff --git a/ge/graph/passes/attach_stream_label_pass.cc 
b/ge/graph/passes/attach_stream_label_pass.cc index b04643a4..c0e0f669 100644 --- a/ge/graph/passes/attach_stream_label_pass.cc +++ b/ge/graph/passes/attach_stream_label_pass.cc @@ -24,11 +24,7 @@ Status AttachStreamLabelPass::Run(ComputeGraphPtr graph) { FindNodes(graph); for (const auto &node : need_label_nodes_) { - OpDescPtr op_desc = node->GetOpDesc(); - GE_CHECK_NOTNULL(op_desc); - if (!op_desc->HasAttr(ATTR_NAME_STREAM_LABEL)) { - GE_CHK_STATUS_RET(UpdateCondBranch(node), "Update cond branch failed, start node:%s.", node->GetName().c_str()); - } + GE_CHK_STATUS_RET(UpdateCondBranch(node), "Update cond branch failed, start node:%s.", node->GetName().c_str()); } GE_CHK_STATUS_RET(UpdateEnterNode(), "UpdateEnterNode failed."); @@ -55,13 +51,15 @@ Status AttachStreamLabelPass::ClearStatus() { /// void AttachStreamLabelPass::FindNodes(const ComputeGraphPtr &graph) { for (const NodePtr &node : graph->GetDirectNode()) { - const std::string &type = node->GetType(); - if (type == STREAMSWITCH) { + const auto &op_desc = node->GetOpDesc(); + if (op_desc == nullptr) { + continue; + } + const std::string &type = op_desc->GetType(); + if ((type == STREAMSWITCH) && op_desc->HasAttr(ATTR_NAME_SWITCH_TRUE_BRANCH_FLAG)) { stream_switch_nodes_.emplace_back(node); - } else if (type == STREAMMERGE) { - if ((node->GetOpDesc() != nullptr) && !node->GetOpDesc()->HasAttr(ATTR_NAME_NEXT_ITERATION)) { - need_label_nodes_.emplace_back(node); - } + } else if ((type == STREAMMERGE) && !op_desc->HasAttr(ATTR_NAME_NEXT_ITERATION)) { + need_label_nodes_.emplace_back(node); } else if ((type == ENTER) || (type == REFENTER)) { enter_nodes_.emplace_back(node); } @@ -83,11 +81,15 @@ void AttachStreamLabelPass::FindNodes(const ComputeGraphPtr &graph) { /// Status AttachStreamLabelPass::UpdateCondBranch(const NodePtr &node) { std::string stream_label; + if (AttachFlag(node, stream_label) != SUCCESS) { + GELOGE(FAILED, "Attach flag for node %s failed.", node->GetName().c_str()); + return FAILED; + 
} + std::unordered_set branch_nodes; std::unordered_set visited; std::stack nodes; nodes.push(node); - static const std::set end_type_set = {STREAMSWITCH, STREAMMERGE, MERGE}; while (!nodes.empty()) { NodePtr cur_node = nodes.top(); @@ -95,10 +97,6 @@ Status AttachStreamLabelPass::UpdateCondBranch(const NodePtr &node) { if (visited.count(cur_node) > 0) { continue; } - if (AttachFlag(cur_node, stream_label) != SUCCESS) { - GELOGE(FAILED, "Attach flag for node %s failed.", cur_node->GetName().c_str()); - return FAILED; - } const std::string &type = cur_node->GetType(); for (const auto &out_node : cur_node->GetOutAllNodes()) { @@ -115,10 +113,6 @@ Status AttachStreamLabelPass::UpdateCondBranch(const NodePtr &node) { visited.insert(cur_node); } - if (node->GetType() == STREAMSWITCH) { - GE_CHK_STATUS_RET(SetActiveLabelList(node, {stream_label}), "set active_label_list failed."); - } - for (const NodePtr &tmp_node : branch_nodes) { GELOGD("Attach label %s to node: %s.", stream_label.c_str(), tmp_node->GetName().c_str()); GE_CHK_STATUS_RET(SetStreamLabel(tmp_node, stream_label), "Set stream label failed."); @@ -148,11 +142,10 @@ Status AttachStreamLabelPass::AttachFlag(const NodePtr &node, std::string &strea GE_CHK_BOOL_EXEC(AttrUtils::GetBool(op_desc, ATTR_NAME_SWITCH_TRUE_BRANCH_FLAG, value), return FAILED, "StreamSwitch get attr TRUE_BRANCH_STREAM failed."); stream_label += (value ? 
"_t" : "_f"); + GE_CHK_STATUS_RET(SetActiveLabelList(node, {stream_label}), "set active_label_list failed."); } else if (type == STREAMMERGE) { stream_label = node->GetName(); GE_CHK_STATUS_RET(SetStreamLabel(node, stream_label), "Set stream label failed."); - } else if ((type == EXIT) || (type == REFEXIT)) { - GE_CHK_STATUS_RET(SetStreamLabel(node, stream_label), "Set stream label failed."); } return SUCCESS; @@ -166,12 +159,13 @@ Status AttachStreamLabelPass::UpdateEnterNode() { std::unordered_map> enter_active_map; for (const auto &enter_node : enter_nodes_) { for (const auto &out_ctrl_node : enter_node->GetOutControlNodes()) { - if (out_ctrl_node->GetType() == STREAMACTIVE) { - if (enter_active_map.find(out_ctrl_node) == enter_active_map.end()) { - enter_active_map[out_ctrl_node] = {enter_node}; - } else { - enter_active_map[out_ctrl_node].emplace_back(enter_node); - } + if (out_ctrl_node->GetType() != STREAMACTIVE) { + continue; + } + if (enter_active_map.find(out_ctrl_node) == enter_active_map.end()) { + enter_active_map[out_ctrl_node] = {enter_node}; + } else { + enter_active_map[out_ctrl_node].emplace_back(enter_node); } } } @@ -226,9 +220,8 @@ Status AttachStreamLabelPass::SetEnterLabel(const std::vector &enter_no std::string stream_label; GE_CHECK_NOTNULL(active_node); (void)AttrUtils::GetStr(active_node->GetOpDesc(), ATTR_NAME_STREAM_LABEL, stream_label); - if (stream_label.empty()) { - GELOGW("stream_label of enter_active & enter_nodes is empty."); + GELOGD("stream_label of enter_active %s is empty.", active_node->GetName().c_str()); return SUCCESS; } @@ -238,7 +231,6 @@ Status AttachStreamLabelPass::SetEnterLabel(const std::vector &enter_no GE_CHK_STATUS_RET(SetStreamLabel(enter_node, stream_label), "Set stream label failed."); } } - GE_CHK_STATUS_RET(SetStreamLabel(active_node, stream_label), "Set stream label failed."); return SUCCESS; } diff --git a/ge/graph/passes/cond_remove_pass.cc b/ge/graph/passes/cond_remove_pass.cc index e8d1493f..bf2e1170 
100644 --- a/ge/graph/passes/cond_remove_pass.cc +++ b/ge/graph/passes/cond_remove_pass.cc @@ -37,6 +37,12 @@ Status CondRemovePass::Run(NodePtr &node) { OutDataAnchorPtr cond_out_anchor = nullptr; InDataAnchorPtr cond_in_anchor = nullptr; Status ret = GetCondInfo(node, graph, cond_out_anchor, cond_in_anchor); + if (ret == NOT_CHANGED) { + return SUCCESS; + } else if (ret != SUCCESS) { + GELOGE(FAILED, "Get cond_info for node %s failed.", node->GetName().c_str()); + return FAILED; + } int32_t cond_index = 0; GELOGD("Handle cond remove for node %s.", node->GetOpDesc()->GetName().c_str()); bool if_cond_const = CheckIfCondConstInput(cond_out_anchor, cond_in_anchor, cond_index); @@ -322,11 +328,11 @@ Status CondRemovePass::GetCondInfo(const NodePtr &node, ComputeGraphPtr &graph, std::string type = node->GetType(); if ((kIfOpTypes.count(type) != 0) || (kCaseOpTypes.count(type) != 0)) { if (GetCondInfoForIfCase(node, graph, cond_out_anchor, cond_in_anchor) != SUCCESS) { - GELOGE(FAILED, "Get cond_info for if node failed."); + GELOGE(FAILED, "Get cond_info for if/case node failed."); return FAILED; } } else { - GELOGD("no need cond_pass for node %s.", node->GetName().c_str()); + GELOGD("no need cond_remove_pass for node %s.", node->GetName().c_str()); return NOT_CHANGED; } diff --git a/ge/graph/passes/ctrl_edge_transfer_pass.cc b/ge/graph/passes/ctrl_edge_transfer_pass.cc index f53dc7be..a538a10c 100755 --- a/ge/graph/passes/ctrl_edge_transfer_pass.cc +++ b/ge/graph/passes/ctrl_edge_transfer_pass.cc @@ -38,7 +38,6 @@ namespace ge { * \ / * B */ - Status CtrlEdgeTransferPass::Run(ge::ComputeGraphPtr graph) { GELOGD("CtrlEdgeTransferPass start running"); GE_CHECK_NOTNULL(graph); diff --git a/ge/graph/passes/data_pass.cc b/ge/graph/passes/data_pass.cc index 4ec8743e..5bbd2fb1 100644 --- a/ge/graph/passes/data_pass.cc +++ b/ge/graph/passes/data_pass.cc @@ -21,6 +21,7 @@ namespace ge { namespace { +const int kDataIndexOffset = 2; Status MappingSubgraphInput(const 
ComputeGraphPtr &graph, const std::function &input) { for (const auto &node : graph->GetDirectNode()) { if (node->GetType() != DATA) { @@ -111,7 +112,7 @@ Status ParseSubgraphPostFnWhile(const string &subgraph_name, const ComputeGraphP Status ParseSubgraphPostFnFor(const string &subgraph_name, const ComputeGraphPtr &graph) { return MappingSubgraphIndex(graph, - [](int data_index) { return (data_index == 0) ? 0 : data_index + 2; }, + [](int data_index) { return (data_index == 0) ? 0 : data_index + kDataIndexOffset; }, [](int retval_index) { return retval_index; }); } diff --git a/ge/graph/passes/enter_pass.cc b/ge/graph/passes/enter_pass.cc index 206d271c..afeca78f 100644 --- a/ge/graph/passes/enter_pass.cc +++ b/ge/graph/passes/enter_pass.cc @@ -16,6 +16,7 @@ #include "graph/passes/enter_pass.h" +#include "graph/debug/ge_attr_define.h" #include "framework/common/debug/ge_log.h" #include "framework/common/debug/log.h" #include "graph/utils/graph_utils.h" @@ -72,33 +73,25 @@ Status EnterPass::Run(NodePtr &node) { } Status EnterPass::OptimizeEnter(NodePtr &node, NodePtr &in_node) { - auto out_nodes_of_in_node = in_node->GetOutAllNodes(); - if (out_nodes_of_in_node.size() != kOutNodesNum) { + if ((in_node->GetOutAllNodes().size() != kOutNodesNum) || !node->GetOutControlNodes().empty()) { return SUCCESS; } - - if (!node->GetOutControlNodes().empty()) { + bool is_constant_flag = true; + (void)AttrUtils::GetBool(node->GetOpDesc(), ENTER_ATTR_CONSTANT_FLAG, is_constant_flag); + if (!is_constant_flag) { return SUCCESS; } - for (const auto &out_node : node->GetOutDataNodes()) { - GE_CHECK_NOTNULL(out_node); - if (out_node->GetType() == MERGE) { - return SUCCESS; - } - } - GE_CHECK_NOTNULL(in_node->GetOutDataAnchor(0)); GE_CHK_STATUS_RET(in_node->GetOutDataAnchor(0)->Unlink(node->GetInDataAnchor(0))); - auto out_data_anchor = node->GetOutDataAnchor(0); + const auto &out_data_anchor = node->GetOutDataAnchor(0); GE_CHECK_NOTNULL(out_data_anchor); - for (auto peer_in_data_anchor 
: out_data_anchor->GetPeerInDataAnchors()) { + for (const auto &peer_in_data_anchor : out_data_anchor->GetPeerInDataAnchors()) { GE_CHK_STATUS_RET(out_data_anchor->Unlink(peer_in_data_anchor)); GE_CHK_STATUS_RET(in_node->GetOutDataAnchor(0)->LinkTo(peer_in_data_anchor)); } - - auto graph = node->GetOwnerComputeGraph(); - GE_CHK_STATUS_RET(GraphUtils::RemoveNodeWithoutRelink(graph, node)) + GE_CHK_STATUS_RET(GraphUtils::RemoveNodeWithoutRelink(node->GetOwnerComputeGraph(), node)); + AddNodeDeleted(node); AddRePassNodesWithInOut(in_node); return SUCCESS; diff --git a/ge/graph/passes/for_pass.cc b/ge/graph/passes/for_pass.cc index f3caea35..31dee390 100644 --- a/ge/graph/passes/for_pass.cc +++ b/ge/graph/passes/for_pass.cc @@ -37,6 +37,7 @@ namespace { const uint32_t kSubgraphLoopVarInputIndex = 0; const uint32_t kSubgraphInputIndex = 1; const uint32_t kWhileOutputIndex = 5; + const size_t kIDiffValue = 2; const std::string kAbs = "Abs"; } @@ -137,7 +138,7 @@ Status ForPass::BuildForInfo(const ComputeGraphPtr &root_graph, const NodePtr &n for_info.ctrl_inputs = std::move(ctrl_inputs); for_info.ctrl_outputs = std::move(ctrl_outputs); - GELOGI("Build for_info for node %s succ.", node->GetName().c_str()); + GELOGI("Build for_info for node %s success.", node->GetName().c_str()); return SUCCESS; } @@ -159,13 +160,7 @@ OutDataAnchorPtr ForPass::FindInputWithIndex(const NodePtr &node, uint32_t index return nullptr; } - OutDataAnchorPtr peer_out_anchor = in_data_anchor->GetPeerOutAnchor(); - if (peer_out_anchor == nullptr) { - GELOGE(FAILED, "FindInputWithIndex %s:%u failed: peer_out_anchor is NULL.", node->GetName().c_str(), index); - return nullptr; - } - - return peer_out_anchor; + return in_data_anchor->GetPeerOutAnchor(); } /// @@ -186,20 +181,13 @@ Status ForPass::FindInputsAndOutputs(const NodePtr &node, std::vectorGetAllInDataAnchorsSize(); for (uint32_t index = FOR_DATA_INPUT; index < input_data_num; index++) { InDataAnchorPtr in_data_anchor = 
node->GetInDataAnchor(index); - if (in_data_anchor == nullptr) { - GELOGE(FAILED, "FindInputWithIndex %s:%u failed: in_data_anchor is NULL.", node->GetName().c_str(), index); - return FAILED; - } - GE_IF_BOOL_EXEC(in_data_anchor->GetPeerOutAnchor() == nullptr, - GELOGW("Get null input by index %d from node %s ", - in_data_anchor->GetIdx(), node->GetName().c_str()); - continue); + GE_CHECK_NOTNULL(in_data_anchor); data_inputs.emplace_back(in_data_anchor->GetPeerOutAnchor()); } - for (auto &out_data_anchor : node->GetAllOutDataAnchors()) { + for (const auto &out_data_anchor : node->GetAllOutDataAnchors()) { std::vector peer_in_data_anchors; - for (auto &peer_in_data_anchor : out_data_anchor->GetPeerInDataAnchors()) { + for (const auto &peer_in_data_anchor : out_data_anchor->GetPeerInDataAnchors()) { peer_in_data_anchors.emplace_back(peer_in_data_anchor); } data_outputs.emplace_back(peer_in_data_anchors); @@ -207,13 +195,13 @@ Status ForPass::FindInputsAndOutputs(const NodePtr &node, std::vectorGetInControlAnchor(); GE_CHECK_NOTNULL(in_ctrl_anchor); - for (auto &peer_out_ctrl_anchor : in_ctrl_anchor->GetPeerOutControlAnchors()) { + for (const auto &peer_out_ctrl_anchor : in_ctrl_anchor->GetPeerOutControlAnchors()) { ctrl_inputs.emplace_back(peer_out_ctrl_anchor); } OutControlAnchorPtr out_ctrl_anchor = node->GetOutControlAnchor(); GE_CHECK_NOTNULL(out_ctrl_anchor); - for (auto &peer_in_ctrl_anchor : out_ctrl_anchor->GetPeerInControlAnchors()) { + for (const auto &peer_in_ctrl_anchor : out_ctrl_anchor->GetPeerInControlAnchors()) { ctrl_outputs.emplace_back(peer_in_ctrl_anchor); } @@ -707,7 +695,7 @@ Status ForPass::UpdateForBodyInputMapping(const WhileInfo &while_info) { } else if ((i == FOR_LIMIT_INPUT) || (i == FOR_DELTA_INPUT)) { continue; } else { - input_mapping[i] = i - 2; + input_mapping[i] = i - kIDiffValue; } } for_body->UpdateInputMapping(input_mapping); diff --git a/ge/graph/passes/mark_agnostic_pass.cc b/ge/graph/passes/mark_agnostic_pass.cc index 
8c9a0451..30fa1742 100644 --- a/ge/graph/passes/mark_agnostic_pass.cc +++ b/ge/graph/passes/mark_agnostic_pass.cc @@ -19,6 +19,8 @@ #include "graph/utils/tensor_utils.h" namespace ge { +const size_t kTwoInputNodesSize = 2; + Status MarkAgnosticPass::Run(ComputeGraphPtr graph) { for (const auto &node : graph->GetDirectNode()) { auto node_type = NodeUtils::GetNodeType(*node); @@ -52,7 +54,7 @@ Status MarkAgnosticPass::Run(ComputeGraphPtr graph) { /// Enter-----------+ /// +-> Merge /// NextIteration---+ - if (input_nodes.size() == 2) { + if (input_nodes.size() == kTwoInputNodesSize) { if (input_nodes.at(0)->GetType() == ENTER && input_nodes.at(1)->GetType() == NEXTITERATION) { continue; } diff --git a/ge/graph/passes/merge_pass.cc b/ge/graph/passes/merge_pass.cc index d2340037..26d82820 100644 --- a/ge/graph/passes/merge_pass.cc +++ b/ge/graph/passes/merge_pass.cc @@ -21,18 +21,16 @@ #include #include "framework/common/debug/ge_log.h" -#include "common/ge_inner_error_codes.h" #include "common/ge/ge_util.h" #include "graph/common/omg_util.h" #include "graph/debug/ge_attr_define.h" #include "graph/utils/graph_utils.h" #include "graph/passes/pass_utils.h" -using domi::PARAM_INVALID; -using domi::SUCCESS; - namespace ge { const int kValueIndexOutputIndex = 1; +const size_t kCaseNoInput = 0; +const size_t kCaseOneInput = 1; Status MergePass::Run(NodePtr &node) { GELOGD("MergePass running"); @@ -47,15 +45,14 @@ Status MergePass::Run(NodePtr &node) { return SUCCESS; } - auto out_data_anchors = node->GetAllOutDataAnchors(); - if (out_data_anchors.empty()) { + if (node->GetAllOutDataAnchors().empty()) { GELOGE(PARAM_INVALID, "[%s] Merge node output anchor is empty", node->GetName().c_str()); return PARAM_INVALID; } - auto in_data_nodes = node->GetInDataNodes(); + const auto &in_data_nodes = node->GetInDataNodes(); switch (in_data_nodes.size()) { - case 0: { + case kCaseNoInput: { /// Case A: input_count = 0, the output of merge node is inactive as well /// In which case the 
output branch can be removed /// until another merge node is met @@ -70,7 +67,7 @@ Status MergePass::Run(NodePtr &node) { } return ret; } - case 1: { // Case B: input_count = 1, the merge node can be optimized out + case kCaseOneInput: { // Case B: input_count = 1, the merge node can be optimized out std::vector merge_io_map = {PassUtils::GetUniqueInDataAnchorIndex(node), -1}; if (merge_io_map[0] != -1 && IsNeedChangeIndexToConstant(node)) { int index = merge_io_map[0]; diff --git a/ge/graph/passes/multi_batch_pass.cc b/ge/graph/passes/multi_batch_pass.cc index c7034612..74f7e30e 100644 --- a/ge/graph/passes/multi_batch_pass.cc +++ b/ge/graph/passes/multi_batch_pass.cc @@ -22,9 +22,6 @@ #include "graph/common/omg_util.h" #include "graph/utils/type_utils.h" -using std::string; -using std::vector; - namespace ge { Status MultiBatchPass::Run(ComputeGraphPtr graph) { GELOGD("MultiBatchPass Enter"); @@ -53,7 +50,7 @@ Status MultiBatchPass::Run(ComputeGraphPtr graph) { return FAILED; } std::vector> batch_shape; - vector> combined_batch; + std::vector> combined_batch; if (!CheckSwitchN(batch_shape, combined_batch)) { GELOGE(FAILED, "CheckSwitchN failed."); return FAILED; @@ -104,6 +101,7 @@ Status MultiBatchPass::ClearStatus() { /// Status MultiBatchPass::SetCaseLabel(const ComputeGraphPtr &graph, const NodePtr &case_node) { const auto &func_desc = case_node->GetOpDesc(); + GE_CHECK_NOTNULL(func_desc); if (!func_desc->HasAttr(ATTR_NAME_BATCH_NUM)) { GELOGD("Graph: %s Not multi-batch, Node: %s", graph->GetName().c_str(), case_node->GetName().c_str()); return SUCCESS; @@ -114,7 +112,7 @@ Status MultiBatchPass::SetCaseLabel(const ComputeGraphPtr &graph, const NodePtr const auto &subgraph = graph->GetSubgraph(dynamic_branch_names[i]); GE_CHECK_NOTNULL(subgraph); - const string batch_label = "Batch_" + std::to_string(i); + const std::string batch_label = "Batch_" + std::to_string(i); for (const auto &node : subgraph->GetDirectNode()) { 
(void)AttrUtils::SetStr(node->GetOpDesc(), ATTR_NAME_BATCH_LABEL, batch_label); } @@ -139,12 +137,12 @@ Status MultiBatchPass::FindPredValue(const ComputeGraphPtr &graph, OutDataAnchor continue; } - InDataAnchorPtr in_data_anchor = node->GetInDataAnchor(SWITCH_PRED_INPUT); + const auto &in_data_anchor = node->GetInDataAnchor(SWITCH_PRED_INPUT); if (in_data_anchor == nullptr) { GELOGE(FAILED, "FindPredInput failed, in_data_anchor is null, node:%s.", node->GetName().c_str()); return FAILED; } - OutDataAnchorPtr pred_input = in_data_anchor->GetPeerOutAnchor(); + const auto &pred_input = in_data_anchor->GetPeerOutAnchor(); if (pred_input == nullptr) { GELOGE(FAILED, "FindPredInput failed, pred_input is null, node:%s.", node->GetName().c_str()); return FAILED; @@ -178,12 +176,10 @@ Status MultiBatchPass::FindPredValue(const ComputeGraphPtr &graph, OutDataAnchor /// @return Status /// Status MultiBatchPass::GetDynamicType() { - for (const auto &switchn : switch_n_nodes_) { - auto switchn_desc = switchn->GetOpDesc(); - GE_CHECK_NOTNULL(switchn_desc); + for (const auto &switch_n : switch_n_nodes_) { int32_t dynamic_type = static_cast(FIXED); - if (!AttrUtils::GetInt(switchn_desc, ATTR_DYNAMIC_TYPE, dynamic_type)) { - GELOGE(FAILED, "Get attr ATTR_DYNAMIC_TYPE of node: %s failed.", switchn->GetName().c_str()); + if (!AttrUtils::GetInt(switch_n->GetOpDesc(), ATTR_DYNAMIC_TYPE, dynamic_type)) { + GELOGE(FAILED, "Get attr ATTR_DYNAMIC_TYPE of node: %s failed.", switch_n->GetName().c_str()); return FAILED; } if (dynamic_type == static_cast(FIXED)) { @@ -191,7 +187,7 @@ Status MultiBatchPass::GetDynamicType() { return FAILED; } if (dynamic_type_ != static_cast(FIXED) && dynamic_type_ != dynamic_type) { - GELOGE(FAILED, "Attr ATTR_DYNAMIC_TYPE of all switchn node should be same, while one is %d and another is %d.", + GELOGE(FAILED, "Attr ATTR_DYNAMIC_TYPE of all switch_n node should be same, while one is %d and another is %d.", dynamic_type, dynamic_type_); return FAILED; } @@ 
-212,21 +208,19 @@ Status MultiBatchPass::GetDynamicType() { Status MultiBatchPass::GetUserDesignateShape() { data_name_order_.clear(); bool first_check = true; - for (const auto &switchn : switch_n_nodes_) { - auto switchn_desc = switchn->GetOpDesc(); - GE_CHECK_NOTNULL(switchn_desc); - vector cur_switchn_data_name_order; - if (!AttrUtils::GetListStr(switchn_desc, ATTR_USER_DESIGNEATE_SHAPE_ORDER, cur_switchn_data_name_order)) { - GELOGE(FAILED, "Get attr ATTR_USER_DESIGNEATE_SHAPE_ORDER of node: %s failed.", switchn->GetName().c_str()); + for (const auto &switch_n : switch_n_nodes_) { + std::vector cur_data_name_order; + if (!AttrUtils::GetListStr(switch_n->GetOpDesc(), ATTR_USER_DESIGNEATE_SHAPE_ORDER, cur_data_name_order)) { + GELOGE(FAILED, "Get attr ATTR_USER_DESIGNEATE_SHAPE_ORDER of node: %s failed.", switch_n->GetName().c_str()); return FAILED; } if (first_check) { - data_name_order_ = cur_switchn_data_name_order; + data_name_order_ = cur_data_name_order; first_check = false; } else { - if (data_name_order_ != cur_switchn_data_name_order) { + if (data_name_order_ != cur_data_name_order) { GELOGE(FAILED, "The ATTR_USER_DESIGNEATE_SHAPE_ORDER of switchN must be same: %s failed.", - switchn->GetName().c_str()); + switch_n->GetName().c_str()); return FAILED; } } @@ -245,7 +239,8 @@ Status MultiBatchPass::GetUserDesignateShape() { /// @param [out] combined_batch /// @return bool /// -bool MultiBatchPass::CheckSwitchN(vector> &batch_shape, vector> &combined_batch) { +bool MultiBatchPass::CheckSwitchN(std::vector> &batch_shape, + std::vector> &combined_batch) { // Check if output_num of different SwitchN is same uint32_t batch_num = 0; for (const NodePtr &node : switch_n_nodes_) { @@ -281,7 +276,8 @@ bool MultiBatchPass::CheckSwitchN(vector> &batch_shape, vector> &batch_shape, vector> &batch_shape, - vector> &combined_batch) { +bool MultiBatchPass::GetBatchInfo(uint32_t batch_num, std::vector> &batch_shape, + std::vector> &combined_batch) { // Check if 
output_shape of different SwitchN is same - vector> idx_batch_shape; - vector> idx_combined_batch; + std::vector> idx_batch_shape; + std::vector> idx_combined_batch; for (uint32_t i = 0; i < batch_num; i++) { idx_batch_shape.clear(); idx_combined_batch.clear(); @@ -310,7 +306,7 @@ bool MultiBatchPass::GetBatchInfo(uint32_t batch_num, vector> &b GELOGE(FAILED, "CheckDims failed, get op_desc failed, node: %s.", node->GetName().c_str()); return false; } - vector output_dims; + std::vector output_dims; if (!AttrUtils::GetListInt(op_desc->GetOutputDesc(i), ATTR_NAME_SWITCHN_PRED_VALUE, output_dims)) { GELOGE(FAILED, "CheckDims failed, get attr ATTR_NAME_SWITCHN_PRED_VALUE failed, batch_index=%u.", i); return false; @@ -385,8 +381,8 @@ Status MultiBatchPass::FindSwitchOutNodes(uint32_t batch_num) { /// @return Status /// Status MultiBatchPass::ReplaceSwitchN(const ComputeGraphPtr &graph, const OutDataAnchorPtr &pred_value, - const vector> &batch_shape, - const vector> &combined_batch) { + const std::vector> &batch_shape, + const std::vector> &combined_batch) { NodePtr pred_value_node = pred_value->GetOwnerNode(); // Create SwitchCase node const std::string &switch_case_name = pred_value_node->GetName() + "_" + STREAMSWITCHN; @@ -429,31 +425,11 @@ bool MultiBatchPass::CheckDims(const std::vector> &output_s return false; } - size_t num = output_shape.size(); - size_t dim_num = output_shape[0].size(); - for (size_t i = 1; i < num; i++) { - size_t tmp_dim_num = output_shape[i].size(); - if (dim_num != tmp_dim_num) { - GELOGE(FAILED, "CheckDims failed: dim_num not equal, output_0:%zu, output_%zu:%zu.", dim_num, i, tmp_dim_num); + for (auto iter = output_shape.begin() + 1; iter != output_shape.end(); ++iter) { + if (output_shape[0] != *iter) { return false; } } - - if (dim_num == 0) { - return true; - } - - for (size_t i = 0; i < dim_num; i++) { - int64_t dim_value = output_shape[0][i]; - for (size_t j = 1; j < num; j++) { - int64_t tmp_dim_value = output_shape[j][i]; - if 
(dim_value != tmp_dim_value) { - GELOGE(FAILED, "CheckDims failed: dim_value not equal, dim_index=%zu, dim_value_0:%ld, dim_value_%zu:%ld.", i, - dim_value, j, tmp_dim_value); - return false; - } - } - } return true; } @@ -468,8 +444,8 @@ bool MultiBatchPass::CheckDims(const std::vector> &output_s /// NodePtr MultiBatchPass::CreateSwitchCaseNode(const ComputeGraphPtr &graph, const std::string &name, const OutDataAnchorPtr &pred_value, - const vector> &batch_shape, - const vector> &combined_batch) { + const std::vector> &batch_shape, + const std::vector> &combined_batch) { OpDescPtr op_desc = MakeShared(name, STREAMSWITCHN); if (op_desc == nullptr) { GELOGE(FAILED, "Create op_desc failed, StreamSwitchN:%s.", name.c_str()); @@ -512,7 +488,7 @@ NodePtr MultiBatchPass::CreateSwitchCaseNode(const ComputeGraphPtr &graph, const GELOGE(FAILED, "set attr ATTR_NAME_PRED_VALUE failed, StreamSwitchN:%s.", name.c_str()); return nullptr; } - const string &attr_combined_batch = ATTR_NAME_COMBINED_BATCH + "_" + std::to_string(i); + const std::string &attr_combined_batch = ATTR_NAME_COMBINED_BATCH + "_" + std::to_string(i); if (!AttrUtils::SetListInt(op_desc, attr_combined_batch, combined_batch[i])) { GELOGE(FAILED, "set attr ATTR_NAME_COMBINED_BATCH failed, StreamSwitchN:%s.", name.c_str()); return nullptr; diff --git a/ge/graph/passes/pass_utils.cc b/ge/graph/passes/pass_utils.cc index 5359ff63..3adfbde3 100644 --- a/ge/graph/passes/pass_utils.cc +++ b/ge/graph/passes/pass_utils.cc @@ -37,10 +37,6 @@ #include "graph/utils/type_utils.h" namespace ge { -namespace { -const uint32_t kShapeDimSize = 1; -const uint32_t DIM_SIZE_TWO = 2; -} // namespace Status PassUtils::ConstructTensorDescWithData(const GeTensorDesc &out_desc, std::vector &data, std::vector &v_output, const bool scalar_output) { diff --git a/ge/graph/passes/subgraph_pass.cc b/ge/graph/passes/subgraph_pass.cc index 88e661a7..d1111d52 100755 --- a/ge/graph/passes/subgraph_pass.cc +++ b/ge/graph/passes/subgraph_pass.cc @@ 
-149,10 +149,10 @@ Status SubgraphPass::SubgraphOutputNode(const ComputeGraphPtr &graph, const Node // 5. While->NetOutput in known subgraph std::string op_type; bool insert_flag = NodeUtils::GetConstOpType(in_node, op_type) || - IsAtomicRequired(in_node, peer_out_anchor->GetIdx()) || IsOutputContinuesRequired(in_node) || - ((in_node->GetType() == DATA) && (kWhileOpTypes.count(graph->GetParentNode()->GetType()) == 0)) || - (!graph->GetGraphUnknownFlag() && NodeUtils::IsDynamicShape(node) && - (kWhileOpTypes.count(in_node->GetType()) != 0)); + IsAtomicRequired(in_node, peer_out_anchor->GetIdx()) || IsOutputContinuesRequired(in_node) || + ((in_node->GetType() == DATA) && (kWhileOpTypes.count(graph->GetParentNode()->GetType()) == 0)) || + (!graph->GetGraphUnknownFlag() && NodeUtils::IsDynamicShape(node) && + (kWhileOpTypes.count(in_node->GetType()) != 0)); if (insert_flag) { GELOGD("Insert MemcpyAsync node between %s and %s.", in_node->GetName().c_str(), node->GetName().c_str()); std::string name = node->GetName() + "_input_" + std::to_string(in_data_anchor->GetIdx()) + "_Memcpy"; diff --git a/ge/graph/passes/switch_to_stream_switch_pass.cc b/ge/graph/passes/switch_to_stream_switch_pass.cc index 529480a6..f75a104f 100644 --- a/ge/graph/passes/switch_to_stream_switch_pass.cc +++ b/ge/graph/passes/switch_to_stream_switch_pass.cc @@ -72,25 +72,26 @@ Status SwitchToStreamSwitchPass::CheckCycleDependence(const ComputeGraphPtr &gra std::unordered_map> cond_switch_map; for (const NodePtr &node : graph->GetDirectNode()) { GE_CHK_STATUS_RET(GetOriginalType(node, type), "Get node type failed."); - if ((type == SWITCH) || (type == REFSWITCH)) { - InDataAnchorPtr in_cond_anchor = node->GetInDataAnchor(SWITCH_PRED_INPUT); - GE_CHECK_NOTNULL(in_cond_anchor); - OutDataAnchorPtr peer_out_anchor = in_cond_anchor->GetPeerOutAnchor(); - GE_CHECK_NOTNULL(peer_out_anchor); - if (FindSwitchCondInput(true, peer_out_anchor) != SUCCESS) { - GELOGE(FAILED, "Find pred_input for switch_node %s 
failed.", node->GetName().c_str()); - return FAILED; - } + if ((type != SWITCH) && (type != REFSWITCH)) { + continue; + } + InDataAnchorPtr in_cond_anchor = node->GetInDataAnchor(SWITCH_PRED_INPUT); + GE_CHECK_NOTNULL(in_cond_anchor); + OutDataAnchorPtr peer_out_anchor = in_cond_anchor->GetPeerOutAnchor(); + GE_CHECK_NOTNULL(peer_out_anchor); + if (FindSwitchCondInput(peer_out_anchor) != SUCCESS) { + GELOGE(FAILED, "Find pred_input for switch_node %s failed.", node->GetName().c_str()); + return FAILED; + } - NodePtr cond_node = peer_out_anchor->GetOwnerNode(); - auto iter = cond_switch_map.find(cond_node); - if (iter == cond_switch_map.end()) { - cond_switch_map[cond_node] = { node }; - } else { - iter->second.emplace_back(node); - } - switch_nodes_.emplace_back(node); + NodePtr cond_node = peer_out_anchor->GetOwnerNode(); + auto iter = cond_switch_map.find(cond_node); + if (iter == cond_switch_map.end()) { + cond_switch_map[cond_node] = { node }; + } else { + iter->second.emplace_back(node); } + switch_nodes_.emplace_back(node); } MarkCycleDependence(cond_switch_map); @@ -241,10 +242,6 @@ Status SwitchToStreamSwitchPass::BypassSwitchNode(const NodePtr &switch_node, Ou if (idx == SWITCH_DATA_INPUT) { peer_data_anchor = peer_out_anchor; } else { - if (FindSwitchCondInput(false, peer_out_anchor) != SUCCESS) { - GELOGE(FAILED, "Find pred_input for switch_node %s failed.", switch_node->GetName().c_str()); - return FAILED; - } peer_cond_anchor = peer_out_anchor; } } @@ -254,15 +251,14 @@ Status SwitchToStreamSwitchPass::BypassSwitchNode(const NodePtr &switch_node, Ou /// /// @brief Find Switch cond input -/// @param [in] pass_switch_flag /// @param [out] peer_cond_anchor /// @return Status /// -Status SwitchToStreamSwitchPass::FindSwitchCondInput(bool pass_switch_flag, OutDataAnchorPtr &peer_cond_anchor) { +Status SwitchToStreamSwitchPass::FindSwitchCondInput(OutDataAnchorPtr &peer_cond_anchor) { NodePtr tmp_node = nullptr; - string type; - bool need_pass_type = true; - 
while (need_pass_type) { + std::string type; + bool pass_flag = true; + while (pass_flag) { if (tmp_node == nullptr) { tmp_node = peer_cond_anchor->GetOwnerNode(); } else { @@ -274,7 +270,7 @@ Status SwitchToStreamSwitchPass::FindSwitchCondInput(bool pass_switch_flag, OutD } GE_CHK_STATUS_RET(GetOriginalType(tmp_node, type), "Get node type failed."); - need_pass_type = (pass_switch_flag && ((type == SWITCH) || (type == REFSWITCH))); + pass_flag = ((type == SWITCH) || (type == REFSWITCH)); } return SUCCESS; @@ -369,7 +365,7 @@ Status SwitchToStreamSwitchPass::MarkBranches(const OutDataAnchorPtr &peer_cond_ } } else { int64_t switch_group_id = GetGroupId(stream_switch); - map>> switch_group_map; + std::map>> switch_group_map; std::list false_node_list; std::list true_node_list; std::list &node_list = true_branch_flag ? true_node_list : false_node_list; @@ -389,7 +385,7 @@ Status SwitchToStreamSwitchPass::MarkBranches(const OutDataAnchorPtr &peer_cond_ /// @return group_id /// int64_t SwitchToStreamSwitchPass::GetGroupId(const NodePtr &node) { - string tailing_optimization_option; + std::string tailing_optimization_option; bool is_tailing_optimization = false; if (GetContext().GetOption(OPTION_EXEC_ENABLE_TAILING_OPTIMIZATION, tailing_optimization_option) == GRAPH_SUCCESS) { // "1" means it's True from frontend option @@ -400,7 +396,7 @@ int64_t SwitchToStreamSwitchPass::GetGroupId(const NodePtr &node) { return 0; } - string hccl_group_id; + std::string hccl_group_id; if (!AttrUtils::GetStr(node->GetOpDesc(), ATTR_NAME_HCCL_FUSED_GROUP, hccl_group_id)) { GELOGI("Node %s can not find hccl group id.", node->GetName().c_str()); return 0; @@ -432,6 +428,7 @@ Status SwitchToStreamSwitchPass::CombineSwitchNode(const ComputeGraphPtr &graph) same_cond_switch.insert(true_switch_list.begin(), true_switch_list.end()); OutDataAnchorPtr peer_cond_anchor = iter->first; + GE_CHECK_NOTNULL(peer_cond_anchor); NodePtr cond_node = peer_cond_anchor->GetOwnerNode(); 
GELOGI("CombineSwitchNode: cond_node=%s.", cond_node->GetName().c_str()); @@ -549,6 +546,7 @@ NodePtr SwitchToStreamSwitchPass::CreateCastOp(const ComputeGraphPtr &graph, con NodePtr cast_node = graph->AddNode(cast_desc); GE_CHK_BOOL_EXEC(cast_node != nullptr, return nullptr, "Create cast_node failed."); + // Cast node has and only has one input GE_CHK_STATUS(GraphUtils::AddEdge(peer_cond_anchor, cast_node->GetInDataAnchor(0)), "Cast add data edge failed."); return cast_node; @@ -614,24 +612,24 @@ Status SwitchToStreamSwitchPass::ModifySwitchInCtlEdges(const NodePtr &switch_no return INTERNAL_ERROR; } - for (const NodePtr &in_ctl_node : switch_node->GetInControlNodes()) { - GE_CHK_STATUS(GraphUtils::RemoveEdge(in_ctl_node->GetOutControlAnchor(), switch_node->GetInControlAnchor()), + for (const NodePtr &in_ctrl_node : switch_node->GetInControlNodes()) { + GE_CHK_STATUS(GraphUtils::RemoveEdge(in_ctrl_node->GetOutControlAnchor(), switch_node->GetInControlAnchor()), "Remove ctl edge failed."); - GE_IF_BOOL_EXEC(!in_ctl_node->GetOutControlAnchor()->IsLinkedWith(cast_node->GetInControlAnchor()), { - GE_CHK_STATUS(GraphUtils::AddEdge(in_ctl_node->GetOutControlAnchor(), cast_node->GetInControlAnchor()), + GE_IF_BOOL_EXEC(!in_ctrl_node->GetOutControlAnchor()->IsLinkedWith(cast_node->GetInControlAnchor()), { + GE_CHK_STATUS(GraphUtils::AddEdge(in_ctrl_node->GetOutControlAnchor(), cast_node->GetInControlAnchor()), "Add ctl edge failed."); }); - GE_IF_BOOL_EXEC(in_ctl_node->GetType() != STREAMSWITCH, continue); - if (same_cond_switch.count(in_ctl_node) > 0) { - GE_CHK_STATUS(GraphUtils::RemoveEdge(in_ctl_node->GetOutControlAnchor(), cast_node->GetInControlAnchor()), + GE_IF_BOOL_EXEC(in_ctrl_node->GetType() != STREAMSWITCH, continue); + if (same_cond_switch.count(in_ctrl_node) > 0) { + GE_CHK_STATUS(GraphUtils::RemoveEdge(in_ctrl_node->GetOutControlAnchor(), cast_node->GetInControlAnchor()), "Remove ctl edge failed."); continue; } - auto find_res1 = 
switch_node_map_.find(in_ctl_node); + auto find_res1 = switch_node_map_.find(in_ctrl_node); GE_IF_BOOL_EXEC(find_res1 == switch_node_map_.end(), { - GELOGE(INTERNAL_ERROR, "StreamSwitch node %s not found in switch_node_map_.", in_ctl_node->GetName().c_str()); + GELOGE(INTERNAL_ERROR, "StreamSwitch node %s not found in switch_node_map_.", in_ctrl_node->GetName().c_str()); return INTERNAL_ERROR; }); auto find_res2 = find_res1->second.find(orig_switch_name); diff --git a/ge/graph/passes/switch_to_stream_switch_pass.h b/ge/graph/passes/switch_to_stream_switch_pass.h index 48725230..05628871 100644 --- a/ge/graph/passes/switch_to_stream_switch_pass.h +++ b/ge/graph/passes/switch_to_stream_switch_pass.h @@ -42,9 +42,9 @@ namespace ge { +-----------+ +-----------+ | Const | | VariableV2| +-----------+ +-----------+ -*/ -/* Switch branch op optimize, Switches in same case merge to one StreamSwitch, update following nodes' input + + Switch branch op optimize, Switches in same case merge to one StreamSwitch, update following nodes' input +-----------+ / | task2 | \ @@ -131,11 +131,10 @@ class SwitchToStreamSwitchPass : public GraphPass { /// /// @brief Find Switch cond input - /// @param [in] pass_switch_flag /// @param [out] peer_cond_anchor /// @return Status /// - Status FindSwitchCondInput(bool pass_switch_flag, OutDataAnchorPtr &peer_cond_anchor); + Status FindSwitchCondInput(OutDataAnchorPtr &peer_cond_anchor); /// /// @brief Create StreamSwitch Node diff --git a/ge/graph/passes/transop_breadth_fusion_pass.cc b/ge/graph/passes/transop_breadth_fusion_pass.cc index 689510f0..654c3822 100644 --- a/ge/graph/passes/transop_breadth_fusion_pass.cc +++ b/ge/graph/passes/transop_breadth_fusion_pass.cc @@ -70,8 +70,10 @@ std::string TransOpBreadthFusionPass::GetNodeId(const int anchor_index, const No trans_data_type = true; trans_format = true; trans_shape = true; - } else if (node->GetType() == RESHAPE) { + } else if (node->GetType() == RESHAPE || node->GetType() == EXPANDDIMS 
|| node->GetType() == SQUEEZE) { trans_shape = true; + } else if (node->GetType() == REFORMAT) { + trans_format = true; } id << node->GetType() << '-' << anchor_index; diff --git a/ge/graph/passes/transop_without_reshape_fusion_pass.cc b/ge/graph/passes/transop_without_reshape_fusion_pass.cc index d2b3f1b1..6bea9edc 100644 --- a/ge/graph/passes/transop_without_reshape_fusion_pass.cc +++ b/ge/graph/passes/transop_without_reshape_fusion_pass.cc @@ -63,7 +63,7 @@ void TransOpWithoutReshapeFusionPass::SetRemainNode( continue; } GELOGI("SetRemainNode node is %s", op_desc->GetName().c_str()); - GE_IF_BOOL_EXEC(!op_desc->SetExtAttr(kRemainNode, true), GELOGE(INTERNAL_ERROR, "set ext attr failed"); return ); + GE_IF_BOOL_EXEC(!op_desc->SetExtAttr(kRemainNode, true), GELOGE(INTERNAL_ERROR, "set ext attr failed"); return); } } @@ -594,7 +594,7 @@ void TransOpWithoutReshapeFusionPass::GetBeginOutDescAndEndInDesc(const int inde auto out_owner_node = out_peer_anchor->GetOwnerNode(); GE_CHECK_NOTNULL_JUST_RETURN(out_owner_node); auto out_peer_op_desc = out_owner_node->GetOpDesc(); - GE_IF_BOOL_EXEC(out_peer_op_desc == nullptr, GELOGE(INTERNAL_ERROR, "out_peer_op_desc is nullptr"); return ); + GE_IF_BOOL_EXEC(out_peer_op_desc == nullptr, GELOGE(INTERNAL_ERROR, "out_peer_op_desc is nullptr"); return); out_desc = out_peer_op_desc->GetInputDesc(out_peer_anchor->GetIdx()); auto in_peer_anchor = nodes_anchor.back().first; @@ -602,7 +602,7 @@ void TransOpWithoutReshapeFusionPass::GetBeginOutDescAndEndInDesc(const int inde auto in_owner_node = in_peer_anchor->GetOwnerNode(); GE_CHECK_NOTNULL_JUST_RETURN(in_owner_node); auto in_peer_op_desc = in_owner_node->GetOpDesc(); - GE_IF_BOOL_EXEC(in_peer_op_desc == nullptr, GELOGE(INTERNAL_ERROR, "in_peer_op_desc is nullptr"); return ); + GE_IF_BOOL_EXEC(in_peer_op_desc == nullptr, GELOGE(INTERNAL_ERROR, "in_peer_op_desc is nullptr"); return); in_desc = in_peer_op_desc->GetOutputDesc(in_peer_anchor->GetIdx()); } @@ -734,10 +734,14 @@ void 
TransOpWithoutReshapeFusionPass::RemoveNousedNodes(const ComputeGraphPtr &g continue; } - GE_IF_BOOL_EXEC(!op_desc->SetExtAttr(kRemainNode, true), GELOGE(INTERNAL_ERROR, "set ext attr failed"); return ); + GE_IF_BOOL_EXEC(!op_desc->SetExtAttr(kRemainNode, true), GELOGE(INTERNAL_ERROR, "set ext attr failed"); return); GELOGI("remove node:%s", node->GetName().c_str()); - if (graph->RemoveNode(node) != GRAPH_SUCCESS) { - GELOGW("remove node failed!node:%s", node->GetName().c_str()); + if (GraphUtils::IsolateNode(node, {0}) != GRAPH_SUCCESS) { + GELOGW("Isolate node: %s failed.", node->GetName().c_str()); + continue; + } + if (GraphUtils::RemoveNodeWithoutRelink(graph, node) != GRAPH_SUCCESS) { + GELOGW("Remove node: %s failed.", node->GetName().c_str()); continue; } } diff --git a/ge/graph/passes/transpose_transdata_pass.cc b/ge/graph/passes/transpose_transdata_pass.cc index 7348f143..2178eac7 100644 --- a/ge/graph/passes/transpose_transdata_pass.cc +++ b/ge/graph/passes/transpose_transdata_pass.cc @@ -217,11 +217,11 @@ void TransposeTransDataPass::CopyInputEdges(NodePtr &origin_node, NodePtr &new_n } OutDataAnchorPtr out_anchor = origin_node->GetInDataAnchor(0)->GetPeerOutAnchor(); new_in_data_anchor->UnlinkAll(); - GE_IF_BOOL_EXEC(new_in_data_anchor->LinkFrom(out_anchor) != GRAPH_SUCCESS, GELOGW("Link failed"); return ); + GE_IF_BOOL_EXEC(new_in_data_anchor->LinkFrom(out_anchor) != GRAPH_SUCCESS, GELOGW("Link failed"); return); // control anchor only link to control anchor GE_IF_BOOL_EXEC( - GraphUtils::CopyInCtrlEdges(origin_node, new_node) != GRAPH_SUCCESS, GELOGW("Copy in ctrl edges failed"); return ); + GraphUtils::CopyInCtrlEdges(origin_node, new_node) != GRAPH_SUCCESS, GELOGW("Copy in ctrl edges failed"); return); } bool TransposeTransDataPass::TransDataCheckAccuracySupported(const OpDescPtr &op_desc) { diff --git a/ge/graph/passes/variable_op_pass_bak.cc b/ge/graph/passes/variable_op_pass_bak.cc index 3e40e686..c9218296 100644 --- 
a/ge/graph/passes/variable_op_pass_bak.cc +++ b/ge/graph/passes/variable_op_pass_bak.cc @@ -252,7 +252,6 @@ Status VariableOpPass::RenewTransRoadDesc(const NodePtr &var, VarTransRoad &fusi // case 2: suppose input format of transdata not equal with out format // and input format not equal with var // so we make input format equal with var - for (auto &cur_trans : fusion_road) { if (cur_trans.input.GetFormat() == cur_trans.output.GetFormat()) { cur_trans.output.SetFormat(prev_node_info.output.GetFormat()); @@ -319,8 +318,8 @@ Status VariableOpPass::FusionIfNeed(const NodePtr &var, VarTransRoad &fusion_roa } Status VariableOpPass::UpdateTransRoad(VarTransRoad &fusion_road, vector &first_path_trans_order, - map> &trans_type_to_changed_desc, - map> &trans_type_to_trans_ops){ + map> &trans_type_to_changed_desc, + map> &trans_type_to_trans_ops){ vector delete_trans_type; for (auto &trans_type : first_path_trans_order) { if (trans_type_to_changed_desc.find(trans_type) == trans_type_to_changed_desc.end()) { diff --git a/ge/graph/passes/variable_op_pass_bak.h b/ge/graph/passes/variable_op_pass_bak.h index b9fbb90e..fccd063b 100644 --- a/ge/graph/passes/variable_op_pass_bak.h +++ b/ge/graph/passes/variable_op_pass_bak.h @@ -45,8 +45,8 @@ class VariableOpPass : public GraphPass { private: Status UpdateTransRoad(VarTransRoad &fusion_road, vector &trans_road_order, - map> &trans_type_to_changed_desc, - map> &trans_type_to_trans_ops); + map> &trans_type_to_changed_desc, + map> &trans_type_to_trans_ops); Status DealFusion(const ge::NodePtr &var_node, VarTransRoad &fusion_road, map> trans_type_to_changed_desc, diff --git a/ge/graph/preprocess/graph_preprocess.cc b/ge/graph/preprocess/graph_preprocess.cc index b899ee83..2ee5e330 100644 --- a/ge/graph/preprocess/graph_preprocess.cc +++ b/ge/graph/preprocess/graph_preprocess.cc @@ -1621,7 +1621,8 @@ Status GraphPrepare::CheckUserInput(const std::vector &user_input) { for (size_t i = 0; i < desc.GetShape().GetDimNum(); ++i) { if 
(desc.GetShape().GetDim(i) < 0) { - std::string situation = "data dim[" + std::to_string(i) + "][" + std::to_string(desc.GetShape().GetDim(i)) + "]" ; + std::string situation = "data dim[" + std::to_string(i) + "][" + + std::to_string(desc.GetShape().GetDim(i)) + "]" ; std::string reason = "it need >= 0"; ErrorManager::GetInstance().ATCReportErrMessage("E19025", {"situation", "reason"}, {situation, reason}); GELOGE(GE_GRAPH_INIT_FAILED, "data dim %zu is not supported, need >= 0, real:%ld.", i, @@ -1701,7 +1702,7 @@ Status GraphPrepare::PrepareOptimize() { try { (void)original_graph_passes.AddPass("PrepareOptimize::ShapeOperateOpRemovePass", new ShapeOperateOpRemovePass); (void)original_graph_passes.AddPass("PrepareOptimize::ReplaceTransShapePass", new ReplaceTransShapePass); - (void)original_graph_passes.AddPass("PrepareOptimize::MarkAgnosticPass" , new MarkAgnosticPass); + (void)original_graph_passes.AddPass("PrepareOptimize::MarkAgnosticPass", new MarkAgnosticPass); } catch (std::bad_alloc &e) { GELOGE(INTERNAL_ERROR, "Add pass failed, bad memory allocation occurs."); return INTERNAL_ERROR; @@ -1796,6 +1797,16 @@ Status GraphPrepare::PrepareOptimize() { } void GraphPrepare::TypeConversionOfConstant() { + bool is_acl_compile = false; + for (ge::NodePtr &n : compute_graph_->GetAllNodes()) { + // This can ensure that n is not a null pointer + // No Conversion when called by aclOpCompile + (void)AttrUtils::GetBool(n->GetOpDesc(), ATTR_DYNAMIC_SHAPE_SINGLE_AICPU, is_acl_compile); + if (is_acl_compile) { + return; + } + } + if (options_.train_graph_flag) { GELOGD("trans CONSTANT to CONSTANTOP in train."); for (ge::NodePtr &n : compute_graph_->GetAllNodes()) { diff --git a/ge/graph/preprocess/insert_op/ge_aipp_op.cc b/ge/graph/preprocess/insert_op/ge_aipp_op.cc index 98712a82..7c8d9073 100755 --- a/ge/graph/preprocess/insert_op/ge_aipp_op.cc +++ b/ge/graph/preprocess/insert_op/ge_aipp_op.cc @@ -408,7 +408,7 @@ Status AippOp::ConvertRelatedInputNameToRank() { 
GE_CHECK_NOTNULL(aipp_params_); string related_input_name = aipp_params_->related_input_name(); - if(related_input_name.empty()) { + if (related_input_name.empty()) { return SUCCESS; } diff --git a/ge/graph/preprocess/insert_op/util_insert_aipp_op.cc b/ge/graph/preprocess/insert_op/util_insert_aipp_op.cc index 1b926e4b..3b37003f 100755 --- a/ge/graph/preprocess/insert_op/util_insert_aipp_op.cc +++ b/ge/graph/preprocess/insert_op/util_insert_aipp_op.cc @@ -470,7 +470,7 @@ Status InsertNewOpUtil::UpdateDataBySwitchN(const NodePtr &switchn, const NodePt } } if (max_index >= switchn->GetOpDesc()->GetOutputsSize()) { - string error_msg = "No max size found from switchn node[" + switchn->GetName()+ "]"; + string error_msg = "No max size found from switchn node[" + switchn->GetName() + "]"; GE_ERRORLOG_AND_ERRORMSG(INTERNAL_ERROR, error_msg.c_str()); return INTERNAL_ERROR; } diff --git a/ge/host_kernels/concat_v2_kernel.cc b/ge/host_kernels/concat_v2_kernel.cc index a9f0da81..234d8c8a 100644 --- a/ge/host_kernels/concat_v2_kernel.cc +++ b/ge/host_kernels/concat_v2_kernel.cc @@ -120,7 +120,7 @@ Status ConcatV2Kernel::ConcatV2PreCompute(const std::vector &i int &tidx, ConstGeTensorPtr &tensor) { size_t input_size = input.size(); - // N >= 2 and N + 1 >= 3 + // N + 1 is greater than or equal to 3 if (input_size < kConcatV2InputNum) { GELOGI("The number of input for ConcatV2 must not be less than %zu.", kConcatV2InputNum); return NOT_CHANGED; diff --git a/ge/host_kernels/floordiv_kernel.cc b/ge/host_kernels/floordiv_kernel.cc index e254af09..df381212 100644 --- a/ge/host_kernels/floordiv_kernel.cc +++ b/ge/host_kernels/floordiv_kernel.cc @@ -112,8 +112,8 @@ void FloorDivKernel::ShapeCal(const std::vector &input, Ge template T FloorDivKernel::DivCal(const T &x_i, const T &y_i) { if ((x_i < static_cast(0)) != (y_i < static_cast(0))) { - T abs_x_i = std::abs(x_i); - T abs_y_i = std::abs(y_i); + T abs_x_i = x_i < 0 ? -x_i : x_i; + T abs_y_i = y_i < 0 ? 
-y_i : y_i; return static_cast(static_cast(-(abs_x_i + abs_y_i - 1) / abs_y_i)); } else { return static_cast(static_cast(x_i / y_i)); diff --git a/ge/host_kernels/floordiv_kernel.h b/ge/host_kernels/floordiv_kernel.h index d3dc3ff7..b8f6dd12 100755 --- a/ge/host_kernels/floordiv_kernel.h +++ b/ge/host_kernels/floordiv_kernel.h @@ -40,10 +40,6 @@ class FloorDivKernel : public Kernel { template Status DataCal(const std::vector &input, ge::GeTensorPtr output_ptr); Status ComputeByDataType(DataType data_type, const std::vector &input, GeTensorPtr output_ptr); - - int64_t axis_dim_; - int64_t head_dim_; - int64_t end_dim_; }; } // namespace ge diff --git a/ge/host_kernels/gather_v2_kernel.cc b/ge/host_kernels/gather_v2_kernel.cc index e52b4534..ee73626b 100644 --- a/ge/host_kernels/gather_v2_kernel.cc +++ b/ge/host_kernels/gather_v2_kernel.cc @@ -40,6 +40,10 @@ const size_t kGatherV2InpotNum = 3; const size_t kMaxIndicatesDims = 1; // only support scalar and 1 dims indicates_ const std::set supported_type = {DT_FLOAT16, DT_DOUBLE, DT_INT8, DT_INT16, DT_INT16, DT_INT32, DT_INT64, DT_UINT8, DT_UINT16, DT_UINT32, DT_UINT64}; +const int64_t DIM_AXIS_0 = 0; +const int64_t DIM_AXIS_1 = 1; +const int64_t DIM_AXIS_2 = 2; +const int64_t DIM_AXIS_3 = 3; } // namespace template Status GatherV2Kernel::ProcessAxis0(ConstGeTensorPtr tensor_x, GeTensorPtr output) { @@ -191,16 +195,16 @@ Status GatherV2Kernel::GenData(const int64_t data_num, ConstGeTensorPtr tensor_x Status ret = SUCCESS; switch (axis) { - case 0: + case DIM_AXIS_0: ret = ProcessAxis0(tensor_x, output); break; - case 1: + case DIM_AXIS_1: ret = ProcessAxis1(tensor_x, output); break; - case 2: + case DIM_AXIS_2: ret = ProcessAxis2(tensor_x, output); break; - case 3: + case DIM_AXIS_3: ret = ProcessAxis3(tensor_x, output); break; default: diff --git a/ge/host_kernels/range_kernel.cc b/ge/host_kernels/range_kernel.cc index 32a72b47..97254fff 100644 --- a/ge/host_kernels/range_kernel.cc +++ 
b/ge/host_kernels/range_kernel.cc @@ -32,6 +32,9 @@ namespace ge { namespace { constexpr size_t kRangeInputNum = 3; constexpr uint32_t kRangeDimNum = 0; +constexpr size_t kStartIndex = 0; +constexpr size_t kLimitIndex = 1; +constexpr size_t kDeltaIndex = 2; const std::set kRangeSupportedType = {DT_INT32, DT_FLOAT}; } // namespace @@ -53,9 +56,9 @@ Status RangeKernel::Compute(const OpDescPtr op_desc_ptr, const std::vectorGetTensorDesc().GetDataType(); if (data_type == DT_FLOAT) { if (GetRange(*reinterpret_cast(start->GetData().data()), diff --git a/ge/host_kernels/ssd_prior_box_kernel.cc b/ge/host_kernels/ssd_prior_box_kernel.cc index b3a0fc3e..3661fa9d 100644 --- a/ge/host_kernels/ssd_prior_box_kernel.cc +++ b/ge/host_kernels/ssd_prior_box_kernel.cc @@ -180,14 +180,18 @@ Status SsdPriorboxKernel::SetVariance(const vector &variance, const int d return SUCCESS; } -Status SsdPriorboxKernel::GetNumPriorAndDimSize(uint aspect_ratios_size, uint min_sizes_size, uint max_sizes_size, - int layer_width, int layer_height, int &num_priors, +Status SsdPriorboxKernel::GetNumPriorAndDimSize(uint32_t aspect_ratios_size, + uint32_t min_sizes_size, + uint32_t max_sizes_size, + int layer_width, + int layer_height, + int &num_priors, int &dim_size) const { if (ge::CheckUint32MulOverflow(min_sizes_size, aspect_ratios_size) != SUCCESS) { return PARAM_INVALID; } - uint tmp_value = aspect_ratios_size * min_sizes_size; + uint32_t tmp_value = aspect_ratios_size * min_sizes_size; if (ge::CheckUint32AddOverflow(tmp_value, max_sizes_size) != SUCCESS) { GELOGW("Failed to get list param."); return PARAM_INVALID; @@ -199,7 +203,7 @@ Status SsdPriorboxKernel::GetNumPriorAndDimSize(uint aspect_ratios_size, uint mi return PARAM_INVALID; } num_priors = static_cast(tmp_value); - + if (ge::CheckIntMulOverflow(layer_width, layer_height) != SUCCESS) { GELOGW("Failed to get list param."); return PARAM_INVALID; @@ -288,7 +292,7 @@ std::unique_ptr SsdPriorboxKernel::BoundaryCalulate(int dim_size, int l } } 
- return std::move(output_data); + return output_data; } Status SsdPriorboxKernel::Compute(const NodePtr &node, std::vector &v_output) { diff --git a/ge/host_kernels/ssd_prior_box_kernel.h b/ge/host_kernels/ssd_prior_box_kernel.h index 0ebf221d..c08217e2 100755 --- a/ge/host_kernels/ssd_prior_box_kernel.h +++ b/ge/host_kernels/ssd_prior_box_kernel.h @@ -100,8 +100,8 @@ class SsdPriorboxKernel : public Kernel { * @return OTHERS: Execution failed * @author */ - Status GetNumPriorAndDimSize(uint aspect_ratios_size, uint min_sizes_size, uint max_sizes_size, int layer_width, - int layer_height, int &num_priors, int &dim_size) const; + Status GetNumPriorAndDimSize(uint32_t aspect_ratios_size, uint32_t min_sizes_size, uint32_t max_sizes_size, + int layer_width, int layer_height, int &num_priors, int &dim_size) const; void DataCalulate(float x, float y, float box_x, float box_y, int img_x, int img_y, vector &result); std::unique_ptr BoundaryCalulate(int dim_size, int layer_width, int layer_height, float step_width, float step_height, int img_width, int img_height, float offset, diff --git a/ge/host_kernels/strided_slice_kernel.cc b/ge/host_kernels/strided_slice_kernel.cc index 2fe74415..b1bfb10a 100644 --- a/ge/host_kernels/strided_slice_kernel.cc +++ b/ge/host_kernels/strided_slice_kernel.cc @@ -272,6 +272,10 @@ Status StridedSliceKernel::InitParamWithAttrs(const std::vector &x_dims) { auto begin_data_type_size = GetSizeByDataType(begin_tensor->GetTensorDesc().GetDataType()); + if (begin_data_type_size == 0) { + GELOGW("Param begin_data_type_size should not be zero."); + return; + } size_t begin_vec_size = begin_tensor->GetData().size() / begin_data_type_size; auto final_dim_num = x_dims_num < begin_vec_size ? 
begin_vec_size : x_dims_num; for (size_t i = 0; i < final_dim_num; i++) { @@ -284,8 +288,10 @@ void StridedSliceKernel::ExpandDimsWithNewAxis(const ConstGeTensorPtr &begin_ten } void StridedSliceKernel::ExpandStrideWithEllipsisMask(const size_t x_dims_num, - const vector &x_dims, vector &orig_begin_vec, - vector &orig_end_vec, vector &orig_stride_vec) { + const vector &x_dims, + vector &orig_begin_vec, + vector &orig_end_vec, + vector &orig_stride_vec) { if (attr_value_map_.at(STRIDE_SLICE_ATTR_ELLIPSIS_MASK) != 0) { auto end_mask = attr_value_map_.at(STRIDE_SLICE_ATTR_END_MASK); @@ -308,7 +314,7 @@ void StridedSliceKernel::ExpandStrideWithEllipsisMask(const size_t x_dims_num, if (orig_begin_vec.size() < x_dims_num) { for (size_t j = 1; j < (x_dims_num - orig_begin_vec.size() + 1); ++j) { orig_begin_vec.insert((orig_begin_vec.begin() + ellipsis_dim + j), 0); - orig_end_vec.insert((orig_end_vec.begin() + ellipsis_dim + j), x_dims.at(ellipsis_dim +j)); + orig_end_vec.insert((orig_end_vec.begin() + ellipsis_dim + j), x_dims.at(ellipsis_dim + j)); orig_stride_vec.insert((orig_stride_vec.begin() + ellipsis_dim + j), 1); } } diff --git a/ge/hybrid/common/npu_memory_allocator.cc b/ge/hybrid/common/npu_memory_allocator.cc index f506caec..2c38367a 100644 --- a/ge/hybrid/common/npu_memory_allocator.cc +++ b/ge/hybrid/common/npu_memory_allocator.cc @@ -23,6 +23,8 @@ namespace ge { namespace hybrid { +const size_t kPaddingUnit = 2; + size_t kMaxHbmMemorySize = 1024UL * 1024UL * 1024UL * 1024UL; // 1024G std::map> NpuMemoryAllocator::allocators_; @@ -77,7 +79,7 @@ void *NpuMemoryAllocator::Allocate(std::size_t size, AllocationAttr *attr) { } } // padding up to multiple of padding, and add extra padding - allocate_size = (size + 2 * padding - 1) / padding * padding; + allocate_size = (size + kPaddingUnit * padding - 1) / padding * padding; GELOGD("Padding size %ld by %d. 
final size = %zu.", size, padding, allocate_size); buffer = MemManager::Instance() .CachingInstance(RT_MEMORY_HBM) diff --git a/ge/hybrid/executor/hybrid_execution_context.h b/ge/hybrid/executor/hybrid_execution_context.h index 0910d2c7..1fe40c77 100644 --- a/ge/hybrid/executor/hybrid_execution_context.h +++ b/ge/hybrid/executor/hybrid_execution_context.h @@ -57,7 +57,8 @@ struct GraphExecutionContext { do { \ if ((context != nullptr) && (context)->profiler != nullptr) { \ if (node_name != nullptr) { \ - context->profiler->RecordEvent(evt_type, "tid:%lu [%s] [%s] " fmt, GeLog::GetTid(), node_name, category, ##__VA_ARGS__);\ + context->profiler->RecordEvent(evt_type, "tid:%lu [%s] [%s] " fmt, GeLog::GetTid(), node_name, category, \ + ##__VA_ARGS__); \ } else { \ context->profiler->RecordEvent(evt_type, "tid:%lu [%s] " fmt, GeLog::GetTid(), category, ##__VA_ARGS__); \ }\ @@ -77,7 +78,7 @@ do { \ RECORD_PROFILING_EVENT((context), HybridProfiler::EXECUTION, fmt, "Execution", name, ##__VA_ARGS__) #define RECORD_CALLBACK_EVENT(context, name, fmt, ...) 
\ - RECORD_PROFILING_EVENT((context), HybridProfiler::CALLBACK, fmt, "Callback", name, ##__VA_ARGS__) + RECORD_PROFILING_EVENT((context), HybridProfiler::CALLBACKS, fmt, "Callback", name, ##__VA_ARGS__) } // namespace hybrid } // namespace ge #endif // GE_HYBRID_EXECUTOR_HYBRID_EXECUTION_CONTEXT_H_ diff --git a/ge/hybrid/executor/hybrid_model_async_executor.cc b/ge/hybrid/executor/hybrid_model_async_executor.cc index 91996ab3..ba717a2d 100644 --- a/ge/hybrid/executor/hybrid_model_async_executor.cc +++ b/ge/hybrid/executor/hybrid_model_async_executor.cc @@ -379,11 +379,13 @@ Status HybridModelAsyncExecutor::Execute(const std::vector &inputs, } if (output_real_size > 0) { if (outputs[i].length < static_cast(output_real_size)) { - GELOGE(FAILED, "output idx[%zu], the memory size of output[%lu] given by user should be greater than or equal to the real size of output[%ld]", + GELOGE(FAILED, "output idx[%zu], the memory size of output[%lu] given by " + "user should be greater than or equal to the real size of output[%ld]", i, outputs[i].length, output_real_size); return FAILED; } - GE_CHK_RT_RET(rtMemcpy(outputs[i].data, outputs[i].length, args.outputs[i].GetData(), output_real_size, RT_MEMCPY_DEVICE_TO_DEVICE)); + GE_CHK_RT_RET(rtMemcpy(outputs[i].data, outputs[i].length, args.outputs[i].GetData(), output_real_size, + RT_MEMCPY_DEVICE_TO_DEVICE)); } outputs[i].length = output_real_size; } diff --git a/ge/hybrid/executor/hybrid_model_executor.cc b/ge/hybrid/executor/hybrid_model_executor.cc index 4af34451..8ba687c2 100755 --- a/ge/hybrid/executor/hybrid_model_executor.cc +++ b/ge/hybrid/executor/hybrid_model_executor.cc @@ -82,7 +82,7 @@ Status HybridModelExecutor::ExecuteGraphInternal(SubgraphExecutor &executor, Status HybridModelExecutor::Cleanup() { GELOGD("Start to cleanup."); context_.callback_manager->Destroy(); - RuntimeInferenceContext::DestroyContext(to_string(context_.session_id)); + RuntimeInferenceContext::DestroyContext(std::to_string(context_.session_id)); 
GELOGD("Cleanup successfully."); return SUCCESS; } diff --git a/ge/hybrid/executor/hybrid_profiler.cc b/ge/hybrid/executor/hybrid_profiler.cc index 7228197f..336a633f 100644 --- a/ge/hybrid/executor/hybrid_profiler.cc +++ b/ge/hybrid/executor/hybrid_profiler.cc @@ -25,7 +25,7 @@ namespace ge { namespace hybrid { namespace { const int kMaxEvents = 10000; -const int kEventDescMax = 256; +const int kEventDescMax = 512; const int kMaxEventTypes = 8; const int kIndent = 8; } diff --git a/ge/hybrid/executor/hybrid_profiler.h b/ge/hybrid/executor/hybrid_profiler.h index 62ef9c73..94a042e4 100644 --- a/ge/hybrid/executor/hybrid_profiler.h +++ b/ge/hybrid/executor/hybrid_profiler.h @@ -33,7 +33,7 @@ class HybridProfiler { SHAPE_INFERENCE, COMPILE, EXECUTION, - CALLBACK, + CALLBACKS }; struct Event { diff --git a/ge/hybrid/executor/node_done_manager.cc b/ge/hybrid/executor/node_done_manager.cc index c0b0b17b..f0d4324a 100644 --- a/ge/hybrid/executor/node_done_manager.cc +++ b/ge/hybrid/executor/node_done_manager.cc @@ -21,7 +21,7 @@ namespace ge { namespace hybrid { namespace { -constexpr int kDefaultWaitTimeoutInSec = 60 * 10; +constexpr int kDefaultWaitTimeoutInSec = 600; } bool NodeDoneManager::Cond::Await() { std::unique_lock lk(cond_mu_); diff --git a/ge/hybrid/executor/node_state.h b/ge/hybrid/executor/node_state.h index 48b2ed72..04f1ee4b 100644 --- a/ge/hybrid/executor/node_state.h +++ b/ge/hybrid/executor/node_state.h @@ -27,7 +27,7 @@ namespace ge { namespace hybrid { class NodeTask; -class GraphExecutionContext; +struct GraphExecutionContext; class SubgraphContext; class ShapeFuture { diff --git a/ge/hybrid/executor/subgraph_executor.cc b/ge/hybrid/executor/subgraph_executor.cc index 76a6cc37..5a464f8e 100644 --- a/ge/hybrid/executor/subgraph_executor.cc +++ b/ge/hybrid/executor/subgraph_executor.cc @@ -93,6 +93,7 @@ Status SubgraphExecutor::InitInputsForUnknownShape(const std::vectorGetName().c_str(), i); GE_CHECK_LE(i + 1, input_desc.size()); const auto 
&tensor_desc = input_desc[i]; + GE_CHECK_NOTNULL(tensor_desc); auto node_state = subgraph_context_->GetOrCreateNodeState(input_node); GE_CHECK_NOTNULL(node_state); node_state->GetShapeInferenceState().UpdateInputShape(0, tensor_desc->GetOriginShape(), tensor_desc->GetShape()); diff --git a/ge/hybrid/executor/worker/execution_engine.cc b/ge/hybrid/executor/worker/execution_engine.cc index e6729352..b984eec3 100755 --- a/ge/hybrid/executor/worker/execution_engine.cc +++ b/ge/hybrid/executor/worker/execution_engine.cc @@ -260,8 +260,7 @@ Status NodeDoneCallback::ProfilingReport() { } auto &profiling_manager = ProfilingManager::Instance(); - profiling_manager.ReportProfilingData(model->GetModelId(), task_desc_info, compute_graph_info, - !profiling_manager.IsAclApiMode()); + profiling_manager.ReportProfilingData(model->GetModelId(), task_desc_info, compute_graph_info); return SUCCESS; } diff --git a/ge/hybrid/executor/worker/shape_inference_engine.cc b/ge/hybrid/executor/worker/shape_inference_engine.cc index bd429b21..1d813526 100755 --- a/ge/hybrid/executor/worker/shape_inference_engine.cc +++ b/ge/hybrid/executor/worker/shape_inference_engine.cc @@ -62,7 +62,8 @@ Status ShapeInferenceEngine::InferShape(NodeState &node_state) { { std::lock_guard lk(mu_); RECORD_SHAPE_INFERENCE_EVENT(execution_context_, node_item.NodeName().c_str(), "[InferShapeAndType] Start"); - GE_CHK_STATUS_RET(ShapeRefiner::InferShapeAndTypeForRunning(node_item.node, true), "Invoke InferShapeAndType failed."); + GE_CHK_STATUS_RET(ShapeRefiner::InferShapeAndTypeForRunning(node_item.node, true), + "Invoke InferShapeAndType failed."); RECORD_SHAPE_INFERENCE_EVENT(execution_context_, node_item.NodeName().c_str(), "[InferShapeAndType] End"); } // Check again to make sure shape is valid after shape inference @@ -164,7 +165,7 @@ Status ShapeInferenceEngine::InferShapeForSubgraph(const NodeItem &node_item, co for (auto &it : fused_subgraph.input_mapping) { auto parent_tensor_desc = 
node_item.MutableInputDesc(it.first); GE_CHECK_NOTNULL(parent_tensor_desc); - GELOGD("Start to update shape by input[%u]", it.first); + GELOGD("Start to update shape by input[%d]", it.first); GELOGD("Update shape to [%s]", parent_tensor_desc->GetShape().ToString().c_str()); GELOGD("Update original shape to [%s]", parent_tensor_desc->GetOriginShape().ToString().c_str()); for (auto &tensor_desc : it.second) { @@ -183,12 +184,12 @@ Status ShapeInferenceEngine::InferShapeForSubgraph(const NodeItem &node_item, co } for (auto &it : fused_subgraph.output_mapping) { - uint32_t parent_output_idx = it.first; + int parent_output_idx = it.first; const auto &op_desc = it.second; GELOGD("Update parent output[%d] by [%s]", parent_output_idx, op_desc->GetName().c_str()); auto input_desc = op_desc->MutableInputDesc(0); GE_CHECK_NOTNULL(input_desc); - auto parent_output_tensor_desc = node_item.op_desc->MutableOutputDesc(parent_output_idx); + auto parent_output_tensor_desc = node_item.MutableOutputDesc(parent_output_idx); GE_CHECK_NOTNULL(parent_output_tensor_desc); GELOGD("Update shape to [%s]", input_desc->GetShape().ToString().c_str()); GELOGD("Update original shape to [%s]", input_desc->GetOriginShape().ToString().c_str()); diff --git a/ge/hybrid/hybrid_davinci_model.cc b/ge/hybrid/hybrid_davinci_model.cc index a491c9a5..7009331c 100755 --- a/ge/hybrid/hybrid_davinci_model.cc +++ b/ge/hybrid/hybrid_davinci_model.cc @@ -113,8 +113,8 @@ HybridDavinciModel::~HybridDavinciModel() { delete impl_; } -unique_ptr HybridDavinciModel::Create(const GeRootModelPtr &ge_root_model) { - auto instance = unique_ptr(new (std::nothrow)HybridDavinciModel()); +std::unique_ptr HybridDavinciModel::Create(const GeRootModelPtr &ge_root_model) { + auto instance = std::unique_ptr(new (std::nothrow)HybridDavinciModel()); if (instance != nullptr) { instance->impl_ = new (std::nothrow) HybridDavinciModel::Impl(ge_root_model); if (instance->impl_ != nullptr) { diff --git a/ge/hybrid/model/hybrid_model.cc 
b/ge/hybrid/model/hybrid_model.cc index feb6757b..132b0f8c 100644 --- a/ge/hybrid/model/hybrid_model.cc +++ b/ge/hybrid/model/hybrid_model.cc @@ -176,20 +176,9 @@ Status HybridModel::GetInputOutputDescInfo(vector &input_de return SUCCESS; } -void HybridModel::SetInputDimsAndShapeRangesInfo(const vector &model_input_dims, std::vector> &shape_ranges, - Format &format, InputOutputDescInfo &input) { - uint32_t n, c, h, w; - n = format == FORMAT_NHWC ? NHWC_DIM_N : NCHW_DIM_N; - c = format == FORMAT_NHWC ? NHWC_DIM_C : NCHW_DIM_C; - h = format == FORMAT_NHWC ? NHWC_DIM_H : NCHW_DIM_H; - w = format == FORMAT_NHWC ? NHWC_DIM_W : NCHW_DIM_W; - - if (model_input_dims.size() == static_cast(NORMAL_TENSOR_SIZE)) { - input.shape_info.num = model_input_dims[n]; - input.shape_info.height = model_input_dims[h]; - input.shape_info.width = model_input_dims[w]; - input.shape_info.channel = model_input_dims[c]; - } +void HybridModel::SetInputDimsAndShapeRangesInfo(const vector &model_input_dims, + std::vector> &shape_ranges, + InputOutputDescInfo &input) { for (auto model_input_dim : model_input_dims) { input.shape_info.dims.push_back(model_input_dim); } @@ -197,25 +186,25 @@ void HybridModel::SetInputDimsAndShapeRangesInfo(const vector &model_in return; } -void HybridModel::CreateInputDimsInfo(const OpDescPtr &op_desc, Format format, InputOutputDescInfo &input) { +void HybridModel::CreateInputDimsInfo(const OpDescPtr &op_desc, InputOutputDescInfo &input) { std::vector> shape_ranges; if (is_new_model_desc_ && op_desc->HasAttr(ATTR_NAME_INPUT_DIMS)) { // When static aipp is set, need to get the model input dims which processed by aipp vector model_input_dims; (void)AttrUtils::GetListInt(op_desc, ATTR_NAME_INPUT_DIMS, model_input_dims); - SetInputDimsAndShapeRangesInfo(model_input_dims, shape_ranges, format, input); + SetInputDimsAndShapeRangesInfo(model_input_dims, shape_ranges, input); return; } // judge if this data is linked dynamic aipp first, multiply batch has been considered if 
(op_desc->HasAttr("_dynamic_aipp_input_dims")) { vector dynamic_aipp_input_dims; (void)AttrUtils::GetListInt(op_desc, "_dynamic_aipp_input_dims", dynamic_aipp_input_dims); - SetInputDimsAndShapeRangesInfo(dynamic_aipp_input_dims, shape_ranges, format, input); + SetInputDimsAndShapeRangesInfo(dynamic_aipp_input_dims, shape_ranges, input); return; } else { vector input_dims = op_desc->GetInputDescPtr(0)->GetShape().GetDims(); op_desc->GetInputDescPtr(0)->GetShapeRange(shape_ranges); - SetInputDimsAndShapeRangesInfo(input_dims, shape_ranges, format, input); + SetInputDimsAndShapeRangesInfo(input_dims, shape_ranges, input); return; } } @@ -248,7 +237,7 @@ Status HybridModel::GetInputDescInfo(vector &input_desc, st // not support dynamic shape input for now, so input_size here will be not less than zero. input.size = input_size; - CreateInputDimsInfo(op_desc, format, input); + CreateInputDimsInfo(op_desc, input); formats.push_back(format); input_desc.push_back(input); @@ -257,29 +246,15 @@ Status HybridModel::GetInputDescInfo(vector &input_desc, st return SUCCESS; } -void HybridModel::CreateOutput(ConstGeTensorDescPtr &output_desc, InputOutputDescInfo &output_desc_info, uint32_t &format_result) { +void HybridModel::CreateOutput(ConstGeTensorDescPtr &output_desc, + InputOutputDescInfo &output_desc_info, uint32_t &format_result) { GE_IF_BOOL_EXEC(output_desc == nullptr, GELOGE(FAILED, "output desc ptr is nullptr"); return ); Format format = output_desc->GetFormat(); GeShape shape = output_desc->GetShape(); std::vector> shape_ranges; output_desc->GetShapeRange(shape_ranges); DataType data_type = output_desc->GetDataType(); - int64_t dims[] = {1, 1, 1, 1}; format_result = format; - if (format == FORMAT_ND) { // for ND tensor - for (size_t i = 0; i < shape.GetDimNum() && i < (sizeof(dims) / sizeof(dims[0])); i++) { - dims[i] = shape.GetDim(i); - } - } else { // FOR FORMAT_NHWC or FORMAT_NCHW - dims[0] = shape.GetDim(format == FORMAT_NHWC ? 
NHWC_DIM_N : NCHW_DIM_N); // 0: first dim - dims[1] = shape.GetDim(format == FORMAT_NHWC ? NHWC_DIM_C : NCHW_DIM_C); // 1: second dim - dims[2] = shape.GetDim(format == FORMAT_NHWC ? NHWC_DIM_H : NCHW_DIM_H); // 2: third dim - dims[3] = shape.GetDim(format == FORMAT_NHWC ? NHWC_DIM_W : NCHW_DIM_W); // 3: forth dim - } - output_desc_info.shape_info.num = dims[0]; // 0: first dim - output_desc_info.shape_info.channel = dims[1]; // 1: second dim - output_desc_info.shape_info.height = dims[2]; // 2: third dim - output_desc_info.shape_info.width = dims[3]; // 3: forth dim if (format == FORMAT_FRACTAL_Z) { // FraczToHWCK int64_t k = shape.GetDim(0); // 0: first dim int64_t c = shape.GetDim(1); // 1: second dim @@ -310,7 +285,8 @@ void HybridModel::CreateOutput(ConstGeTensorDescPtr &output_desc, InputOutputDes Status HybridModel::GetOutputDescInfo(vector &output_desc, std::vector &formats) { std::vector output_desc_list; - GE_CHK_STATUS_RET(root_graph_item_->GetOutputDescList(output_desc_list), "get output desc info failed"); // output_desc_list contains vaild input desc + // output_desc_list contains vaild input desc + GE_CHK_STATUS_RET(root_graph_item_->GetOutputDescList(output_desc_list), "get output desc info failed"); vector out_node_names; (void)ge::AttrUtils::GetListStr(ge_root_model_->GetRootGraph(), ATTR_MODEL_OUT_NODES_NAME, out_node_names); @@ -320,7 +296,8 @@ Status HybridModel::GetOutputDescInfo(vector &output_desc, GE_CHECK_NOTNULL(op_desc); auto out_size = static_cast(op_desc->GetInputsSize()); - GE_CHK_BOOL_RET_STATUS(out_size == output_desc_list.size(), FAILED, "output size[%u] not match output_desc_list size[%zu]", out_size, output_desc_list.size()); + GE_CHK_BOOL_RET_STATUS(out_size == output_desc_list.size(), + FAILED, "output size[%u] not match output_desc_list size[%zu]", out_size, output_desc_list.size()); for (uint32_t index = 0; index < out_size; ++index) { string output_name; @@ -328,9 +305,11 @@ Status HybridModel::GetOutputDescInfo(vector 
&output_desc, std::vector src_index = op_desc->GetSrcIndex(); if (out_size == out_node_names.size()) { bool contains_colon = out_node_names[index].find(":") != std::string::npos; - output_name = contains_colon ? out_node_names[index] : out_node_names[index] + ":" + std::to_string(src_index[index]); + output_name = contains_colon ? out_node_names[index] : out_node_names[index] + + ":" + std::to_string(src_index[index]); } else { - output_name = std::string("output_") + std::to_string(index) + "_" + src_name[index] + "_" + std::to_string(src_index[index]); + output_name = std::string("output_") + std::to_string(index) + "_" + src_name[index] + + "_" + std::to_string(src_index[index]); } InputOutputDescInfo output_desc_info; diff --git a/ge/hybrid/model/hybrid_model.h b/ge/hybrid/model/hybrid_model.h index 1ec2f8a8..5fd5f8f5 100644 --- a/ge/hybrid/model/hybrid_model.h +++ b/ge/hybrid/model/hybrid_model.h @@ -100,12 +100,13 @@ class HybridModel { Status GetOutputDescInfo(vector &output_desc, std::vector &formats); - void CreateInputDimsInfo(const OpDescPtr &op_desc, Format format, InputOutputDescInfo &input); + void CreateInputDimsInfo(const OpDescPtr &op_desc, InputOutputDescInfo &input); void SetModelDescVersion(bool is_new_model_desc) { is_new_model_desc_ = is_new_model_desc; } - void SetInputDimsAndShapeRangesInfo(const vector &model_input_dims, std::vector> &shape_ranges, - Format &format, InputOutputDescInfo &input); + void SetInputDimsAndShapeRangesInfo(const vector &model_input_dims, + std::vector> &shape_ranges, + InputOutputDescInfo &input); private: friend class HybridModelBuilder; diff --git a/ge/hybrid/model/hybrid_model_builder.cc b/ge/hybrid/model/hybrid_model_builder.cc index f4da3dcf..d519c35b 100755 --- a/ge/hybrid/model/hybrid_model_builder.cc +++ b/ge/hybrid/model/hybrid_model_builder.cc @@ -35,7 +35,6 @@ namespace hybrid { namespace { const uint32_t kSubgraphIndex = 0U; const uint32_t kVarOutputIndex = 0U; -const uint32_t kAlignment = 32; const int 
kBytes = 8; const char *const kOwnerGraphIsUnknown = "OwnerGraphIsUnknown"; @@ -339,9 +338,9 @@ Status HybridModelBuilder::ParseDependentForFusedSubgraph(NodeItem &node_item) { uint32_t parent_index = 0; if (!AttrUtils::GetInt(*op_desc, ATTR_NAME_PARENT_NODE_INDEX, parent_index)) { GELOGE(INTERNAL_ERROR, - "[%s] Failed to get attr [%s]", - op_desc->GetName().c_str(), - ATTR_NAME_PARENT_NODE_INDEX.c_str()); + "[%s] Failed to get attr [%s]", + op_desc->GetName().c_str(), + ATTR_NAME_PARENT_NODE_INDEX.c_str()); return INTERNAL_ERROR; } @@ -793,7 +792,7 @@ Status HybridModelBuilder::HandleDtString(const GeTensor &tensor, void *var_addr "Shape size is invalid"); auto offset = static_cast(elem_num * kBytes); auto hbm_raw_data_base_addr = - reinterpret_cast(reinterpret_cast(var_addr) + offset); + static_cast(reinterpret_cast(var_addr) + offset); for (int64_t i = elem_num - 1; i >= 0; --i) { buff[i] = hbm_raw_data_base_addr + (buff[i] - buff[0]); } @@ -987,7 +986,7 @@ Status HybridModelBuilder::IndexTaskDefs() { // index task defs GELOGD("To index tasks for subgraph: %s", name.c_str()); - unordered_map node_map; + std::unordered_map node_map; for (const auto &node : sub_graph->GetDirectNode()) { GE_CHECK_NOTNULL(node); GE_CHECK_NOTNULL(node->GetOpDesc()); diff --git a/ge/hybrid/model/node_item.h b/ge/hybrid/model/node_item.h index 8fac4a73..8fbdc648 100644 --- a/ge/hybrid/model/node_item.h +++ b/ge/hybrid/model/node_item.h @@ -30,8 +30,8 @@ class NodeTask; class NodeExecutor; struct FusedSubgraph { - std::map> input_mapping; - std::map output_mapping; + std::map> input_mapping; + std::map output_mapping; std::vector nodes; ComputeGraphPtr graph; }; diff --git a/ge/hybrid/node_executor/aicore/aicore_node_executor.cc b/ge/hybrid/node_executor/aicore/aicore_node_executor.cc index 3b87c8b8..407210cf 100755 --- a/ge/hybrid/node_executor/aicore/aicore_node_executor.cc +++ b/ge/hybrid/node_executor/aicore/aicore_node_executor.cc @@ -15,7 +15,7 @@ */ #include 
"aicore_node_executor.h" -#include "cce/taskdown_common.hpp" +#include "framework/common/taskdown_common.h" #include "hybrid/executor/hybrid_execution_context.h" namespace ge { diff --git a/ge/hybrid/node_executor/aicore/aicore_node_executor.h b/ge/hybrid/node_executor/aicore/aicore_node_executor.h index 989090e9..9e92a160 100755 --- a/ge/hybrid/node_executor/aicore/aicore_node_executor.h +++ b/ge/hybrid/node_executor/aicore/aicore_node_executor.h @@ -89,7 +89,7 @@ class TaskCompilerFactory { class CompilerFunctionRegistrar { public: - CompilerFunctionRegistrar(CreateFn fn); + explicit CompilerFunctionRegistrar(CreateFn fn); ~CompilerFunctionRegistrar() = default; }; } // namespace hybrid diff --git a/ge/hybrid/node_executor/aicore/aicore_op_task.cc b/ge/hybrid/node_executor/aicore/aicore_op_task.cc index 7ed14309..80ea579b 100644 --- a/ge/hybrid/node_executor/aicore/aicore_op_task.cc +++ b/ge/hybrid/node_executor/aicore/aicore_op_task.cc @@ -15,7 +15,7 @@ */ #include "hybrid/node_executor/aicore/aicore_op_task.h" -#include "cce/taskdown_common.hpp" +#include "framework/common/taskdown_common.h" #include "framework/common/debug/log.h" #include "hybrid/executor/hybrid_execution_context.h" #include "hybrid/node_executor/aicore/aicore_task_builder.h" @@ -38,7 +38,7 @@ Status AiCoreOpTask::Init(const OpDesc &op_desc, const domi::TaskDef &task_def) } Status AiCoreOpTask::RegisterTbeHandle(const OpDesc &op_desc) { - auto op_desc_ptr = make_shared(op_desc); + auto op_desc_ptr = std::make_shared(op_desc); GE_CHECK_NOTNULL(op_desc_ptr); auto tbe_kernel = op_desc_ptr->TryGetExtAttr(OP_EXTATTR_NAME_TBE_KERNEL, TBEKernelPtr()); if (tbe_kernel == nullptr) { @@ -151,8 +151,8 @@ Status AiCoreOpTask::ValidateTaskDef(const domi::TaskDef &task_def) { const domi::KernelDef &kernel_def = task_def.kernel(); const domi::KernelContext &context = kernel_def.context(); - auto kernel_type = static_cast(context.kernel_type()); - if (kernel_type != cce::ccKernelType::TE) { + auto kernel_type 
= static_cast(context.kernel_type()); + if (kernel_type != ccKernelType::TE) { GELOGE(INTERNAL_ERROR, "Invalid kernel type(%d) in AiCore TaskDef.", static_cast(kernel_type)); return INTERNAL_ERROR; } diff --git a/ge/hybrid/node_executor/aicore/aicore_task_compiler.h b/ge/hybrid/node_executor/aicore/aicore_task_compiler.h index bf948349..b6dfd82b 100755 --- a/ge/hybrid/node_executor/aicore/aicore_task_compiler.h +++ b/ge/hybrid/node_executor/aicore/aicore_task_compiler.h @@ -26,7 +26,7 @@ namespace hybrid { class AiCoreTaskCompiler : public TaskCompiler { public: AiCoreTaskCompiler() = default; - ~AiCoreTaskCompiler() = default; + ~AiCoreTaskCompiler() override = default; Status CompileOp(const NodePtr &node, std::vector &tasks) override; Status Initialize() override; diff --git a/ge/hybrid/node_executor/aicpu/aicpu_node_executor.cc b/ge/hybrid/node_executor/aicpu/aicpu_node_executor.cc index 1a47e525..7330f616 100755 --- a/ge/hybrid/node_executor/aicpu/aicpu_node_executor.cc +++ b/ge/hybrid/node_executor/aicpu/aicpu_node_executor.cc @@ -15,7 +15,7 @@ */ #include "hybrid/node_executor/aicpu/aicpu_node_executor.h" -#include "cce/taskdown_common.hpp" +#include "framework/common/taskdown_common.h" #include "common/formats/formats.h" #include "aicpu/common/aicpu_task_struct.h" #include "graph/load/new_model_manager/model_manager.h" @@ -642,10 +642,14 @@ Status AicpuNodeTask::Init(const HybridModel &model) { const std::string &so_name = kernel_def.so_name(); const OpDescPtr op_desc = node_item_->GetOpDesc(); const auto &context = kernel_def.context(); - auto kernel_type = static_cast(context.kernel_type()); - if (kernel_type == cce::ccKernelType::CUST_AI_CPU) { - GE_CHK_STATUS_RET(ModelManager::GetInstance()->LoadCustAicpuSo(op_desc, so_name), "load cust aicpu so failed."); - GE_CHK_STATUS_RET(ModelManager::GetInstance()->LaunchCustAicpuSo(), "Launch cust aicpu so failed."); + auto kernel_type = static_cast(context.kernel_type()); + if (kernel_type == 
ccKernelType::CUST_AI_CPU) { + bool loaded = false; + GE_CHK_STATUS_RET(ModelManager::GetInstance()->LoadCustAicpuSo(op_desc, so_name, loaded), + "load cust aicpu so failed."); + if (!loaded) { + GE_CHK_STATUS_RET(ModelManager::GetInstance()->LaunchCustAicpuSo(), "Launch cust aicpu so failed."); + } } GE_CHK_BOOL_RET_STATUS(args.size() == args_size_, FAILED, @@ -723,9 +727,9 @@ Status AicpuNodeTask::UpdateIoAddr(TaskContext &context) { auto io_addr = args_.get() + sizeof(aicpu::AicpuParamHead); // if has input and output, need copy to ioaddr - error_t cpy_ret = memcpy_s(io_addr, args_size_ - sizeof(aicpu::AicpuParamHead), - &io_addrs[0], sizeof(uint64_t) * io_addrs.size()); - GE_CHK_BOOL_RET_STATUS(cpy_ret == EOK, INTERNAL_ERROR, + int cpy_ret = memcpy_s(io_addr, args_size_ - sizeof(aicpu::AicpuParamHead), + &io_addrs[0], sizeof(uint64_t) * io_addrs.size()); + GE_CHK_BOOL_RET_STATUS(cpy_ret == 0, INTERNAL_ERROR, "Node[%s] memcpy io addr to AicpuParamHead failed, ret=%d, args_size=%u, io nums=%zu.", node_name_.c_str(), cpy_ret, args_size_, io_addrs.size()); return SUCCESS; @@ -736,9 +740,9 @@ Status AicpuNodeTask::LaunchTask(TaskContext &context) { const auto &so_name = task_def_.kernel().so_name(); const auto &kernel_name = task_def_.kernel().kernel_name(); const auto &kcontext = task_def_.kernel().context(); - auto kernel_type = static_cast(kcontext.kernel_type()); + auto kernel_type = static_cast(kcontext.kernel_type()); uint32_t flag = RT_KERNEL_DEFAULT; - if (kernel_type == cce::ccKernelType::CUST_AI_CPU) { + if (kernel_type == ccKernelType::CUST_AI_CPU) { flag |= static_cast(RT_KERNEL_CUSTOM_AICPU); } auto rt_ret = rtCpuKernelLaunchWithFlag(reinterpret_cast(so_name.c_str()), diff --git a/ge/hybrid/node_executor/aicpu/aicpu_node_executor.h b/ge/hybrid/node_executor/aicpu/aicpu_node_executor.h index b984cc86..1205b190 100644 --- a/ge/hybrid/node_executor/aicpu/aicpu_node_executor.h +++ b/ge/hybrid/node_executor/aicpu/aicpu_node_executor.h @@ -37,6 +37,8 @@ class 
AicpuNodeTaskBase : public NodeTask { ~AicpuNodeTaskBase() override = default; + using NodeTask::Init; + virtual Status Init(const HybridModel &model) = 0; Status UpdateArgs(TaskContext &context) override; diff --git a/ge/hybrid/node_executor/compiledsubgraph/known_node_executor.h b/ge/hybrid/node_executor/compiledsubgraph/known_node_executor.h index fb1966b4..2dde993b 100644 --- a/ge/hybrid/node_executor/compiledsubgraph/known_node_executor.h +++ b/ge/hybrid/node_executor/compiledsubgraph/known_node_executor.h @@ -27,7 +27,7 @@ class HybridModel; class KnownNodeTask : public NodeTask { public: - KnownNodeTask(std::shared_ptr davinci_model) + explicit KnownNodeTask(std::shared_ptr davinci_model) : davinci_model_(davinci_model) {} diff --git a/ge/hybrid/node_executor/controlop/control_op_executor.cc b/ge/hybrid/node_executor/controlop/control_op_executor.cc index 83fc09ee..74920b22 100644 --- a/ge/hybrid/node_executor/controlop/control_op_executor.cc +++ b/ge/hybrid/node_executor/controlop/control_op_executor.cc @@ -405,7 +405,7 @@ Status ControlOpNodeExecutor::LoadTask(const HybridModel &model, auto node_item = model.GetNodeItem(node); GE_CHECK_NOTNULL(node_item); - unique_ptr node_task; + std::unique_ptr node_task; auto node_type = node->GetType(); if (node_type == IF || node_type == STATELESSIF) { node_task.reset(new(std::nothrow) IfOpNodeTask()); diff --git a/ge/hybrid/node_executor/controlop/control_op_executor.h b/ge/hybrid/node_executor/controlop/control_op_executor.h index 7520afd1..3becfaaa 100644 --- a/ge/hybrid/node_executor/controlop/control_op_executor.h +++ b/ge/hybrid/node_executor/controlop/control_op_executor.h @@ -25,6 +25,7 @@ namespace ge { namespace hybrid { class ControlOpNodeTask : public NodeTask { public: + using NodeTask::Init; virtual Status Init(const NodePtr &node, const HybridModel &model) = 0; Status UpdateArgs(TaskContext &context) override; diff --git a/ge/hybrid/node_executor/ge_local/ge_local_node_executor.cc 
b/ge/hybrid/node_executor/ge_local/ge_local_node_executor.cc index ee45964c..a52e5670 100755 --- a/ge/hybrid/node_executor/ge_local/ge_local_node_executor.cc +++ b/ge/hybrid/node_executor/ge_local/ge_local_node_executor.cc @@ -61,18 +61,18 @@ Status RefInputTask::Execute(TaskContext &context) { Status RefInputTask::RefOneByOne(TaskContext &context) { GELOGI("node %s type %s ref input one by one begin.", node_name_.c_str(), node_type_.c_str()); - uint32_t input_num = context.NumInputs(); - uint32_t output_num = context.NumOutputs(); + int input_num = context.NumInputs(); + int output_num = context.NumOutputs(); if (output_num > input_num) { - GELOGE(INTERNAL_ERROR, "node %s type %s has %u outputs but only %u inputs, can't ref one by one.", + GELOGE(INTERNAL_ERROR, "node %s type %s has %d outputs but only %d inputs, can't ref one by one.", node_name_.c_str(), node_type_.c_str(), output_num, input_num); return INTERNAL_ERROR; } - for (uint32_t out_index = 0; out_index < output_num; ++out_index) { + for (uint32_t out_index = 0; out_index < static_cast(output_num); ++out_index) { auto input = context.GetInput(out_index); GE_CHECK_NOTNULL(input); GE_CHK_STATUS_RET(context.SetOutput(out_index, *input)); - GELOGD("node %s type %s output[%u] ref input[%u] addr=%p.", + GELOGD("node %s type %s output[%d] ref input[%d] addr=%p.", node_name_.c_str(), node_type_.c_str(), out_index, out_index, input->GetData()); } GELOGI("node %s type %s ref input one by one end.", node_name_.c_str(), node_type_.c_str()); diff --git a/ge/hybrid/node_executor/host_cpu/kernel/assign_kernel.cc b/ge/hybrid/node_executor/host_cpu/kernel/assign_kernel.cc index 3bf71013..01fd391d 100644 --- a/ge/hybrid/node_executor/host_cpu/kernel/assign_kernel.cc +++ b/ge/hybrid/node_executor/host_cpu/kernel/assign_kernel.cc @@ -20,7 +20,6 @@ #include "hybrid/node_executor/host_cpu/kernel_factory.h" namespace { -const size_t kAssignInputNum = 2; const size_t kAssignRefInputIndex = 0; const size_t 
kAssignValueInputIndex = 1; const size_t kAssignRefOutputIndex = 0; diff --git a/ge/hybrid/node_executor/node_executor.cc b/ge/hybrid/node_executor/node_executor.cc index e577f09b..95e50c31 100755 --- a/ge/hybrid/node_executor/node_executor.cc +++ b/ge/hybrid/node_executor/node_executor.cc @@ -34,7 +34,6 @@ const char *const kEngineNameAiCpuTf = "aicpu_tf_kernel"; const char *const kEngineNameHccl = "ops_kernel_info_hccl"; const char *const kEngineNameRts = "DNN_VM_RTS_OP_STORE"; const char *const kEngineNameHostCpu = "DNN_VM_HOST_CPU_OP_STORE"; -const char *const kOwnerGraphIsUnknown = "OwnerGraphIsUnknown"; } Status NodeExecutor::PrepareTask(NodeTask &task, TaskContext &context) const { GE_CHK_STATUS_RET_NOLOG(context.AllocateOutputs()); diff --git a/ge/hybrid/node_executor/partitioned_call/partitioned_call_node_executor.h b/ge/hybrid/node_executor/partitioned_call/partitioned_call_node_executor.h index 9ea544a1..73873002 100644 --- a/ge/hybrid/node_executor/partitioned_call/partitioned_call_node_executor.h +++ b/ge/hybrid/node_executor/partitioned_call/partitioned_call_node_executor.h @@ -41,7 +41,6 @@ class PartitionedCallNodeTask : public NodeTask { const GraphItem *graph_item_; std::unique_ptr subgraph_executor_; - GraphExecutionContext *context_ = nullptr; }; class PartitionedCallNodeExecutor : public NodeExecutor { diff --git a/ge/hybrid/node_executor/task_context.cc b/ge/hybrid/node_executor/task_context.cc index b7152878..77004f99 100644 --- a/ge/hybrid/node_executor/task_context.cc +++ b/ge/hybrid/node_executor/task_context.cc @@ -233,9 +233,7 @@ Status TaskContext::AllocateOutput(int index, } else { GE_CHK_STATUS_RET_NOLOG(AllocateTensor(tensor_desc, outputs_start_[index], attr)); GELOGD("Allocating output successfully. node: %s. 
index = %d, size = %zu", - node_item_->NodeName().c_str(), - index, - outputs_start_[index].GetSize()); + node_item_->NodeName().c_str(), index, outputs_start_[index].GetSize()); } } } diff --git a/ge/hybrid/node_executor/task_context.h b/ge/hybrid/node_executor/task_context.h index 2cff0536..0549a1dc 100644 --- a/ge/hybrid/node_executor/task_context.h +++ b/ge/hybrid/node_executor/task_context.h @@ -29,7 +29,7 @@ namespace ge { namespace hybrid { -class GraphExecutionContext; +struct GraphExecutionContext; class SubgraphContext; class TaskContext { diff --git a/ge/init/gelib.cc b/ge/init/gelib.cc index 306a804a..92700179 100755 --- a/ge/init/gelib.cc +++ b/ge/init/gelib.cc @@ -485,11 +485,9 @@ Status GELib::Finalize() { void GELib::ShutDownProfiling() { std::lock_guard lock(status_mutex_); - if (!ProfilingManager::Instance().ProfilingOpTraceOn() && ProfilingManager::Instance().ProfilingOn()) { - ProfilingManager::Instance().StopProfiling(); - } if (ProfilingManager::Instance().ProfilingOn()) { - ProfilingManager::Instance().PluginUnInit(GE_PROFILING_MODULE); + ProfilingManager::Instance().StopProfiling(); + ProfilingManager::Instance().PluginUnInit(); } } diff --git a/ge/ir_build/ge_ir_build.cc b/ge/ir_build/ge_ir_build.cc index 74aa6a60..f181170c 100644 --- a/ge/ir_build/ge_ir_build.cc +++ b/ge/ir_build/ge_ir_build.cc @@ -49,6 +49,8 @@ const std::string IR_OPTION_LOG_LEVEL_DEFAULT = "default"; const std::string IR_OPTION_BUFFER_OPTIMIZE_DEFAULT = "l2_optimize"; const std::string IR_OPTION_DISABLE_REUSE_MEMORY_DEFAULT = "0"; const std::string IR_OPTION_ENABLE_COMPRESS_WEIGHT_DEFAULT = "false"; +const std::string kInputShape = "input_shape"; +const std::string kInputFormat = "input_format"; } // namespace static graphStatus CheckGlobalOptions(std::map &global_options) { @@ -225,7 +227,9 @@ class Impl { ~Impl() { (void)generator_.Finalize(); }; graphStatus CheckOptions(const std::map &options); graphStatus CreateInputsForIRBuild(const ge::Graph &graph, vector 
&inputs); - graphStatus Init(const std::map &options); + graphStatus GetDefaultInputShape(const Graph &graph, string &default_shape); + graphStatus UpdateDataOpAttr(const Graph &graph); + graphStatus Init(const Graph &graph, const std::map &options); graphStatus BuildModel(const Graph &graph, const std::map &options, ModelBufferData &ge_models); graphStatus InitDomiOmgContext(const string &input_shape, const string &input_format, const string &net_format, @@ -240,6 +244,40 @@ class Impl { OmgContext omg_context_; }; +graphStatus Impl::UpdateDataOpAttr(const Graph &graph) { + GELOGD("Enter Update Data Attr Process!"); + if (options_.find(kInputShape) == options_.end()) { + return GRAPH_SUCCESS; + } + unordered_map> shape_map; + vector>> user_shape_map; + GE_CHK_BOOL_EXEC(ParseInputShape(options_[kInputShape], shape_map, user_shape_map, true), + return GRAPH_PARAM_INVALID, "parse input shape failed!"); + auto compute_graph = ge::GraphUtils::GetComputeGraph(graph); + GE_CHECK_NOTNULL(compute_graph); + for (ge::NodePtr &input_node : compute_graph->GetDirectNode()) { + GE_CHECK_NOTNULL(input_node); + ge::OpDescPtr op = input_node->GetOpDesc(); + GE_CHECK_NOTNULL(op); + if (op->GetType() == DATA) { + auto tensor_input = op->MutableInputDesc(0); + auto tensor_output = op->MutableOutputDesc(0); + GE_CHECK_NOTNULL(tensor_input); + GE_CHECK_NOTNULL(tensor_output); + string data_op_name = op->GetName(); + auto iter = shape_map.find(data_op_name); + if (iter != shape_map.end()) { + tensor_input->SetShape(ge::GeShape(iter->second)); + tensor_output->SetShape(ge::GeShape(iter->second)); + GELOGD("update input [%s] shape info", data_op_name.c_str()); + } else { + GELOGI("no need update input [%s] attr because not found from input_shape.", data_op_name.c_str()); + } + } + } + return GRAPH_SUCCESS; +} + graphStatus Impl::CheckOptions(const std::map &options) { for (auto &ele : options) { auto it = ge::ir_option::ir_builder_suppported_options.find(ele.first); @@ -275,17 +313,61 @@ 
graphStatus Impl::CheckOptions(const std::map &options return GRAPH_PARAM_INVALID; } } + // Check option EXEC_DISABLE_REUSED_MEMORY + it = options_.find(ge::ir_option::EXEC_DISABLE_REUSED_MEMORY); + if (it != options_.end() && (CheckDisableReuseMemoryParamValid(it->second) != GRAPH_SUCCESS)) { + return GRAPH_PARAM_INVALID; + } + return GRAPH_SUCCESS; +} + +graphStatus Impl::GetDefaultInputShape(const Graph &graph, string &default_shape) { + auto compute_graph = ge::GraphUtils::GetComputeGraph(graph); + GE_CHECK_NOTNULL(compute_graph); + for (ge::NodePtr &input_node : compute_graph->GetDirectNode()) { + GE_CHECK_NOTNULL(input_node); + ge::OpDescPtr op = input_node->GetOpDesc(); + GE_CHECK_NOTNULL(op); + if (op->GetType() == DATA) { + string data_op_name = op->GetName(); + GELOGD("Data op name: %s, data op inputDesc size: %zu", data_op_name.c_str(), op->GetAllInputsDesc().size()); + ge::GeTensorDesc tensor = op->GetInputDesc(0); + ge::GeShape data_shape = tensor.GetShape(); + GELOGD("Data op get shape from InputDesc in ge ir graph."); + + string tmp_shape_str; + const std::vector &tmp_shape = data_shape.GetDims(); + if (tmp_shape.empty()) { + GELOGW("Data op: %s has zero shape dims!", data_op_name.c_str()); + } else { + tmp_shape_str += data_op_name + ":"; + for (auto tmp_dim : tmp_shape) { + tmp_shape_str += to_string((long)tmp_dim) + ","; + } + tmp_shape_str = tmp_shape_str.substr(0, tmp_shape_str.size() - 1); + tmp_shape_str += ";"; + default_shape += tmp_shape_str; + } + + GELOGD("Data op name: %s, data shape: %s.", data_op_name.c_str(), tmp_shape_str.c_str()); + } + } + default_shape = (default_shape.empty() ? default_shape : default_shape.substr(0, default_shape.size() - 1)); + GELOGI("Get default data op shape: %s from ge ir graph.", default_shape.c_str()); return GRAPH_SUCCESS; } -graphStatus Impl::Init(const std::map &options) { +graphStatus Impl::Init(const Graph &graph, const std::map &options) { // 1. 
check options graphStatus ret = CheckOptions(options); if (ret != GRAPH_SUCCESS) { GELOGE(ret, "User input options are illegal! Please check!"); return ret; } - + ret = UpdateDataOpAttr(graph); + if (ret != GRAPH_SUCCESS) { + return ret; + } std::string build_mode = (options_.find(BUILD_MODE) == options_.end() || options_[BUILD_MODE] == BUILD_MODE_NORMAL) ? "" : options_[BUILD_MODE]; options_[BUILD_MODE] = build_mode; @@ -296,7 +378,13 @@ graphStatus Impl::Init(const std::map &options) { GE_CHK_BOOL_RET_STATUS_NOLOG(ge::CheckLogParamValidAndSetLogLevel(log) == 0, GRAPH_PARAM_INVALID); options_[ge::ir_option::LOG_LEVEL] = log; - string input_shape = options_.find("input_shape") == options_.end() ? "" : options_["input_shape"]; + string input_shape; + if (options_.find("input_shape") == options_.end()) { + GE_CHK_BOOL_EXEC(GetDefaultInputShape(graph, input_shape) == ge::SUCCESS, + return ge::GRAPH_PARAM_INVALID, "Get default data op shape from graph failed!"); + } else { + input_shape = options_["input_shape"]; + } string input_format = options_.find("input_format") == options_.end() ? "" : options_["input_format"]; string net_format = options_.find("net_format") == options_.end() ? "" : options_["net_format"]; string dynamic_batch_size = options_.find(ge::ir_option::DYNAMIC_BATCH_SIZE) == options_.end() @@ -416,7 +504,7 @@ graphStatus Impl::CreateInputsForIRBuild(const ge::Graph &graph, vector &options, ModelBufferData &model) { // 1. 
init GeGenerator with user optios - graphStatus ret = Init(options); + graphStatus ret = Init(graph, options); if (ret != GRAPH_SUCCESS) { GELOGE(ret, "Build ir model Init failed!"); return ret; @@ -502,7 +590,7 @@ graphStatus aclgrphSaveModel(const string &output_file, const ModelBufferData &m GELOGE(GRAPH_PARAM_INVALID, "input model is illegal"); return GRAPH_PARAM_INVALID; } - return FileSaver::SaveToFile((output_file + ".om"), reinterpret_cast(model.data.get()), + return FileSaver::SaveToFile((output_file + ".om"), reinterpret_cast(model.data.get()), static_cast(model.length)); } @@ -517,7 +605,7 @@ graphStatus aclgrphSaveModel(const char *output_file, const ModelBufferData &mod return GRAPH_PARAM_INVALID; } std::string str_output_file = output_file; - return FileSaver::SaveToFile((str_output_file + ".om"), reinterpret_cast(model.data.get()), + return FileSaver::SaveToFile((str_output_file + ".om"), reinterpret_cast(model.data.get()), static_cast(model.length)); } @@ -543,7 +631,7 @@ graphStatus aclgrphInferShapeAndType(ge::Graph &graph) { } auto ret = compute_graph->TopologicalSorting(); - if(ret != GRAPH_SUCCESS) { + if (ret != GRAPH_SUCCESS) { GELOGE(ret, "Acl topo logical sort failed."); return ret; } @@ -622,4 +710,52 @@ graphStatus aclgrphDumpGraph(const ge::Graph &graph, const char *file, const siz return GRAPH_SUCCESS; } +graphStatus aclgrphGenerateForOp(const AscendString &op_type, const vector &inputs, + const vector &outputs, Graph &graph) { + auto op_type_str = std::string(op_type.GetString()); + auto op_name = op_type_str + "_" + std::to_string(ge::GetCurrentTimestamp()); + auto op_desc = ge::MakeShared(op_name, op_type_str); + GE_CHECK_NOTNULL(op_desc); + + // convert input tensordesc to getensor + std::vector input_tensors; + for (const auto &input : inputs) { + ge::GeTensorDesc tensor_desc(ge::GeShape(input.GetShape().GetDims()), input.GetFormat(), input.GetDataType()); + + tensor_desc.SetOriginFormat(input.GetFormat()); + 
ge::TensorUtils::SetRealDimCnt(tensor_desc, static_cast(input.GetShape().GetDims().size())); + ge::TensorUtils::SetInputTensor(tensor_desc, true); + ge::TensorUtils::SetOutputTensor(tensor_desc, false); + + if (op_desc->AddInputDesc(tensor_desc) != ge::GRAPH_SUCCESS) { + GELOGE(ge::FAILED, "AddInputDesc fail."); + return ge::FAILED; + } + input_tensors.emplace_back(tensor_desc); + } + + // convert output tensordesc to getensor + std::vector output_tensors; + for (const auto &output : outputs) { + ge::GeTensorDesc tensor_desc(ge::GeShape(output.GetShape().GetDims()), output.GetFormat(), output.GetDataType()); + + tensor_desc.SetOriginFormat(output.GetFormat()); + ge::TensorUtils::SetRealDimCnt(tensor_desc, static_cast(output.GetShape().GetDims().size())); + ge::TensorUtils::SetInputTensor(tensor_desc, false); + ge::TensorUtils::SetOutputTensor(tensor_desc, true); + + (void)op_desc->AddOutputDesc(tensor_desc); + output_tensors.emplace_back(tensor_desc); + } + + // call api to get graph + ge::GeGenerator generator; + std::string graph_name = ge::CurrentTimeInStr() + "_graph"; + if (generator.BuildSingleOpGraph(op_desc, input_tensors, output_tensors, graph_name, graph) != ge::SUCCESS) { + GELOGE(GRAPH_FAILED, "make graph fail."); + return GRAPH_FAILED; + } + return GRAPH_SUCCESS; +} + } // namespace ge diff --git a/ge/model/ge_root_model.h b/ge/model/ge_root_model.h index 53174064..aa5a4d47 100755 --- a/ge/model/ge_root_model.h +++ b/ge/model/ge_root_model.h @@ -23,6 +23,7 @@ namespace ge { class GeRootModel { public: + GeRootModel() = default; explicit GeRootModel(ComputeGraphPtr &root_graph) : root_graph_(root_graph), model_id_(INVALID_MODEL_ID) {}; ~GeRootModel() = default; @@ -35,11 +36,11 @@ class GeRootModel { void SetModelId(uint32_t model_id) { model_id_ = model_id; } uint32_t GetModelId() const { return model_id_; } Status CheckIsUnknownShape(bool &is_dynamic_shape); - + void SetRootGraph(ComputeGraphPtr graph) { root_graph_ = graph; } private: - 
ComputeGraphPtr root_graph_; + ComputeGraphPtr root_graph_ = nullptr; std::map subgraph_instance_name_to_model_; - uint32_t model_id_; + uint32_t model_id_ = 0; }; } // namespace ge using GeRootModelPtr = std::shared_ptr; diff --git a/ge/offline/CMakeLists.txt b/ge/offline/CMakeLists.txt index 49af37c0..21221042 100644 --- a/ge/offline/CMakeLists.txt +++ b/ge/offline/CMakeLists.txt @@ -11,13 +11,13 @@ set(SRC_LIST "main.cc" "single_op_parser.cc" "../session/omg.cc" - "../ir_build/atc_ir_common.cc" + "../ir_build/atc_ir_common.cc" ) ############ atc ############ add_executable(atc ${SRC_LIST} ${PROTO_HDRS}) -target_compile_options(atc PRIVATE +target_compile_options(atc PRIVATE -Werror -O2 -Wno-deprecated-declarations @@ -74,6 +74,136 @@ target_link_libraries(atc PRIVATE -ldl ) +############ atc_atc.bin ############ +add_executable(atc_atc.bin ${SRC_LIST} ${PROTO_HDRS}) + +target_compile_options(atc_atc.bin PRIVATE + -Werror + -O2 + -Wno-deprecated-declarations +) + +target_compile_definitions(atc_atc.bin PRIVATE + PROTOBUF_INLINE_NOT_IN_HEADERS=0 + COMPILE_OMG_PACKAGE + google=ascend_private +) + +target_include_directories(atc_atc.bin PRIVATE + ${CMAKE_CURRENT_LIST_DIR} + ${GE_CODE_DIR} + ${GE_CODE_DIR}/ge + ${GE_CODE_DIR}/inc/external + ${GE_CODE_DIR}/common/inc/external + ${GE_CODE_DIR}/common/inc/external/graph + ${GE_CODE_DIR}/inc + ${GE_CODE_DIR}/inc/framework + ${METADEF_DIR}/inc + ${METADEF_DIR}/inc/graph + ${METADEF_DIR}/inc/register + ${METADEF_DIR}/inc/external + ${METADEF_DIR}/inc/external/graph + ${METADEF_DIR}/inc/external/register + ${PARSER_DIR} + ${CMAKE_BINARY_DIR} + ${CMAKE_BINARY_DIR}/proto/ge + #### yellow zone #### + ${GE_CODE_DIR}/../inc + ${GE_CODE_DIR}/../inc/common + #### blue zone #### + ${GE_CODE_DIR}/third_party/fwkacllib/inc + ${GE_CODE_DIR}/third_party/fwkacllib/inc/toolchain +) + +target_link_libraries(atc_atc.bin PRIVATE + $ + ascend_protobuf + ge_common + register + c_sec + graph + error_manager + ge_compiler + parser_common + 
gflags + json + runtime_compile + slog + static_mmpa + -lrt + -ldl +) + +set_target_properties(atc_atc.bin PROPERTIES + OUTPUT_NAME atc.bin + RUNTIME_OUTPUT_DIRECTORY atclib +) + +############ fwk_atc.bin ############ +add_executable(fwk_atc.bin ${SRC_LIST} ${PROTO_HDRS}) + +target_compile_options(fwk_atc.bin PRIVATE + -Werror + -O2 + -Wno-deprecated-declarations +) + +target_compile_definitions(fwk_atc.bin PRIVATE + PROTOBUF_INLINE_NOT_IN_HEADERS=0 + COMPILE_OMG_PACKAGE + google=ascend_private +) + +target_include_directories(fwk_atc.bin PRIVATE + ${CMAKE_CURRENT_LIST_DIR} + ${GE_CODE_DIR} + ${GE_CODE_DIR}/ge + ${GE_CODE_DIR}/inc/external + ${GE_CODE_DIR}/common/inc/external + ${GE_CODE_DIR}/common/inc/external/graph + ${GE_CODE_DIR}/inc + ${GE_CODE_DIR}/inc/framework + ${METADEF_DIR}/inc + ${METADEF_DIR}/inc/graph + ${METADEF_DIR}/inc/register + ${METADEF_DIR}/inc/external + ${METADEF_DIR}/inc/external/graph + ${METADEF_DIR}/inc/external/register + ${PARSER_DIR} + ${CMAKE_BINARY_DIR} + ${CMAKE_BINARY_DIR}/proto/ge + #### yellow zone #### + ${GE_CODE_DIR}/../inc + ${GE_CODE_DIR}/../inc/common + #### blue zone #### + ${GE_CODE_DIR}/third_party/fwkacllib/inc + ${GE_CODE_DIR}/third_party/fwkacllib/inc/toolchain +) + +target_link_libraries(fwk_atc.bin PRIVATE + $ + ascend_protobuf + ge_common + register + c_sec + graph + error_manager + ge_runner + parser_common + gflags + json + runtime + slog + static_mmpa + -lrt + -ldl +) + +set_target_properties(fwk_atc.bin PROPERTIES + OUTPUT_NAME atc.bin + RUNTIME_OUTPUT_DIRECTORY fwkacl +) + ############ install ############ set(INSTALL_BASE_DIR "") set(INSTALL_LIBRARY_DIR lib) @@ -81,3 +211,11 @@ set(INSTALL_LIBRARY_DIR lib) install(TARGETS atc OPTIONAL LIBRARY DESTINATION ${INSTALL_LIBRARY_DIR} ) + +install(TARGETS atc_atc.bin OPTIONAL + RUNTIME DESTINATION ${INSTALL_LIBRARY_DIR}/atclib +) + +install(TARGETS fwk_atc.bin OPTIONAL + RUNTIME DESTINATION ${INSTALL_LIBRARY_DIR}/fwkacl +) diff --git a/ge/offline/atc 
b/ge/offline/atc new file mode 100644 index 00000000..05c65c26 --- /dev/null +++ b/ge/offline/atc @@ -0,0 +1,21 @@ +#!/bin/bash +#------------------------------------------------------------------- +# Purpose: +# Copyright 2020 Huawei Technologies Co., Ltd. All rights reserved. +#------------------------------------------------------------------- + +real_path=$(readlink "$0") +if [ $? -eq 0 ]; then + LOCAL_PATH=$(cd "$(dirname "$real_path")"; pwd) +else + LOCAL_PATH=$(cd "$(dirname "$0")"; pwd) +fi +PKG_PATH=$(cd ${LOCAL_PATH}/..; pwd) +LIB_P="/lib64" +PYTHON_P="/python/site-packages" +LIB64_PATH="${PKG_PATH}${LIB_P}" +PYTHON_PATH="${PKG_PATH}${PYTHON_P}" +export LD_LIBRARY_PATH="${LIB64_PATH}:${LD_LIBRARY_PATH}" +export PYTHONPATH="${PYTHON_PATH}:${PYTHONPATH}" + +${PKG_PATH}/bin/atc.bin "$@" diff --git a/ge/offline/main.cc b/ge/offline/main.cc index 76494c68..b7188a85 100755 --- a/ge/offline/main.cc +++ b/ge/offline/main.cc @@ -68,7 +68,7 @@ const char *const kModeSupport = "only support 0(model to framework model), " const char *const kModelToJsonSupport = "only support 0(Caffe) 3(TensorFlow) 5(Onnx)"; // limit available mem size 2G -const long kMinAvailableMem = 2 * 1024 * 1024; +const long kMinAvailableMem = 2097152; // 2 * 1024 * 1024 DEFINE_string(model, "", "The model file."); DEFINE_string(output, "", "The output file path&name."); diff --git a/ge/offline/module.mk b/ge/offline/module.mk index 8859df29..5c7a919c 100755 --- a/ge/offline/module.mk +++ b/ge/offline/module.mk @@ -54,3 +54,108 @@ LOCAL_LDFLAGS := -lrt -ldl include $(BUILD_HOST_EXECUTABLE) +include $(CLEAR_VARS) + +LOCAL_MODULE := atclib/atc.bin + +LOCAL_CFLAGS += -Werror -Wno-deprecated-declarations +LOCAL_CFLAGS += -DPROTOBUF_INLINE_NOT_IN_HEADERS=0 -DCOMPILE_OMG_PACKAGE -O2 -Dgoogle=ascend_private + +LOCAL_SRC_FILES := \ + main.cc \ + single_op_parser.cc \ + ../session/omg.cc \ + ../ir_build/atc_ir_common.cc \ + +LOCAL_C_INCLUDES := \ + $(LOCAL_PATH)/../ ./ \ + $(TOPDIR)inc \ + 
$(TOPDIR)metadef/inc \ + $(TOPDIR)graphengine/inc \ + $(TOPDIR)inc/external \ + $(TOPDIR)metadef/inc/external \ + $(TOPDIR)graphengine/inc/external \ + $(TOPDIR)metadef/inc/external/graph \ + $(TOPDIR)graphengine/inc/framework \ + $(TOPDIR)libc_sec/include \ + $(TOPDIR)metadef/inc/common/util \ + $(TOPDIR)parser \ + third_party/json/include \ + third_party/gflags/include \ + third_party/protobuf/include \ + proto/om.proto \ + proto/ge_ir.proto \ + proto/task.proto \ + proto/insert_op.proto \ + +LOCAL_SHARED_LIBRARIES := \ + libc_sec \ + libge_common \ + libascend_protobuf \ + libslog \ + libgraph \ + libregister \ + liberror_manager \ + libge_compiler \ + libruntime_compile \ + libparser_common \ + liberror_manager \ + +LOCAL_STATIC_LIBRARIES := libgflags + +LOCAL_LDFLAGS := -lrt -ldl + +include $(BUILD_HOST_EXECUTABLE) + +include $(CLEAR_VARS) + +LOCAL_MODULE := fwkacl/atc.bin + +LOCAL_CFLAGS += -Werror -Wno-deprecated-declarations +LOCAL_CFLAGS += -DPROTOBUF_INLINE_NOT_IN_HEADERS=0 -DCOMPILE_OMG_PACKAGE -O2 -Dgoogle=ascend_private + +LOCAL_SRC_FILES := \ + main.cc \ + single_op_parser.cc \ + ../session/omg.cc \ + ../ir_build/atc_ir_common.cc \ + +LOCAL_C_INCLUDES := \ + $(LOCAL_PATH)/../ ./ \ + $(TOPDIR)inc \ + $(TOPDIR)metadef/inc \ + $(TOPDIR)graphengine/inc \ + $(TOPDIR)inc/external \ + $(TOPDIR)metadef/inc/external \ + $(TOPDIR)graphengine/inc/external \ + $(TOPDIR)metadef/inc/external/graph \ + $(TOPDIR)graphengine/inc/framework \ + $(TOPDIR)libc_sec/include \ + $(TOPDIR)metadef/inc/common/util \ + $(TOPDIR)parser \ + third_party/json/include \ + third_party/gflags/include \ + third_party/protobuf/include \ + proto/om.proto \ + proto/ge_ir.proto \ + proto/task.proto \ + proto/insert_op.proto \ + +LOCAL_SHARED_LIBRARIES := \ + libc_sec \ + libge_common \ + libascend_protobuf \ + libslog \ + libgraph \ + libregister \ + liberror_manager \ + libge_runner \ + libruntime \ + libparser_common \ + liberror_manager \ + +LOCAL_STATIC_LIBRARIES := libgflags + 
+LOCAL_LDFLAGS := -lrt -ldl + +include $(BUILD_HOST_EXECUTABLE) diff --git a/ge/offline/single_op_parser.cc b/ge/offline/single_op_parser.cc index d4b9c1c9..b1e0da6d 100644 --- a/ge/offline/single_op_parser.cc +++ b/ge/offline/single_op_parser.cc @@ -27,6 +27,7 @@ #include "common/ge_inner_error_codes.h" #include "framework/common/util.h" #include "graph/utils/tensor_utils.h" +#include "graph/utils/type_utils.h" #include "graph/utils/op_desc_utils.h" #include "graph/operator_factory_impl.h" @@ -176,6 +177,7 @@ T GetValue(const map &dict, string &key, T default_val) { } void from_json(const Json &j, SingleOpTensorDesc &desc) { + bool is_tensor_valid = true; desc.dims = j.at(kKeyShape).get>(); auto it = j.find(kKeyShapeRange); if (it != j.end()) { @@ -189,9 +191,12 @@ void from_json(const Json &j, SingleOpTensorDesc &desc) { string type_str = j.at(kKeyType).get(); desc.format = GetValue(kFormatDict, format_str, FORMAT_RESERVED); desc.type = GetValue(kDataTypeDict, type_str, DT_UNDEFINED); + is_tensor_valid = is_tensor_valid && ge::TypeUtils::IsFormatValid(format_str); + is_tensor_valid = is_tensor_valid && ge::TypeUtils::IsDataTypeValid(type_str); it = j.find(kKeyOriginFormat); if (it != j.end()) { string origin_format_str = j.at(kKeyOriginFormat).get(); + is_tensor_valid = is_tensor_valid && ge::TypeUtils::IsFormatValid(origin_format_str); desc.ori_format = GetValue(kFormatDict, origin_format_str, FORMAT_RESERVED); } auto tensor_name = j.find(kKeyName); @@ -202,6 +207,9 @@ void from_json(const Json &j, SingleOpTensorDesc &desc) { if (dynamic_input_name != j.end()) { desc.dynamic_input_name = dynamic_input_name->get(); } + if (!is_tensor_valid) { + desc.SetValidFlag(is_tensor_valid); + } } void from_json(const Json &j, SingleOpAttr &attr) { @@ -305,6 +313,12 @@ bool SingleOpParser::Validate(const SingleOpDesc &op_desc) { int index = 0; for (auto &tensor_desc : op_desc.input_desc) { + if (!tensor_desc.GetValidFlag()) { + 
ErrorManager::GetInstance().ATCReportErrMessage("E10027", {"input", "type", "index"}, + {"intput", "datatype or format", std::to_string(index)}); + GELOGE(PARAM_INVALID, "Input's dataType or format is invalid when the index is %d", index); + return false; + } if ((tensor_desc.type == DT_UNDEFINED && tensor_desc.format != FORMAT_RESERVED) || (tensor_desc.type != DT_UNDEFINED && tensor_desc.format == FORMAT_RESERVED)){ ErrorManager::GetInstance().ATCReportErrMessage("E10027", {"input", "type", "index"}, @@ -317,6 +331,12 @@ bool SingleOpParser::Validate(const SingleOpDesc &op_desc) { index = 0; for (auto &tensor_desc : op_desc.output_desc) { + if (!tensor_desc.GetValidFlag()) { + ErrorManager::GetInstance().ATCReportErrMessage("E10027", {"input", "type", "index"}, + {"output", "datatype", std::to_string(index)}); + GELOGE(PARAM_INVALID, "Output's dataType is invalid when the index is %d", index); + return false; + } if (tensor_desc.type == DT_UNDEFINED) { ErrorManager::GetInstance().ATCReportErrMessage("E10027", {"input", "type", "index"}, {"output", "datatype", std::to_string(index)}); diff --git a/ge/offline/single_op_parser.h b/ge/offline/single_op_parser.h index 19879a32..71aa58bb 100644 --- a/ge/offline/single_op_parser.h +++ b/ge/offline/single_op_parser.h @@ -28,6 +28,10 @@ namespace ge { struct SingleOpTensorDesc { +public: + bool GetValidFlag() const { return is_valid_; } + void SetValidFlag(bool is_valid) { is_valid_ = is_valid; } +public: std::string name; std::vector dims; std::vector ori_dims; @@ -36,6 +40,8 @@ struct SingleOpTensorDesc { ge::Format ori_format = ge::FORMAT_RESERVED; ge::DataType type = ge::DT_UNDEFINED; std::string dynamic_input_name; +private: + bool is_valid_ = true; }; struct SingleOpAttr { diff --git a/ge/omm/csa_interact.cc b/ge/omm/csa_interact.cc index 1599af94..1b33ddbd 100644 --- a/ge/omm/csa_interact.cc +++ b/ge/omm/csa_interact.cc @@ -202,7 +202,7 @@ Status CsaInteract::WriteFile(const std::string &file_name, const std::string 
&c } } - mmSsize_t ret = mmWrite(fd, (void *)content.c_str(), content.length()); + mmSsize_t ret = mmWrite(fd, reinterpret_cast(const_cast(content.c_str())), content.length()); if (ret == EN_ERROR) { GELOGE(INTERNAL_ERROR, "write file fail, errno is %d", errno); ret = mmClose(fd); diff --git a/ge/opskernel_manager/ops_kernel_builder_manager.cc b/ge/opskernel_manager/ops_kernel_builder_manager.cc index e0001fcd..37bdcf7a 100644 --- a/ge/opskernel_manager/ops_kernel_builder_manager.cc +++ b/ge/opskernel_manager/ops_kernel_builder_manager.cc @@ -167,4 +167,5 @@ Status OpsKernelBuilderManager::GenerateTask(const Node &node, GELOGD("Done invoking GenerateTask successfully"); return SUCCESS; } -} // namespace ge \ No newline at end of file + +} // namespace ge diff --git a/ge/opskernel_manager/ops_kernel_manager.cc b/ge/opskernel_manager/ops_kernel_manager.cc index 8134a463..30f39c0d 100644 --- a/ge/opskernel_manager/ops_kernel_manager.cc +++ b/ge/opskernel_manager/ops_kernel_manager.cc @@ -175,8 +175,8 @@ Status OpsKernelManager::ParsePluginOptions(const map &options, } else if (flag == 1) { enable_flag = true; } else { - GELOGE(GE_GRAPH_OPTIONS_INVALID, "option_key:%s, its value %s is invalid, it must be 0 or 1.", plugin_name.c_str(), - iter->second.c_str()); + GELOGE(GE_GRAPH_OPTIONS_INVALID, "option_key:%s, its value %s is invalid, it must be 0 or 1.", + plugin_name.c_str(), iter->second.c_str()); return GE_GRAPH_OPTIONS_INVALID; } } catch (std::invalid_argument &) { @@ -188,8 +188,8 @@ Status OpsKernelManager::ParsePluginOptions(const map &options, iter->second.c_str()); return GE_GRAPH_OPTIONS_INVALID; } catch (...) 
{ - GELOGE(GE_GRAPH_OPTIONS_INVALID, "option_key:%s, its value %s is invalid, it must be 0 or 1.", plugin_name.c_str(), - iter->second.c_str()); + GELOGE(GE_GRAPH_OPTIONS_INVALID, "option_key:%s, its value %s is invalid, it must be 0 or 1.", + plugin_name.c_str(), iter->second.c_str()); return GE_GRAPH_OPTIONS_INVALID; } } else { diff --git a/ge/session/omg.cc b/ge/session/omg.cc index df837f99..7ff52e82 100755 --- a/ge/session/omg.cc +++ b/ge/session/omg.cc @@ -68,6 +68,9 @@ const std::string kScopeIdAttr = "fusion_scope"; const char *const kOutputTypeSample = "correct sample is \"opname:index:dtype\""; const char *const kOutputTypeSupport = "only support FP32, FP16, UINT8"; const char *const kOutputTypeError = "The multiple out nodes set in output_type must be found in out_nodes."; +const size_t kNodeNameIndex = 0; +const size_t kIndexStrIndex = 1; +const size_t kDTValueIndex = 2; } // namespace // When the model is converted to a JSON file, the following operator attributes in the blacklist will be ignored @@ -381,14 +384,14 @@ Status ParseOutputType(const std::string &output_type, std::map(model.model_data); model.model_data = nullptr; } return status; @@ -902,7 +906,7 @@ FMK_FUNC_HOST_VISIBILITY Status ConvertOmModelToJson(const char *model_file, con if (status != ge::GRAPH_SUCCESS) { GELOGE(ge::FAILED, "Get model part failed."); if (model.model_data != nullptr) { - delete[](char *) model.model_data; + delete[] reinterpret_cast(model.model_data); model.model_data = nullptr; } return status; @@ -928,7 +932,7 @@ FMK_FUNC_HOST_VISIBILITY Status ConvertOmModelToJson(const char *model_file, con } if (model.model_data != nullptr) { - delete[](char *) model.model_data; + delete[] reinterpret_cast(model.model_data); model.model_data = nullptr; } return ret; diff --git a/ge/single_op/single_op.cc b/ge/single_op/single_op.cc index 371d7110..a2652b67 100755 --- a/ge/single_op/single_op.cc +++ b/ge/single_op/single_op.cc @@ -17,6 +17,7 @@ #include "single_op/single_op.h" 
#include "common/fmk_types.h" +#include "common/ge_types.h" #include "common/math/math_util.h" #include "common/profiling/profiling_manager.h" #include "framework/common/debug/ge_log.h" @@ -24,19 +25,60 @@ #include "graph/load/new_model_manager/model_utils.h" #include "runtime/mem.h" #include "single_op/single_op_manager.h" +#include "single_op/task/build_task_utils.h" #include "graph/load/new_model_manager/model_manager.h" namespace ge { namespace { const size_t kDataMemAlignSize = 32; +const size_t kDataMemAlignUnit = 2; size_t GetAlignedSize(size_t size) { - size_t aligned_size = (size + 2 * kDataMemAlignSize - 1) / kDataMemAlignSize * kDataMemAlignSize; + size_t aligned_size = (size + kDataMemAlignUnit * kDataMemAlignSize - 1) / kDataMemAlignSize * kDataMemAlignSize; return aligned_size; } + +Status ProfilingTaskInfo(OpTask *op_task) { + if (!ProfilingManager::Instance().ProfilingModelExecuteOn()) { + return SUCCESS; + } + + string model_name; + string op_name; + uint32_t model_id; + uint32_t block_dim; + if (op_task->GetProfilingArgs(model_name, op_name, model_id, block_dim) != SUCCESS) { + GELOGE(ACL_ERROR_GE_PARAM_INVALID, "Get profiling data of task failed"); + return ACL_ERROR_GE_PARAM_INVALID; + } + GELOGD("ProfilingReport of op[%s] model[%s] start.", op_name.c_str(), model_name.c_str()); + std::vector task_desc_info; + uint32_t task_id = 0; + uint32_t stream_id = 0; + if (rtGetTaskIdAndStreamID(&task_id, &stream_id) != RT_ERROR_NONE) { + GELOGE(ACL_ERROR_GE_PARAM_INVALID, "Get task_id and stream_id failed."); + return ACL_ERROR_GE_PARAM_INVALID; + } + + TaskDescInfo tmp_task_desc_info; + tmp_task_desc_info.model_name = model_name; + tmp_task_desc_info.op_name = op_name; + tmp_task_desc_info.block_dim = block_dim; + tmp_task_desc_info.task_id = task_id; + tmp_task_desc_info.stream_id = stream_id; + GELOGD("GetTaskDescInfo of op [%s] end, task_id[%u], stream_id[%u]", op_name.c_str(), task_id, stream_id); + task_desc_info.emplace_back(tmp_task_desc_info); + 
+ std::vector compute_graph_info; + + auto &profiling_manager = ProfilingManager::Instance(); + profiling_manager.ReportProfilingData(model_id, task_desc_info, compute_graph_info); + return SUCCESS; +} } // namespace -SingleOp::SingleOp(std::mutex *stream_mutex, rtStream_t stream) : stream_mutex_(stream_mutex), stream_(stream) { +SingleOp::SingleOp(StreamResource *stream_resource, std::mutex *stream_mutex, rtStream_t stream) + : stream_resource_(stream_resource), stream_mutex_(stream_mutex), stream_(stream) { } FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY SingleOp::~SingleOp() { @@ -68,7 +110,8 @@ Status SingleOp::ValidateArgs(const std::vector &inputs, const std:: auto num_outputs = outputs.size(); if (num_outputs != output_sizes_.size()) { - GELOGE(ACL_ERROR_GE_PARAM_INVALID, "output num mismatch. model expect %zu, but given %zu", output_sizes_.size(), outputs.size()); + GELOGE(ACL_ERROR_GE_PARAM_INVALID, "output num mismatch. model expect %zu, but given %zu", + output_sizes_.size(), outputs.size()); return ACL_ERROR_GE_PARAM_INVALID; } @@ -117,37 +160,6 @@ Status SingleOp::UpdateArgs(const std::vector &inputs, const std::ve *arg_addr = args_[i]; } } - // update aicpu_TF or aicpu_CC args - for (auto &task : tasks_) { - size_t io_addr_num = args_.size(); - if (task->GetOpTaskType() == OP_TASK_AICPU) { - GELOGD("Update aicpu_TF task args"); - task->SetIoAddrsForDump(args_); - auto *dst_io_addr = const_cast(reinterpret_cast(task->GetIOAddr())); - GE_CHECK_NOTNULL(dst_io_addr); - auto rt_ret = rtMemcpyAsync(dst_io_addr, - sizeof(uint64_t) * args_.size(), - &args_[0], - sizeof(uint64_t) * args_.size(), - RT_MEMCPY_HOST_TO_DEVICE_EX, - stream_); - if (rt_ret != RT_ERROR_NONE) { - GELOGE(rt_ret, "rtMemcpyAsync addresses failed, ret = %d", rt_ret); - return rt_ret; - } - } else if (task->GetOpTaskType() == OP_TASK_AICPUCC) { - GELOGD("Update aicpu_CC task args"); - const uintptr_t *task_io_addr = reinterpret_cast(task->GetIOAddr()); - GE_CHECK_NOTNULL(task_io_addr); 
- auto io_addr = reinterpret_cast(const_cast(task_io_addr)); - for (size_t i = 0; i < io_addr_num; ++i) { - io_addr[i] = static_cast(args_[i]); - } - } else { - GELOGW("Only TF_kernel aicpu and aicpu_CC are supported, but got %u", task->GetOpTaskType()); - continue; - } - } return SUCCESS; } @@ -158,7 +170,19 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status SingleOp::ExecuteAsync(c return ret; } + GE_CHECK_NOTNULL(stream_resource_); std::lock_guard lk(*stream_mutex_); + auto current_mem_base = stream_resource_->GetMemoryBase(); + if (running_param_->mem_base != current_mem_base) { + running_param_->mem_base = const_cast(current_mem_base); + GELOGD("Memory base changed, new memory base = %p", current_mem_base); + for (auto &task : tasks_) { + auto new_address = BuildTaskUtils::GetAddresses(task->GetOpdesc(), *running_param_); + GE_CHK_STATUS_RET(task->UpdateArgTable(*running_param_), + "[%s] Failed to update arg table", + task->GetOpdesc()->GetName().c_str()); + } + } ret = UpdateArgs(inputs, outputs); if (ret != SUCCESS) { return ret; @@ -169,6 +193,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status SingleOp::ExecuteAsync(c if (ret != SUCCESS) { return ret; } + GE_CHK_STATUS_RET_NOLOG(ProfilingTaskInfo(task)); } return ret; @@ -182,9 +207,6 @@ DynamicSingleOp::DynamicSingleOp(uintptr_t resource_id, std::mutex *stream_mutex : resource_id_(resource_id), stream_mutex_(stream_mutex), stream_(stream) { } -DynamicSingleOp::~DynamicSingleOp() { -} - Status DynamicSingleOp::ValidateParams(const vector &input_desc, const std::vector &inputs, std::vector &output_desc, @@ -206,63 +228,24 @@ Status DynamicSingleOp::ValidateParams(const vector &input_desc, } if (input_desc.size() != num_inputs_) { - GELOGE(ACL_ERROR_GE_PARAM_INVALID, "Input number mismatches. expect %zu, but given %zu", num_inputs_, input_desc.size()); + GELOGE(ACL_ERROR_GE_PARAM_INVALID, + "Input number mismatches. 
expect %zu, but given %zu", + num_inputs_, + input_desc.size()); return ACL_ERROR_GE_PARAM_INVALID; } if (output_desc.size() != num_outputs_) { - GELOGE(ACL_ERROR_GE_PARAM_INVALID, "Output number mismatches. expect %zu, but given %zu", num_outputs_, output_desc.size()); + GELOGE(ACL_ERROR_GE_PARAM_INVALID, + "Output number mismatches. expect %zu, but given %zu", + num_outputs_, + output_desc.size()); return ACL_ERROR_GE_PARAM_INVALID; } return SUCCESS; } -Status DynamicSingleOp::AllocateWorkspaces(const std::vector &workspace_sizes, - std::vector &workspaces) { - static const std::string kPurpose("malloc workspace memory for dynamic op."); - if (workspace_sizes.empty()) { - GELOGD("No need to allocate workspace."); - return SUCCESS; - } - int64_t total_size = 0; - std::vector ws_offsets; - for (auto ws_size : workspace_sizes) { - // alignment and padding should be done in OpParaCalculate - GE_CHK_STATUS_RET_NOLOG(CheckInt64AddOverflow(total_size, ws_size)); - ws_offsets.emplace_back(total_size); - total_size += ws_size; - } - - GELOGD("Total workspace size is %ld", total_size); - StreamResource *stream_resource = SingleOpManager::GetInstance().GetResource(resource_id_, stream_); - GE_CHECK_NOTNULL(stream_resource); - auto ws_base = stream_resource->MallocMemory(kPurpose, static_cast(total_size)); - if (ws_base == nullptr) { - GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "Failed to allocate memory of size: %ld", total_size); - return ACL_ERROR_GE_MEMORY_ALLOCATION; - } - GELOGD("Done allocating workspace memory successfully."); - - for (auto ws_offset : ws_offsets) { - workspaces.emplace_back(ws_base + ws_offset); - } - - return SUCCESS; -} - -Status DynamicSingleOp::ExecuteTbeTask(const vector &input_desc, - const vector &inputs, - vector &output_desc, - vector &outputs) { - GE_CHK_STATUS_RET_NOLOG(op_task_->UpdateRunInfo(input_desc, output_desc)); - - std::vector workspace_buffers; - GE_CHK_STATUS_RET_NOLOG(AllocateWorkspaces(op_task_->GetWorkspaceSizes(), 
workspace_buffers)); - - return op_task_->LaunchKernel(inputs, outputs, workspace_buffers, stream_); -} - Status DynamicSingleOp::ExecuteAsync(const vector &input_desc, const vector &input_buffers, vector &output_desc, @@ -271,24 +254,8 @@ Status DynamicSingleOp::ExecuteAsync(const vector &input_desc, GE_CHK_STATUS_RET_NOLOG(ValidateParams(input_desc, input_buffers, output_desc, output_buffers)); std::lock_guard lk(*stream_mutex_); - std::vector inputs; - std::vector outputs; - for (auto &buffer : input_buffers) { - inputs.emplace_back(buffer.data); - } - for (auto &buffer : output_buffers) { - outputs.emplace_back(buffer.data); - } - - if (op_task_->GetOpTaskType() == OP_TASK_TBE) { - return ExecuteTbeTask(input_desc, inputs, output_desc, outputs); - } else if (op_task_->GetOpTaskType() == OP_TASK_AICPU || op_task_->GetOpTaskType() == OP_TASK_AICPUCC) { - return op_task_->LaunchKernel(input_desc, input_buffers, output_desc, output_buffers, stream_); - } else { - GELOGE(ACL_ERROR_GE_OP_TASK_TYPE_INVALID, - "Only TBE_Task, AI_CPU_Task and AI_CPUCC_Task are supported, but got %u", - op_task_->GetOpTaskType()); - return ACL_ERROR_GE_OP_TASK_TYPE_INVALID; - } + GE_CHK_STATUS_RET_NOLOG(op_task_->LaunchKernel(input_desc, input_buffers, output_desc, output_buffers, stream_)); + GE_CHK_STATUS_RET_NOLOG(ProfilingTaskInfo(op_task_.get())); + return SUCCESS; } } // namespace ge diff --git a/ge/single_op/single_op.h b/ge/single_op/single_op.h index 14ef8ce1..d677f94a 100755 --- a/ge/single_op/single_op.h +++ b/ge/single_op/single_op.h @@ -30,9 +30,11 @@ #include "cce/aicpu_engine_struct.h" namespace ge { +class StreamResource; +struct SingleOpModelParam; class SingleOp { public: - SingleOp(std::mutex *stream_mutex, rtStream_t stream); + SingleOp(StreamResource *stream_resource, std::mutex *stream_mutex, rtStream_t stream); ~SingleOp(); Status ExecuteAsync(const std::vector &inputs, const std::vector &outputs); @@ -44,6 +46,7 @@ class SingleOp { Status GetArgs(const std::vector 
&inputs, const std::vector &outputs); friend class SingleOpModel; + StreamResource *stream_resource_; std::mutex *stream_mutex_; rtStream_t stream_ = nullptr; std::vector input_addr_list_; @@ -54,12 +57,13 @@ class SingleOp { std::vector tasks_; std::vector> arg_table_; + std::unique_ptr running_param_; }; class DynamicSingleOp { public: DynamicSingleOp(uintptr_t resource_id, std::mutex *stream_mutex_, rtStream_t stream); - ~DynamicSingleOp(); + ~DynamicSingleOp() = default; Status ExecuteAsync(const vector &input_desc, const std::vector &inputs, std::vector &output_desc, @@ -72,14 +76,6 @@ class DynamicSingleOp { std::vector &output_desc, std::vector &outputs) const; - Status AllocateWorkspaces(const std::vector &workspace_sizes, - std::vector &workspaces); - - Status ExecuteTbeTask(const vector &input_desc, - const vector &inputs, - vector &output_desc, - vector &outputs); - std::unique_ptr op_task_; uintptr_t resource_id_ = 0; std::mutex *stream_mutex_; diff --git a/ge/single_op/single_op_model.cc b/ge/single_op/single_op_model.cc index 49968f4f..a4a4b623 100755 --- a/ge/single_op/single_op_model.cc +++ b/ge/single_op/single_op_model.cc @@ -92,7 +92,8 @@ Status SingleOpModel::InitModelMem(StreamResource &res) { if (model_params_.memory_size > model_params_.zero_copy_mem_size) { const string purpose("malloc feature map memory on model execute."); GELOGI("total memory: %lu, zero_copy_mem: %lu", model_params_.memory_size, model_params_.zero_copy_mem_size); - model_params_.mem_base = res.MallocMemory(purpose, model_params_.memory_size - model_params_.zero_copy_mem_size); + model_params_.mem_base = + res.MallocMemory(purpose, model_params_.memory_size - model_params_.zero_copy_mem_size, false); if (model_params_.mem_base == nullptr) { return ACL_ERROR_GE_MEMORY_ALLOCATION; } @@ -157,6 +158,7 @@ Status SingleOpModel::LoadAllNodes() { auto ge_model = model_helper_.GetGeModel(); GE_CHECK_NOTNULL(ge_model); Graph graph = ge_model->GetGraph(); + model_id_ = 
ge_model->GetModelId(); auto compute_graph = GraphUtils::GetComputeGraph(graph); if (compute_graph == nullptr) { GELOGE(ACL_ERROR_GE_INTERNAL_ERROR, "[%s] compute_graph is null", model_name_.c_str()); @@ -225,9 +227,10 @@ Status SingleOpModel::SetInputsAndOutputs(SingleOp &single_op) { return SUCCESS; } -Status SingleOpModel::BuildTaskList(SingleOp &single_op) { +Status SingleOpModel::BuildTaskList(StreamResource *stream_resource, SingleOp &single_op) { auto ge_model = model_helper_.GetGeModel(); GE_CHECK_NOTNULL(ge_model); + single_op.arg_table_.resize(single_op.input_sizes_.size() + single_op.output_sizes_.size()); auto tasks = ge_model->GetModelTaskDefPtr()->task(); for (int i = 0; i < tasks.size(); ++i) { const TaskDef &task_def = tasks[i]; @@ -237,8 +240,8 @@ Status SingleOpModel::BuildTaskList(SingleOp &single_op) { if (task_type == RT_MODEL_TASK_KERNEL) { const domi::KernelDef &kernel_def = task_def.kernel(); const auto &context = kernel_def.context(); - auto kernel_type = static_cast(context.kernel_type()); - if (kernel_type == cce::ccKernelType::TE) { + auto kernel_type = static_cast(context.kernel_type()); + if (kernel_type == ccKernelType::TE) { GELOGD("Building TBE task"); TbeOpTask *tbe_task = nullptr; auto ret = BuildKernelTask(task_def.kernel(), &tbe_task); @@ -246,10 +249,13 @@ Status SingleOpModel::BuildTaskList(SingleOp &single_op) { return ret; } - single_op.arg_table_.resize(single_op.input_sizes_.size() + single_op.output_sizes_.size()); ParseArgTable(tbe_task, single_op); + tbe_task->SetModelArgs(model_name_, model_id_); + if (tbe_task->tiling_buffer_ != nullptr) { + tbe_task->stream_resource_ = stream_resource; + } single_op.tasks_.emplace_back(tbe_task); - } else if (kernel_type == cce::ccKernelType::AI_CPU || kernel_type == cce::ccKernelType::CUST_AI_CPU) { + } else if (kernel_type == ccKernelType::AI_CPU || kernel_type == ccKernelType::CUST_AI_CPU) { GELOGD("Building AICPU_CC task"); OpTask *task = nullptr; uint64_t singleop_kernel_id = 
aicpu_kernel_id++; @@ -258,9 +264,12 @@ Status SingleOpModel::BuildTaskList(SingleOp &single_op) { if (ret != SUCCESS) { return ret; } + task->SetModelArgs(model_name_, model_id_); + ParseArgTable(task, single_op); single_op.tasks_.emplace_back(task); } else { - GELOGE(ACL_ERROR_GE_OP_KERNEL_TYPE_INVALID, "Only TBE, AI_CPU, CUST_AI_CPU kernel are supported, but got %u", context.kernel_type()); + GELOGE(ACL_ERROR_GE_OP_KERNEL_TYPE_INVALID, + "Only TBE, AI_CPU, CUST_AI_CPU kernel are supported, but got %u", context.kernel_type()); return ACL_ERROR_GE_OP_KERNEL_TYPE_INVALID; } } else if (task_type == RT_MODEL_TASK_KERNEL_EX) { @@ -273,6 +282,8 @@ Status SingleOpModel::BuildTaskList(SingleOp &single_op) { if (ret != SUCCESS) { return ret; } + aicpu_task->SetModelArgs(model_name_, model_id_); + ParseArgTable(aicpu_task, single_op); single_op.tasks_.emplace_back(aicpu_task); } else { // skip @@ -282,21 +293,23 @@ Status SingleOpModel::BuildTaskList(SingleOp &single_op) { return SUCCESS; } -void SingleOpModel::ParseArgTable(TbeOpTask *task, SingleOp &op) { +void SingleOpModel::ParseArgTable(OpTask *task, SingleOp &op) { if (task == nullptr) { GELOGE(ACL_ERROR_GE_INTERNAL_ERROR, "tbe op task is nullptr"); return; } + // args: addr1, addr2, addr3 ... 
- auto *args = const_cast(reinterpret_cast(task->GetArgs())); - size_t arg_size = task->GetArgSize(); - for (size_t i = 0; i < arg_size / sizeof(void *); ++i) { - uintptr_t *ptr_to_addr = args + i; + uintptr_t *arg_base = nullptr; + size_t arg_num = 0; + task->GetIoAddr(arg_base, arg_num); + for (size_t i = 0; i < arg_num; ++i) { + uintptr_t *ptr_to_addr = arg_base + i; uintptr_t addr = *ptr_to_addr; auto iter = model_params_.addr_mapping_.find(addr); if (iter != model_params_.addr_mapping_.end()) { int arg_index = iter->second; - GELOGI("%s args[%zu] mapped to user designated args[%d]", task->GetStubName().c_str(), i, arg_index); + GELOGI("%s args[%zu] mapped to user designated args[%d]", task->GetOpdesc()->GetName().c_str(), i, arg_index); op.arg_table_[iter->second].emplace_back(ptr_to_addr); } } @@ -368,7 +381,7 @@ Status SingleOpModel::BuildCpuKernelTask(const domi::KernelDef &kernel_def, OpTa } auto builder = AiCpuCCTaskBuilder(iter->second->GetOpDesc(), kernel_def); - auto ret = builder.BuildTask(*aicpucc_task, kernel_id); + auto ret = builder.BuildTask(*aicpucc_task, kernel_id, model_params_); if (ret != SUCCESS) { GELOGE(ret, "build aicpu_CC op task failed"); return ret; @@ -381,25 +394,29 @@ Status SingleOpModel::BuildCpuKernelTask(const domi::KernelDef &kernel_def, OpTa Status SingleOpModel::BuildOp(StreamResource &resource, SingleOp &single_op) { GE_CHK_STATUS_RET_NOLOG(ParseInputsAndOutputs()); GE_CHK_STATUS_RET_NOLOG(InitModelMem(resource)); + single_op.running_param_.reset(new (std::nothrow)SingleOpModelParam(model_params_)); + GE_CHECK_NOTNULL(single_op.running_param_); GE_CHK_STATUS_RET_NOLOG(SetInputsAndOutputs(single_op)); - return BuildTaskList(single_op); + return BuildTaskList(&resource, single_op); } Status SingleOpModel::BuildModelTaskKernel(const TaskDef &task_def, DynamicSingleOp &single_op) { const domi::KernelDef &kernel_def = task_def.kernel(); const auto &context = kernel_def.context(); - auto kernel_type = 
static_cast(context.kernel_type()); - if (kernel_type == cce::ccKernelType::TE) { + auto kernel_type = static_cast(context.kernel_type()); + if (kernel_type == ccKernelType::TE) { GELOGD("Building TBE task"); TbeOpTask *tbe_task = nullptr; GE_CHK_STATUS_RET_NOLOG(BuildKernelTask(task_def.kernel(), &tbe_task)); + tbe_task->SetModelArgs(model_name_, model_id_); single_op.op_task_.reset(tbe_task); - } else if (kernel_type == cce::ccKernelType::AI_CPU || kernel_type == cce::ccKernelType::CUST_AI_CPU) { + } else if (kernel_type == ccKernelType::AI_CPU || kernel_type == ccKernelType::CUST_AI_CPU) { GELOGD("Building AICPU_CC task"); OpTask *task = nullptr; uint64_t dynamic_singleop_kernel_id = aicpu_kernel_id++; GELOGI("Build dynamic singleOp CCTask, kernel_id = %lu", dynamic_singleop_kernel_id); GE_CHK_STATUS_RET_NOLOG(BuildCpuKernelTask(task_def.kernel(), &task, dynamic_singleop_kernel_id)); + task->SetModelArgs(model_name_, model_id_); single_op.op_task_.reset(task); } else { GELOGE(ACL_ERROR_GE_OP_KERNEL_TYPE_INVALID, @@ -446,6 +463,7 @@ Status SingleOpModel::BuildTaskListForDynamicOp(DynamicSingleOp &single_op) { const TaskDef ©_task_def = tasks[i]; GE_CHK_STATUS_RET_NOLOG(aicpu_task->SetMemCopyTask(copy_task_def.kernel_ex())); } + aicpu_task->SetModelArgs(model_name_, model_id_); single_op.op_task_.reset(aicpu_task); } else { // skip diff --git a/ge/single_op/single_op_model.h b/ge/single_op/single_op_model.h index 50aeb7ab..c3164543 100755 --- a/ge/single_op/single_op_model.h +++ b/ge/single_op/single_op_model.h @@ -65,7 +65,7 @@ class SingleOpModel { Status ParseInputNode(const OpDescPtr &op_desc); void ParseOutputNode(const OpDescPtr &op_desc); - Status BuildTaskList(SingleOp &single_op); + Status BuildTaskList(StreamResource *stream_resource, SingleOp &single_op); Status BuildTaskListForDynamicOp(DynamicSingleOp &dynamic_single_op); Status BuildKernelTask(const domi::KernelDef &kernel_def, TbeOpTask **task); Status BuildKernelExTask(const domi::KernelExDef 
&kernel_def, AiCpuTask **task, @@ -74,9 +74,10 @@ class SingleOpModel { Status BuildModelTaskKernel(const domi::TaskDef &task_def, DynamicSingleOp &single_op); static void ParseOpModelParams(ModelHelper &model_helper, SingleOpModelParam ¶m); - void ParseArgTable(TbeOpTask *task, SingleOp &op); + void ParseArgTable(OpTask *task, SingleOp &op); std::string model_name_; + uint32_t model_id_ = 0; const void *ori_model_data_; uint32_t ori_model_size_; diff --git a/ge/single_op/stream_resource.cc b/ge/single_op/stream_resource.cc index f545b6c8..722a1024 100755 --- a/ge/single_op/stream_resource.cc +++ b/ge/single_op/stream_resource.cc @@ -69,11 +69,25 @@ uint8_t *StreamResource::DoMallocMemory(const std::string &purpose, size_t size, size_t &max_allocated, std::vector &allocated) { + if (size == 0) { + GELOGD("Mem size == 0"); + return nullptr; + } + if (size <= max_allocated && !allocated.empty()) { GELOGD("reuse last memory"); return allocated.back(); } + if (!allocated.empty()) { + uint8_t *current_buffer = allocated.back(); + allocated.pop_back(); + if (rtStreamSynchronize(stream_) != RT_ERROR_NONE) { + GELOGW("Failed to invoke rtStreamSynchronize"); + } + (void) rtFree(current_buffer); + } + uint8_t *buffer = nullptr; auto ret = rtMalloc(reinterpret_cast(&buffer), size, RT_MEMORY_HBM); if (ret != RT_ERROR_NONE) { @@ -96,10 +110,14 @@ uint8_t *StreamResource::DoMallocMemory(const std::string &purpose, return buffer; } -uint8_t *StreamResource::MallocMemory(const std::string &purpose, size_t size) { +uint8_t *StreamResource::MallocMemory(const std::string &purpose, size_t size, bool holding_lock) { GELOGD("To Malloc memory, size = %zu", size); - uint8_t *buffer = DoMallocMemory(purpose, size, max_memory_size_, memory_list_); - return buffer; + if (holding_lock) { + return DoMallocMemory(purpose, size, max_memory_size_, memory_list_); + } else { + std::lock_guard lk(stream_mu_); + return DoMallocMemory(purpose, size, max_memory_size_, memory_list_); + } } uint8_t 
*StreamResource::MallocWeight(const std::string &purpose, size_t size) { @@ -158,7 +176,7 @@ Status StreamResource::BuildOperator(const string &model_name, const ModelData & return ret; } - auto new_op = std::unique_ptr(new(std::nothrow) SingleOp(&stream_mu_, stream_)); + auto new_op = std::unique_ptr(new(std::nothrow) SingleOp(this, &stream_mu_, stream_)); if (new_op == nullptr) { GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "new SingleOp failed"); return ACL_ERROR_GE_MEMORY_ALLOCATION; @@ -171,4 +189,12 @@ Status StreamResource::BuildOperator(const string &model_name, const ModelData & op_map_[model_data.model_data] = std::move(new_op); return SUCCESS; } + +const uint8_t *StreamResource::GetMemoryBase() const { + if (memory_list_.empty()) { + return nullptr; + } + + return memory_list_.back(); +} } // namespace ge diff --git a/ge/single_op/stream_resource.h b/ge/single_op/stream_resource.h index 39f08ebe..d5bc941a 100755 --- a/ge/single_op/stream_resource.h +++ b/ge/single_op/stream_resource.h @@ -45,8 +45,9 @@ class StreamResource { Status BuildOperator(const std::string &model_name, const ModelData &model_data, SingleOp **single_op); Status BuildDynamicOperator(const std::string &model_name, const ModelData &model_data, DynamicSingleOp **single_op); - uint8_t *MallocMemory(const std::string &purpose, size_t size); + uint8_t *MallocMemory(const std::string &purpose, size_t size, bool holding_lock = true); uint8_t *MallocWeight(const std::string &purpose, size_t size); + const uint8_t *GetMemoryBase() const; private: uint8_t *DoMallocMemory(const std::string &purpose, diff --git a/ge/single_op/task/aicpu_kernel_task_builder.cc b/ge/single_op/task/aicpu_kernel_task_builder.cc index 26f6a166..f8a2bd1b 100755 --- a/ge/single_op/task/aicpu_kernel_task_builder.cc +++ b/ge/single_op/task/aicpu_kernel_task_builder.cc @@ -15,19 +15,24 @@ */ #include "single_op/task/aicpu_kernel_task_builder.h" -#include "cce/taskdown_common.hpp" +#include "framework/common/taskdown_common.h" 
#include "graph/load/new_model_manager/model_manager.h" +#include "build_task_utils.h" namespace ge { AiCpuCCTaskBuilder::AiCpuCCTaskBuilder(const OpDescPtr &op_desc, const domi::KernelDef &kernel_def) : op_desc_(op_desc), kernel_def_(kernel_def) {} -Status AiCpuCCTaskBuilder::SetKernelArgs(AiCpuCCTask &task) { +Status AiCpuCCTaskBuilder::SetKernelArgs(AiCpuCCTask &task, const SingleOpModelParam ¶m) { size_t aicpu_arg_size = kernel_def_.args_size(); - if (aicpu_arg_size <= 0) { + if (aicpu_arg_size <= sizeof(aicpu::AicpuParamHead)) { GELOGE(ACL_ERROR_GE_PARAM_INVALID, "aicpu_arg_size is invalid, value = %zu", aicpu_arg_size); return ACL_ERROR_GE_PARAM_INVALID; } + + task.io_addr_num_ = op_desc_->GetInputsSize() + op_desc_->GetOutputsSize(); + GE_CHECK_GE(aicpu_arg_size - sizeof(aicpu::AicpuParamHead), task.io_addr_num_ * sizeof(void *)); + std::unique_ptr aicpu_args; aicpu_args.reset(new(std::nothrow) uint8_t[aicpu_arg_size]()); if (aicpu_args == nullptr) { @@ -41,13 +46,19 @@ Status AiCpuCCTaskBuilder::SetKernelArgs(AiCpuCCTask &task) { return ACL_ERROR_GE_INTERNAL_ERROR; } - task.SetIoAddr(aicpu_args.get() + sizeof(aicpu::AicpuParamHead)); + task.SetIoAddr(reinterpret_cast(aicpu_args.get() + sizeof(aicpu::AicpuParamHead))); task.SetKernelArgs(std::move(aicpu_args), aicpu_arg_size); + + auto addresses = BuildTaskUtils::GetKernelArgs(op_desc_, param); + GE_CHECK_GE(addresses.size(), task.io_addr_num_); + for (size_t i = 0; i < task.io_addr_num_; ++i) { + task.io_addr_[i] = reinterpret_cast(addresses[i]); + } return SUCCESS; } -Status AiCpuCCTaskBuilder::BuildTask(AiCpuCCTask &task, uint64_t kernel_id) { - auto ret = SetKernelArgs(task); +Status AiCpuCCTaskBuilder::BuildTask(AiCpuCCTask &task, uint64_t kernel_id, const SingleOpModelParam ¶m) { + auto ret = SetKernelArgs(task, param); if (ret != SUCCESS) { return ret; } @@ -58,12 +69,16 @@ Status AiCpuCCTaskBuilder::BuildTask(AiCpuCCTask &task, uint64_t kernel_id) { task.op_desc_ = op_desc_; const auto &context = 
kernel_def_.context(); - auto kernel_type = static_cast(context.kernel_type()); - if (kernel_type == cce::ccKernelType::CUST_AI_CPU) { + auto kernel_type = static_cast(context.kernel_type()); + if (kernel_type == ccKernelType::CUST_AI_CPU) { task.is_custom_ = true; task.dump_flag_ |= RT_KERNEL_CUSTOM_AICPU; - GE_CHK_STATUS_RET(ModelManager::GetInstance()->LoadCustAicpuSo(op_desc_, so_name), "launch cust aicpu so failed"); - GE_CHK_STATUS_RET(ModelManager::GetInstance()->LaunchCustAicpuSo(), "launch cust aicpu so failed."); + bool loaded = false; + GE_CHK_STATUS_RET(ModelManager::GetInstance()->LoadCustAicpuSo(op_desc_, so_name, loaded), + "launch cust aicpu so failed"); + if (!loaded) { + GE_CHK_STATUS_RET(ModelManager::GetInstance()->LaunchCustAicpuSo(), "launch cust aicpu so failed."); + } } task.num_inputs_ = op_desc_->GetInputsSize(); @@ -82,6 +97,10 @@ Status AiCpuCCTaskBuilder::BuildTask(AiCpuCCTask &task, uint64_t kernel_id) { return ret; } + if (task.GetUnknownType() == DEPEND_COMPUTE) { + GELOGE(FAILED, "AiCpuCCTask unknown type is depend compute, it's not supported now."); + return FAILED; + } auto aicpu_param_head = reinterpret_cast(task.args_.get()); if (task.ext_info_addr_dev_ != nullptr) { aicpu_param_head->extInfoLength = kernel_ext_info.size(); diff --git a/ge/single_op/task/aicpu_kernel_task_builder.h b/ge/single_op/task/aicpu_kernel_task_builder.h index e77e3c10..85d5034d 100755 --- a/ge/single_op/task/aicpu_kernel_task_builder.h +++ b/ge/single_op/task/aicpu_kernel_task_builder.h @@ -30,10 +30,10 @@ class AiCpuCCTaskBuilder { explicit AiCpuCCTaskBuilder(const OpDescPtr &op_desc, const domi::KernelDef &kernel_def); ~AiCpuCCTaskBuilder() = default; - Status BuildTask(AiCpuCCTask &task, uint64_t kernel_id); + Status BuildTask(AiCpuCCTask &task, uint64_t kernel_id, const SingleOpModelParam ¶m); private: - Status SetKernelArgs(AiCpuCCTask &task); + Status SetKernelArgs(AiCpuCCTask &task, const SingleOpModelParam ¶m); const OpDescPtr op_desc_; const 
domi::KernelDef &kernel_def_; }; diff --git a/ge/single_op/task/aicpu_task_builder.cc b/ge/single_op/task/aicpu_task_builder.cc index 8f28ffda..0cc5c554 100755 --- a/ge/single_op/task/aicpu_task_builder.cc +++ b/ge/single_op/task/aicpu_task_builder.cc @@ -26,26 +26,6 @@ namespace ge { AiCpuTaskBuilder::AiCpuTaskBuilder(const OpDescPtr &op_desc, const domi::KernelExDef &kernel_def) : op_desc_(op_desc), kernel_def_(kernel_def) {} - Status AiCpuTaskBuilder::SetInputOutputAddr(void **io_addr, const std::vector &addresses) { - size_t arg_size = kernel_def_.args_size(); - auto rt_ret = rtMalloc(io_addr, arg_size, RT_MEMORY_HBM); - if (rt_ret != RT_ERROR_NONE) { - GELOGE(rt_ret, "rtMalloc failed, size = %zu, ret = %d", arg_size, rt_ret); - return rt_ret; - } - - const void *src_addr = reinterpret_cast(addresses.data()); - uint64_t src_len = sizeof(void *) * addresses.size(); - rt_ret = rtMemcpy(*io_addr, arg_size, src_addr, src_len, RT_MEMCPY_HOST_TO_DEVICE); - if (rt_ret != RT_ERROR_NONE) { - (void)rtFree(*io_addr); - GELOGE(rt_ret, "rtMemcpy addresses failed, ret = %d", rt_ret); - return rt_ret; - } - - return SUCCESS; - } - Status AiCpuTaskBuilder::SetFmkOpKernel(void *io_addr, void *ws_addr, STR_FWK_OP_KERNEL &fwk_op_kernel) { auto sec_ret = memcpy_s(&fwk_op_kernel, sizeof(STR_FWK_OP_KERNEL), kernel_def_.args().data(), kernel_def_.args().size()); @@ -80,39 +60,27 @@ namespace ge { return SUCCESS; } - Status AiCpuTaskBuilder::InitWorkspaceAndIO(void **io_addr, void **kernel_workspace, - const SingleOpModelParam ¶m, bool dynamic_flag) { + Status AiCpuTaskBuilder::InitWorkspaceAndIO(AiCpuTask &task, const SingleOpModelParam ¶m, bool dynamic_flag) { if (kernel_def_.args_size() > sizeof(STR_FWK_OP_KERNEL)) { GELOGE(ACL_ERROR_GE_PARAM_INVALID, "sizeof STR_FWK_OP_KERNEL is: %lu, but args_size is: %d", sizeof(STR_FWK_OP_KERNEL), kernel_def_.args_size()); return ACL_ERROR_GE_PARAM_INVALID; } - auto addresses = BuildTaskUtils::GetAddresses(op_desc_, param); - auto ws_addr_vec = 
addresses.at(BuildTaskUtils::kAddressIndexWorkspace); - - if (dynamic_flag) { - GE_CHK_RT_RET(rtMalloc(kernel_workspace, kernel_def_.task_info_size(), RT_MEMORY_HBM)); - } else { - if (ws_addr_vec.empty()) { - GELOGE(ACL_ERROR_GE_PARAM_INVALID, "workspace Data Address is empty."); - return ACL_ERROR_GE_PARAM_INVALID; - } - *kernel_workspace = ws_addr_vec[0]; - } - GE_CHK_RT_RET(rtMemcpy(*kernel_workspace, kernel_def_.task_info_size(), + GE_CHK_RT_RET(rtMalloc(&task.workspace_addr_, kernel_def_.task_info_size(), RT_MEMORY_HBM)); + GE_CHK_RT_RET(rtMemcpy(task.workspace_addr_, kernel_def_.task_info_size(), kernel_def_.task_info().data(), kernel_def_.task_info_size(), RT_MEMCPY_HOST_TO_DEVICE)); - auto ret = SetInputOutputAddr(io_addr, BuildTaskUtils::JoinAddresses(addresses)); - if (ret != SUCCESS) { - return ret; - } + auto addresses = BuildTaskUtils::GetAddresses(op_desc_, param, false); + task.io_addr_host_ = BuildTaskUtils::JoinAddresses(addresses); + task.io_addr_size_ = task.io_addr_host_.size() * sizeof(void *); + GE_CHK_RT_RET(rtMalloc(&task.io_addr_, task.io_addr_size_, RT_MEMORY_HBM)); return SUCCESS; } Status AiCpuTaskBuilder::BuildTask(ge::AiCpuTask &task, const SingleOpModelParam ¶m, bool dynamic_flag, uint64_t kernel_id) { - GE_CHK_STATUS_RET_NOLOG(InitWorkspaceAndIO(&task.io_addr_, &task.workspace_addr_, param, dynamic_flag)); + GE_CHK_STATUS_RET_NOLOG(InitWorkspaceAndIO(task, param, dynamic_flag)); STR_FWK_OP_KERNEL fwk_op_kernel = {0}; auto ret = SetFmkOpKernel(task.io_addr_, task.workspace_addr_, fwk_op_kernel); diff --git a/ge/single_op/task/aicpu_task_builder.h b/ge/single_op/task/aicpu_task_builder.h index 4669e118..fe9c9bc2 100755 --- a/ge/single_op/task/aicpu_task_builder.h +++ b/ge/single_op/task/aicpu_task_builder.h @@ -33,10 +33,8 @@ namespace ge { private: static Status SetKernelArgs(void **args, STR_FWK_OP_KERNEL &kernel); - Status SetInputOutputAddr(void **io_addr, const std::vector &addresses); Status SetFmkOpKernel(void *io_addr, void 
*ws_addr, STR_FWK_OP_KERNEL &kernel); - Status InitWorkspaceAndIO(void **io_addr, void **kernel_workspace, - const SingleOpModelParam ¶m, bool dynamic_flag); + Status InitWorkspaceAndIO(AiCpuTask &task, const SingleOpModelParam ¶m, bool dynamic_flag); const OpDescPtr op_desc_; const domi::KernelExDef &kernel_def_; diff --git a/ge/single_op/task/build_task_utils.cc b/ge/single_op/task/build_task_utils.cc index 29f1657b..071e514b 100644 --- a/ge/single_op/task/build_task_utils.cc +++ b/ge/single_op/task/build_task_utils.cc @@ -32,7 +32,8 @@ const uint64_t kVarSize = 0; } std::vector> BuildTaskUtils::GetAddresses(const OpDescPtr &op_desc, - const SingleOpModelParam ¶m) { + const SingleOpModelParam ¶m, + bool keep_workspace) { std::vector> ret; RuntimeParam runtime_para; runtime_para.mem_size = param.memory_size; @@ -49,7 +50,9 @@ std::vector> BuildTaskUtils::GetAddresses(const OpDescPtr &o ret.emplace_back(ModelUtils::GetInputDataAddrs(runtime_para, op_desc)); ret.emplace_back(ModelUtils::GetOutputDataAddrs(runtime_para, op_desc)); - ret.emplace_back(ModelUtils::GetWorkspaceDataAddrs(runtime_para, op_desc)); + if (keep_workspace) { + ret.emplace_back(ModelUtils::GetWorkspaceDataAddrs(runtime_para, op_desc)); + } return ret; } diff --git a/ge/single_op/task/build_task_utils.h b/ge/single_op/task/build_task_utils.h index cddc7a2b..7a2369e4 100644 --- a/ge/single_op/task/build_task_utils.h +++ b/ge/single_op/task/build_task_utils.h @@ -27,15 +27,17 @@ namespace ge { class BuildTaskUtils { public: + static constexpr int kAddressIndexOutput = 1; static constexpr int kAddressIndexWorkspace = 2; - static std::vector> GetAddresses(const OpDescPtr &op_desc, const SingleOpModelParam ¶m); + static std::vector> GetAddresses(const OpDescPtr &op_desc, + const SingleOpModelParam ¶m, + bool keep_workspace = true); static std::vector JoinAddresses(const std::vector> &addresses); static std::vector GetKernelArgs(const OpDescPtr &op_desc, const SingleOpModelParam ¶m); static std::string 
GetTaskInfo(const OpDescPtr &op_desc); template - static std::string VectorToString(const std::vector &values) - { + static std::string VectorToString(const std::vector &values) { std::stringstream ss; ss << '['; auto size = values.size(); diff --git a/ge/single_op/task/op_task.cc b/ge/single_op/task/op_task.cc index c3c4e5bb..22433ec9 100755 --- a/ge/single_op/task/op_task.cc +++ b/ge/single_op/task/op_task.cc @@ -24,9 +24,11 @@ #include "common/dump/dump_manager.h" #include "common/dump/dump_op.h" #include "common/formats/formats.h" +#include "common/math/math_util.h" #include "framework/common/debug/log.h" #include "register/op_tiling.h" #include "runtime/rt.h" +#include "build_task_utils.h" namespace ge { namespace { @@ -48,18 +50,22 @@ Status OpTask::OpenDump(rtStream_t stream) { std::vector output_adds; auto input_size = op_desc_->GetInputsSize(); auto output_size = op_desc_->GetOutputsSize(); - auto all_size = io_addrs_for_dump_.size(); - if (input_size + output_size != all_size) { - GELOGE(FAILED, "io_addrs_for_dump_ size %zu is not equal input and output size %zu", all_size, + uintptr_t *arg_base = nullptr; + size_t arg_num = 0; + GetIoAddr(arg_base, arg_num); + if (arg_num < input_size + output_size) { + GELOGE(FAILED, "io_addrs_for_dump_ size %zu is not equal input and output size %zu", + arg_num, input_size + output_size); return FAILED; } + for (size_t i = 0; i < input_size; i++) { - uint64_t input_addr = io_addrs_for_dump_[i]; + uint64_t input_addr = arg_base[i]; input_addrs.emplace_back(input_addr); } for (size_t j = 0; j < output_size; j++) { - uint64_t output_addr = io_addrs_for_dump_[input_size + j]; + uint64_t output_addr = arg_base[input_size + j]; output_adds.emplace_back(output_addr); } dump_op_.SetDumpInfo(DumpManager::GetInstance().GetDumpProperties(), op_desc_, input_addrs, output_adds, stream); @@ -89,9 +95,50 @@ void TbeOpTask::SetKernelArgs(std::unique_ptr &&args, size_t arg_size void TbeOpTask::SetSmDesc(void *sm_desc) { sm_desc_ = 
sm_desc; } -const vector &OpTask::GetWorkspaceSizes() const { return workspace_sizes_; } +void OpTask::SetModelArgs(std::string model_name, uint32_t model_id) { + model_name_ = model_name; + model_id_ = model_id; +} -void OpTask::SetWorkspaceSizes(const vector &workspace_sizes) { workspace_sizes_ = workspace_sizes; } +Status OpTask::GetProfilingArgs(std::string &model_name, std::string &op_name, uint32_t &model_id, + uint32_t &block_dim) { + model_name = model_name_; + model_id = model_id_; + block_dim = block_dim_; + GE_CHECK_NOTNULL(op_desc_); + op_name = op_desc_->GetName(); + return SUCCESS; +} +Status OpTask::UpdateRunInfo(const vector &input_desc, const vector &output_desc) { + return UNSUPPORTED; +} +Status OpTask::UpdateArgTable(const SingleOpModelParam ¶m) { + auto addresses = BuildTaskUtils::GetAddresses(op_desc_, param); + auto all_addresses = BuildTaskUtils::JoinAddresses(addresses); + uintptr_t *arg_base = nullptr; + size_t arg_num = 0; + GetIoAddr(arg_base, arg_num); + if (arg_num != all_addresses.size()) { + GELOGE(INTERNAL_ERROR, "[%s] arg number mismatches, expect = %zu, but got = %zu", + op_desc_->GetName().c_str(), + arg_num, + all_addresses.size()); + return INTERNAL_ERROR; + } + + for (void *addr : all_addresses) { + *arg_base++ = reinterpret_cast(addr); + } + return SUCCESS; +} + +Status OpTask::LaunchKernel(const vector &input_desc, + const vector &input_buffers, + vector &output_desc, + vector &output_buffers, + rtStream_t stream) { + return UNSUPPORTED; +} TbeOpTask::~TbeOpTask() { if (sm_desc_ != nullptr) { @@ -126,12 +173,6 @@ Status TbeOpTask::LaunchKernel(rtStream_t stream) { return RT_FAILED; } GELOGI("[TASK_INFO] %s", this->stub_name_.c_str()); - - size_t input_size = op_desc_->GetInputsSize(); - size_t output_size = op_desc_->GetOutputsSize(); - uint64_t *io_addr = reinterpret_cast(args_.get()); - std::vector io_addrs(io_addr, io_addr + input_size + output_size); - SetIoAddrsForDump(io_addrs); auto status = OpenDump(stream); if 
(status != SUCCESS) { GELOGE(status, "Open dump failed in the tbe single op %s", this->stub_name_.c_str()); @@ -152,11 +193,12 @@ Status TbeOpTask::UpdateRunInfo(const vector &input_desc, const ve GELOGE(FAILED, "Failed to invoke OpParaCalculate. ret = %u", ret); return FAILED; } - SetWorkspaceSizes(run_info.workspaces); block_dim_ = run_info.block_dim; tiling_data_ = run_info.tiling_data.str(); GELOGD("Done invoking OpParaCalculate successfully. block_dim = %u, tiling size = %zu", block_dim_, tiling_data_.size()); + + GE_CHK_STATUS_RET(AllocateWorkspaces(run_info.workspaces), "Failed to allocate workspaces"); return SUCCESS; } @@ -212,13 +254,54 @@ void TbeOpTask::EnableDynamicSupport(const NodePtr &node, void *tiling_buffer, s max_tiling_size_ = max_tiling_size; } -Status TbeOpTask::LaunchKernel(const vector &inputs, const vector &outputs, - const vector &workspaces, rtStream_t stream) { +Status TbeOpTask::AllocateWorkspaces(const vector &workspace_sizes) { + static const std::string kPurpose("malloc workspace memory for dynamic op."); + if (workspace_sizes.empty()) { + GELOGD("No need to allocate workspace."); + return SUCCESS; + } + int64_t total_size = 0; + std::vector ws_offsets; + for (auto ws_size : workspace_sizes) { + // alignment and padding should be done in OpParaCalculate + GE_CHK_STATUS_RET_NOLOG(CheckInt64AddOverflow(total_size, ws_size)); + ws_offsets.emplace_back(total_size); + total_size += ws_size; + } + + GELOGD("Total workspace size is %ld", total_size); + GE_CHECK_NOTNULL(stream_resource_); + auto ws_base = stream_resource_->MallocMemory(kPurpose, static_cast(total_size)); + if (ws_base == nullptr) { + GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "Failed to allocate memory of size: %ld", total_size); + return ACL_ERROR_GE_MEMORY_ALLOCATION; + } + GELOGD("Done allocating workspace memory successfully."); + + for (auto ws_offset : ws_offsets) { + workspaces_.emplace_back(ws_base + ws_offset); + } + + return SUCCESS; +} + +Status 
TbeOpTask::LaunchKernel(const vector &input_desc, + const vector &input_buffers, + vector &output_desc, + vector &output_buffers, + rtStream_t stream) { + GE_CHK_STATUS_RET_NOLOG(UpdateRunInfo(input_desc, output_desc)); GELOGD("[%s] Start to launch kernel", node_->GetName().c_str()); std::vector args; - args.insert(args.end(), inputs.begin(), inputs.end()); - args.insert(args.end(), outputs.begin(), outputs.end()); - args.insert(args.end(), workspaces.begin(), workspaces.end()); + for (auto &buffer : input_buffers) { + args.emplace_back(buffer.data); + } + for (auto &buffer : output_buffers) { + args.emplace_back(buffer.data); + } + for (auto &buffer : workspaces_) { + args.emplace_back(buffer); + } if (tiling_buffer_ != nullptr) { GELOGD("[%s] Start to copy tiling info. size = %zu", node_->GetName().c_str(), tiling_data_.size()); @@ -239,6 +322,14 @@ Status TbeOpTask::LaunchKernel(const vector &inputs, const vector(args_.get()); + arg_count = arg_size_ / sizeof(void *); + if (tiling_buffer_ != nullptr) { + --arg_count; + } +} + AiCpuBaseTask::~AiCpuBaseTask() { if (ext_info_addr_dev_ != nullptr) { (void)rtFree(ext_info_addr_dev_); @@ -363,6 +454,29 @@ Status AiCpuBaseTask::UpdateShapeToOutputDesc(const GeShape &shape_new, GeTensor return SUCCESS; } +Status AiCpuBaseTask::UpdateIoAddr(const vector &inputs, const vector &outputs) { + uintptr_t *arg_base = nullptr; + size_t arg_num = 0; + GetIoAddr(arg_base, arg_num); + + // input number and output number was check in ValidateParams + for (size_t i = 0; i < inputs.size(); ++i) { + auto addr = inputs[i].data; + GE_CHECK_NOTNULL(addr); + GELOGD("AICpuTask input[%zu] addr = %p", i, addr); + *arg_base++ = reinterpret_cast(addr); + } + + for (size_t i = 0; i < outputs.size(); ++i) { + auto addr = outputs[i].data; + GE_CHECK_NOTNULL(addr); + GELOGD("AICpuTask output[%zu] addr = %p", i, addr); + *arg_base++ = reinterpret_cast(addr); + } + + return SUCCESS; +} + AiCpuTask::~AiCpuTask() { FreeHbm(args_); FreeHbm(io_addr_); @@ 
-384,12 +498,14 @@ AiCpuTask::~AiCpuTask() { } } -const void *AiCpuTask::GetIOAddr() const { return io_addr_; } - Status AiCpuTask::LaunchKernel(rtStream_t stream) { GELOGD("Start to launch kernel. task = %s", this->op_type_.c_str()); - auto ret = rtMemcpyAsync(workspace_addr_, task_info_.size(), task_info_.data(), task_info_.size(), - RT_MEMCPY_HOST_TO_DEVICE_EX, stream); + auto ret = rtMemcpyAsync(io_addr_, + io_addr_size_, + io_addr_host_.data(), + io_addr_host_.size() * sizeof(void *), + RT_MEMCPY_HOST_TO_DEVICE_EX, + stream); if (ret != RT_ERROR_NONE) { GELOGE(RT_FAILED, "rtMemcpyAsync workspace data failed. ret = %d, task = %s", ret, this->op_type_.c_str()); return RT_FAILED; @@ -538,40 +654,6 @@ Status AiCpuTask::UpdateShapeAndDataByResultSummary(vector &output return SUCCESS; } -Status AiCpuTask::SetIO(const vector &inputs, vector &outputs) { - vector io_addrs; - io_addrs.reserve(num_inputs_ + num_outputs_); - for (size_t i = 0; i < num_inputs_; ++i) { - GE_CHECK_NOTNULL(inputs[i]); - GELOGD("AiCpuTask input[%zu] addr = %p", i, inputs[i]); - io_addrs.emplace_back(reinterpret_cast(inputs[i])); - } - - if (unknown_type_ != DEPEND_COMPUTE) { - for (size_t i = 0; i < num_outputs_; ++i) { - GE_CHECK_NOTNULL(outputs[i]); - GELOGD("AiCpuTask output[%zu] addr = %p", i, outputs[i]); - io_addrs.emplace_back(reinterpret_cast(outputs[i])); - } - } else { - for (size_t i = 0; i < num_outputs_; ++i) { - void *summary_addr = output_summary_[i]; - io_addrs.emplace_back(reinterpret_cast(summary_addr)); - } - } - - if (!io_addrs.empty()) { - auto *dst_io_addr = const_cast(reinterpret_cast(io_addr_)); - GE_CHK_RT_RET(rtMemcpy(dst_io_addr, - sizeof(uint64_t) * io_addrs.size(), - &io_addrs[0], - sizeof(uint64_t) * io_addrs.size(), - RT_MEMCPY_HOST_TO_DEVICE)); - GE_CHECK_NOTNULL(dst_io_addr); - }; - return SUCCESS; -} - Status AiCpuTask::InitForSummaryAndCopy() { if (unknown_type_ != DEPEND_COMPUTE || num_outputs_ == 0) { GELOGI("Unknown_type is %d, output num is %d.", 
unknown_type_, num_outputs_); @@ -643,17 +725,17 @@ Status AiCpuTask::LaunchKernel(const std::vector &input_desc, std::vector &output_buffers, rtStream_t stream) { GE_CHK_STATUS_RET_NOLOG(UpdateExtInfo(input_desc, output_desc, stream)); - std::vector inputs; - std::vector outputs; - for (auto &buffer : input_buffers) { - inputs.emplace_back(buffer.data); - } - for (auto &buffer : output_buffers) { - outputs.emplace_back(buffer.data); + if (unknown_type_ == DEPEND_COMPUTE) { + std::vector summary_buffers; + for (size_t i = 0; i < num_outputs_; ++i) { + summary_buffers.emplace_back(output_summary_[i], sizeof(aicpu::FWKAdapter::ResultSummary), false); + } + GE_CHK_STATUS_RET_NOLOG(UpdateIoAddr(input_buffers, summary_buffers)); + } else { + GE_CHK_STATUS_RET_NOLOG(UpdateIoAddr(input_buffers, output_buffers)); } - GE_CHK_STATUS_RET_NOLOG(SetIO(inputs, outputs)); - GE_CHK_STATUS_RET_NOLOG(LaunchKernel(stream)); + GE_CHK_STATUS_RET_NOLOG(LaunchKernel(stream)); if (unknown_type_ == DEPEND_SHAPE_RANGE) { GE_CHK_RT_RET(rtStreamSynchronize(stream)); GE_CHK_STATUS_RET_NOLOG(UpdateOutputShape(output_desc)); @@ -665,6 +747,17 @@ Status AiCpuTask::LaunchKernel(const std::vector &input_desc, return SUCCESS; } +Status AiCpuTask::UpdateArgTable(const SingleOpModelParam ¶m) { + auto addresses = BuildTaskUtils::GetAddresses(op_desc_, param, false); + io_addr_host_ = BuildTaskUtils::JoinAddresses(addresses); + return SUCCESS; +} + +void AiCpuTask::GetIoAddr(uintptr_t *&arg_base, size_t &arg_count) { + arg_base = reinterpret_cast(io_addr_host_.data()); + arg_count = io_addr_host_.size(); +} + void AiCpuCCTask::SetKernelArgs(std::unique_ptr args, size_t arg_size) { args_ = std::move(args); arg_size_ = arg_size; @@ -676,9 +769,7 @@ void AiCpuCCTask::SetSoName(const std::string &so_name) { so_name_ = so_name; } void AiCpuCCTask::SetkernelName(const std::string &kernel_Name) { kernel_name_ = kernel_Name; } -void AiCpuCCTask::SetIoAddr(void *io_addr) { io_addr_ = io_addr; } - -const void 
*AiCpuCCTask::GetIOAddr() const { return io_addr_; } +void AiCpuCCTask::SetIoAddr(uintptr_t *io_addr) { io_addr_ = io_addr; } const void *AiCpuCCTask::GetArgs() const { return args_.get(); } @@ -701,12 +792,6 @@ Status AiCpuCCTask::LaunchKernel(rtStream_t stream) { return ret; } GELOGD("Invoke rtCpuKernelLaunch succeeded"); - - size_t input_size = op_desc_->GetInputsSize(); - size_t output_size = op_desc_->GetOutputsSize(); - uint64_t *io_addr = reinterpret_cast(io_addr_); - std::vector io_addrs (io_addr, io_addr + input_size + output_size); - SetIoAddrsForDump(io_addrs); auto status = OpenDump(stream); if (status != SUCCESS) { GELOGE(status, "Open dump failed in the aicpucc single op %s", this->kernel_name_.c_str()); @@ -721,24 +806,9 @@ Status AiCpuCCTask::LaunchKernel(const std::vector &input_desc, std::vector &output_desc, std::vector &output_buffers, rtStream_t stream) { - GE_CHK_BOOL_RET_STATUS(unknown_type_ != DEPEND_COMPUTE, FAILED, - "AiCpuCCTask unknown type[%d] is depend compute, it's not supported now.", - unknown_type_); - GE_CHK_STATUS_RET_NOLOG(UpdateExtInfo(input_desc, output_desc, stream)); - - size_t arg_index = 0; - auto *task_io_addr = reinterpret_cast(io_addr_); - GE_CHECK_NOTNULL(task_io_addr); - for (auto &input : input_buffers) { - task_io_addr[arg_index++] = reinterpret_cast(input.data); - } - for (auto &output : output_buffers) { - task_io_addr[arg_index++] = reinterpret_cast(output.data); - } - + GE_CHK_STATUS_RET_NOLOG(UpdateIoAddr(input_buffers, output_buffers)); GE_CHK_STATUS_RET_NOLOG(LaunchKernel(stream)); - if (unknown_type_ == DEPEND_SHAPE_RANGE) { GE_CHK_RT_RET(rtStreamSynchronize(stream)); GE_CHK_STATUS_RET_NOLOG(UpdateOutputShape(output_desc)); @@ -746,4 +816,9 @@ Status AiCpuCCTask::LaunchKernel(const std::vector &input_desc, return SUCCESS; } + +void AiCpuCCTask::GetIoAddr(uintptr_t *&arg_base, size_t &arg_count) { + arg_base = io_addr_; + arg_count = io_addr_num_; +} } // namespace ge diff --git a/ge/single_op/task/op_task.h 
b/ge/single_op/task/op_task.h index 65c77800..e2122b6f 100644 --- a/ge/single_op/task/op_task.h +++ b/ge/single_op/task/op_task.h @@ -32,64 +32,46 @@ #include "init/gelib.h" namespace ge { -enum OpTaskType { - OP_TASK_TBE = 0, - OP_TASK_AICPU, - OP_TASK_AICPUCC, - OP_TASK_INVALID, -}; - +class StreamResource; +struct SingleOpModelParam; class OpTask { public: OpTask() = default; virtual ~OpTask() = default; virtual Status LaunchKernel(rtStream_t stream) = 0; virtual Status UpdateRunInfo(const vector &input_desc, - const vector &output_desc) { - return UNSUPPORTED; - } - virtual Status LaunchKernel(const std::vector &inputs, - const std::vector &outputs, - const std::vector &workspaces, - rtStream_t stream) { - return UNSUPPORTED; - } - virtual OpTaskType GetOpTaskType() = 0; - virtual const void *GetIOAddr() const = 0; - const vector &GetWorkspaceSizes() const; - void SetWorkspaceSizes(const vector &workspace_sizes); + const vector &output_desc); + virtual Status UpdateArgTable(const SingleOpModelParam ¶m); + void SetModelArgs(std::string model_name, uint32_t model_id); + Status GetProfilingArgs(std::string &model_name, std::string &op_name, uint32_t &model_id, uint32_t &block_dim); const OpDescPtr &GetOpdesc() const {return op_desc_;} Status OpenDump(rtStream_t stream); - void SetIoAddrsForDump(const vector &io_addrs_for_dump) { - io_addrs_for_dump_ = io_addrs_for_dump; - } + virtual void GetIoAddr(uintptr_t *&arg_base, size_t &arg_count) = 0; virtual Status LaunchKernel(const std::vector &input_desc, const std::vector &input_buffers, std::vector &output_desc, std::vector &output_buffers, - rtStream_t stream) { - return UNSUPPORTED; - } + rtStream_t stream); - private: - std::vector workspace_sizes_; protected: DumpProperties dump_properties_; DumpOp dump_op_; OpDescPtr op_desc_; - std::vector io_addrs_for_dump_; + std::string model_name_; + uint32_t model_id_ = 0; + uint32_t block_dim_ = 1; }; class TbeOpTask : public OpTask { public: ~TbeOpTask() override; 
Status LaunchKernel(rtStream_t stream) override; - OpTaskType GetOpTaskType() override { - return OP_TASK_TBE; - } - const void *GetIOAddr() const override { - return nullptr; - } + Status LaunchKernel(const std::vector &input_desc, + const std::vector &input_buffers, + std::vector &output_desc, + std::vector &output_buffers, + rtStream_t stream) override; + void GetIoAddr(uintptr_t *&arg_base, size_t &arg_count) override; void SetSmDesc(void *sm_desc); void SetStubFunc(const std::string &name, const void *stub_func); void SetKernelArgs(std::unique_ptr &&args, size_t arg_size, uint32_t block_dim, const OpDescPtr &op_desc); @@ -97,31 +79,29 @@ class TbeOpTask : public OpTask { Status UpdateRunInfo(const vector &input_desc, const vector &output_desc) override; - Status LaunchKernel(const vector &inputs, - const vector &outputs, - const vector &workspaces, - rtStream_t stream) override; - const void *GetArgs() const; size_t GetArgSize() const; const std::string &GetStubName() const; void EnableDynamicSupport(const NodePtr &node, void *tiling_buffer, size_t max_tiling_size); private: + friend class SingleOpModel; static Status UpdateTensorDesc(const GeTensorDesc &src_tensor, GeTensorDesc &dst_tensor); Status UpdateNodeByShape(const vector &input_desc, const vector &output_desc); + Status AllocateWorkspaces(const std::vector &workspace_sizes); const void *stub_func_ = nullptr; std::unique_ptr args_; size_t arg_size_ = 0; - uint32_t block_dim_ = 1; void *sm_desc_ = nullptr; std::string stub_name_; + StreamResource *stream_resource_ = nullptr; void *tiling_buffer_ = nullptr; uint32_t max_tiling_size_ = 0; std::string tiling_data_; + std::vector workspaces_; NodePtr node_; }; @@ -129,9 +109,10 @@ class AiCpuBaseTask : public OpTask { public: AiCpuBaseTask() = default; ~AiCpuBaseTask() override; - const UnknowShapeOpType GetUnknownType() const { return unknown_type_; } + UnknowShapeOpType GetUnknownType() const { return unknown_type_; } protected: + Status 
UpdateIoAddr(const std::vector &inputs, const std::vector &outputs); Status SetExtInfoAndType(const std::string &kernel_ext_info, uint64_t kernel_id); Status UpdateExtInfo(const std::vector &input_desc, @@ -154,10 +135,8 @@ class AiCpuTask : public AiCpuBaseTask { ~AiCpuTask() override; Status LaunchKernel(rtStream_t stream) override; - OpTaskType GetOpTaskType() override { - return OP_TASK_AICPU; - } - const void *GetIOAddr() const override; + Status UpdateArgTable(const SingleOpModelParam ¶m) override; + void GetIoAddr(uintptr_t *&arg_base, size_t &arg_count) override; Status LaunchKernel(const std::vector &input_desc, const std::vector &input_buffers, @@ -167,8 +146,6 @@ class AiCpuTask : public AiCpuBaseTask { Status SetMemCopyTask(const domi::KernelExDef &kernel_def); private: - Status SetIO(const vector &inputs, vector &outputs); - // for copy task. Status InitForSummaryAndCopy(); Status UpdateShapeAndDataByResultSummary(vector &output_desc, @@ -184,27 +161,31 @@ class AiCpuTask : public AiCpuBaseTask { friend class AiCpuTaskBuilder; void *workspace_addr_ = nullptr; std::string task_info_; - // device addr + // device addr void *args_ = nullptr; size_t arg_size_ = 0; std::string op_type_; // device addr void *io_addr_ = nullptr; + size_t io_addr_size_ = 0; + + // host addr + std::vector io_addr_host_; bool dynamic_flag_ = false; // for copy task - void *copy_task_args_buf_; - void *copy_workspace_buf_; + void *copy_task_args_buf_ = nullptr; + void *copy_workspace_buf_ = nullptr; std::vector output_summary_; std::vector output_summary_host_; - void *copy_ioaddr_dev_; + void *copy_ioaddr_dev_ = nullptr; - void *copy_input_release_flag_dev_; - void *copy_input_data_size_dev_; - void *copy_input_src_dev_; - void *copy_input_dst_dev_; + void *copy_input_release_flag_dev_ = nullptr; + void *copy_input_data_size_dev_ = nullptr; + void *copy_input_src_dev_ = nullptr; + void *copy_input_dst_dev_ = nullptr; vector out_shape_hbm_; uint64_t kernel_id_ = 0; @@ -218,13 
+199,12 @@ class AiCpuCCTask : public AiCpuBaseTask { AiCpuCCTask &operator=(const AiCpuCCTask &) = delete; Status LaunchKernel(rtStream_t stream) override; - OpTaskType GetOpTaskType() override { return OP_TASK_AICPUCC; } - const void *GetIOAddr() const override; + void GetIoAddr(uintptr_t *&arg_base, size_t &arg_count) override; const void *GetArgs() const; void SetKernelArgs(std::unique_ptr args, size_t arg_size); void SetSoName(const std::string &so_name); void SetkernelName(const std::string &kernel_Name); - void SetIoAddr(void *io_addr); + void SetIoAddr(uintptr_t *io_addr); size_t GetArgSize() const; Status LaunchKernel(const std::vector &input_desc, @@ -239,9 +219,9 @@ private: std::string kernel_name_; std::unique_ptr args_; size_t arg_size_ = 0; - uint32_t block_dim_ = 1; void *sm_desc_ = nullptr; - void *io_addr_ = nullptr; + uintptr_t *io_addr_ = nullptr; + size_t io_addr_num_ = 0; bool is_custom_ = false; uint32_t dump_flag_ = RT_KERNEL_DEFAULT; }; diff --git a/ge/single_op/task/tbe_task_builder.cc b/ge/single_op/task/tbe_task_builder.cc index e06a08c6..594352aa 100644 --- a/ge/single_op/task/tbe_task_builder.cc +++ b/ge/single_op/task/tbe_task_builder.cc @@ -173,7 +173,8 @@ Status TbeTaskBuilder::RegisterKernel(TbeOpTask &task, const SingleOpModelParam auto tbe_kernel = GetTbeKernel(op_desc_); if (tbe_kernel == nullptr) { - GELOGE(ACL_ERROR_GE_INTERNAL_ERROR, "OP EXT ATTR NAME TBE_KERNEL not found. op = %s", op_desc_->GetName().c_str()); + GELOGE(ACL_ERROR_GE_INTERNAL_ERROR, "OP EXT ATTR NAME TBE_KERNEL not found. op = %s", + op_desc_->GetName().c_str()); return ACL_ERROR_GE_INTERNAL_ERROR; } diff --git a/ge/stub/gen_stubapi.py b/ge/stub/gen_stubapi.py index f2a6a287..1476d505 100644 --- a/ge/stub/gen_stubapi.py +++ b/ge/stub/gen_stubapi.py @@ -1,3 +1,10 @@ +#!/usr/bin/python3.7 +# -*- coding: UTF-8 -*- +#------------------------------------------------------------------- +# Purpose: +# Copyright 2020 Huawei Technologies Co., Ltd. All rights reserved. 
+#------------------------------------------------------------------- + import os import re import sys @@ -64,7 +71,7 @@ max_code_len_per_line = 100 when DEBUG on """ white_list_for_debug = ["attr_value.h", "operator.h", "tensor.h", "graph.h", "operator_factory.h", - "ge_ir_build.h", "ge_api.h", "ge_prof.h", "tensorflow_parser.h", "caffe_parser.h"] + "ge_ir_build.h", "ge_api.h", "tensorflow_parser.h", "caffe_parser.h"] include_dir_key_words = ["ge", "graph", "parser"] DEBUG = True diff --git a/inc/external/acl/acl_base.h b/inc/external/acl/acl_base.h index debadcfd..c1341d59 100644 --- a/inc/external/acl/acl_base.h +++ b/inc/external/acl/acl_base.h @@ -223,6 +223,29 @@ ACL_FUNC_VISIBILITY aclDataBuffer *aclCreateDataBuffer(void *data, size_t size); */ ACL_FUNC_VISIBILITY aclError aclDestroyDataBuffer(const aclDataBuffer *dataBuffer); +/** + * @ingroup AscendCL + * @brief update new data of aclDataBuffer + * + * @param dataBuffer [OUT] pointer to aclDataBuffer + * @li The old data need to be released by the user, otherwise it may occur memory leak leakage + * call aclGetDataBufferAddr interface to get old data address + * call aclrtFree interface to release memory + * + * @param data [IN] pointer to new data + * @li Need to be managed by the user, + * call aclrtMalloc interface to apply for memory, + * call aclrtFree interface to release memory + * + * @param size [IN] size of data in bytes + * + * @retval ACL_SUCCESS The function is successfully executed. 
+ * @retval OtherValues Failure + * + * @see aclrtMalloc | aclrtFree | aclGetDataBufferAddr + */ +ACL_FUNC_VISIBILITY aclError aclUpdateDataBuffer(aclDataBuffer *dataBuffer, void *data, size_t size); + /** * @ingroup AscendCL * @brief get data address from aclDataBuffer @@ -547,6 +570,19 @@ ACL_FUNC_VISIBILITY void *aclGetTensorDescAddress(const aclTensorDesc *desc); */ ACL_FUNC_VISIBILITY aclError aclSetTensorDynamicInput(aclTensorDesc *desc, const char *dynamicInputName); +/** + * @ingroup AscendCL + * @brief Set const data specified by the tensor description + * + * @param desc [OUT] pointer to the instance of aclTensorDesc + * @param dataBuffer [IN] pointer to the const databuffer + * @param length [IN] the length of const databuffer + * + * @retval ACL_SUCCESS The function is successfully executed. + * @retval OtherValues Failure + */ +ACL_FUNC_VISIBILITY aclError aclSetTensorConst(aclTensorDesc *desc, void *dataBuffer, size_t length); + /** * @ingroup AscendCL * @brief an interface for users to output APP logs diff --git a/inc/external/acl/acl_prof.h b/inc/external/acl/acl_prof.h index bfb8a68b..65c55290 100644 --- a/inc/external/acl/acl_prof.h +++ b/inc/external/acl/acl_prof.h @@ -33,11 +33,11 @@ extern "C" { typedef enum { ACL_AICORE_ARITHMATIC_THROUGHPUT = 0, - ACL_AICORE_PIPELINE = 1, - ACL_AICORE_SYNCHRONIZATION = 2, - ACL_AICORE_MEMORY = 3, - ACL_AICORE_INTERNAL_MEMORY = 4, - ACL_AICORE_STALL = 5, + ACL_AICORE_ARITHMETIC_UTILIZATION = 0, + ACL_AICORE_PIPE_UTILIZATION = 1, + ACL_AICORE_MEMORY_BANDWIDTH = 2, + ACL_AICORE_L0B_AND_WIDTH = 3, + ACL_AICORE_RESOURCE_CONFLICT_RATIO = 4, ACL_AICORE_NONE = 0xFF } aclprofAicoreMetrics; @@ -290,6 +290,32 @@ ACL_FUNC_VISIBILITY uint64_t aclprofGetOpDuration(const void *opInfo, size_t opI */ ACL_FUNC_VISIBILITY size_t aclprofGetModelId(const void *opInfo, size_t opInfoLen, uint32_t index); +/** + * @ingroup AscendCL + * @brief get cube ops from subscription data + * + * @param opInfo [IN] pointer to subscription data 
+ * @param opInfoLen [IN] memory size of subscription data + * @param index [IN] index of op array in opInfo + * + * @retval cube ops of subscription data + * @retval 0 for failed + */ +ACL_FUNC_VISIBILITY uint64_t aclprofGetOpCubeOps(const void *opInfo, size_t opInfoLen, uint32_t index); + +/** + * @ingroup AscendCL + * @brief get vector ops from subscription data + * + * @param opInfo [IN] pointer to subscription data + * @param opInfoLen [IN] memory size of subscription data + * @param index [IN] index of op array in opInfo + * + * @retval vector ops of subscription data + * @retval 0 for failed + */ +ACL_FUNC_VISIBILITY uint64_t aclprofGetOpVectorOps(const void *opInfo, size_t opInfoLen, uint32_t index); + #ifdef __cplusplus } #endif diff --git a/inc/external/acl/error_codes/rt_error_codes.h b/inc/external/acl/error_codes/rt_error_codes.h index 2dd2c70c..73d9564b 100644 --- a/inc/external/acl/error_codes/rt_error_codes.h +++ b/inc/external/acl/error_codes/rt_error_codes.h @@ -46,6 +46,7 @@ static const int32_t ACL_ERROR_RT_INVALID_MEMORY_TYPE = 107016; // invalid static const int32_t ACL_ERROR_RT_FEATURE_NOT_SUPPROT = 207000; // feature not support static const int32_t ACL_ERROR_RT_MEMORY_ALLOCATION = 207001; // memory allocation error static const int32_t ACL_ERROR_RT_MEMORY_FREE = 207002; // memory free error +static const int32_t ACL_ERROR_RT_AICORE_OVER_FLOW = 207003; // aicore over flow static const int32_t ACL_ERROR_RT_INTERNEL_ERROR = 507000; // runtime internel error static const int32_t ACL_ERROR_RT_TS_ERROR = 507001; // ts internel error diff --git a/inc/external/acl/ops/acl_dvpp.h b/inc/external/acl/ops/acl_dvpp.h index 32a21e91..1a0f582d 100644 --- a/inc/external/acl/ops/acl_dvpp.h +++ b/inc/external/acl/ops/acl_dvpp.h @@ -130,6 +130,23 @@ enum acldvppChannelMode { DVPP_CHNMODE_VPC = 1, DVPP_CHNMODE_JPEGD = 2, DVPP_CHN // Supported Border Type enum acldvppBorderType { BORDER_CONSTANT = 0, BORDER_REPLICATE, BORDER_REFLECT, BORDER_REFLECT_101 }; +// 
Venc parameter type +enum aclvencChannelDescParamType { + ACL_VENC_THREAD_ID_UINT64 = 0, + ACL_VENC_CALLBACK_PTR, + ACL_VENC_PIXEL_FORMAT_UINT32, + ACL_VENC_ENCODE_TYPE_UINT32, + ACL_VENC_PIC_WIDTH_UINT32, + ACL_VENC_PIC_HEIGHT_UINT32, + ACL_VENC_KEY_FRAME_INTERVAL_UINT32, + ACL_VENC_BUF_ADDR_PTR, + ACL_VENC_BUF_SIZE_UINT32, + ACL_VENC_RC_MODE_UINT32, + ACL_VENC_SRC_RATE_UINT32, + ACL_VENC_MAX_BITRATE_UINT32, + ACL_VENC_MAX_IP_PROP_UINT32 +}; + /** * @ingroup AscendCL * @brief alloc device memory for dvpp. @@ -1037,6 +1054,21 @@ ACL_FUNC_VISIBILITY aclError aclvencSetChannelDescSrcRate(aclvencChannelDesc *ch */ ACL_FUNC_VISIBILITY aclError aclvencSetChannelDescMaxBitRate(aclvencChannelDesc *channelDesc, uint32_t maxBitRate); +/** + * @ingroup AscendCL + * @brief Set venc parameter for venc channel desc. + * + * @param channelDesc [OUT] venc channel desc + * @param paramType [IN] parameter type + * @param length [IN] parameter length + * @param param [IN] pointer to parameter value + * + * @retval ACL_SUCCESS for success, other for failure + */ +ACL_FUNC_VISIBILITY aclError aclvencSetChannelDescParam(aclvencChannelDesc *channelDesc, + aclvencChannelDescParamType paramType, size_t length, + const void *param); + /** * @ingroup AscendCL * @brief Get output buffer address for venc channel desc. @@ -1170,6 +1202,23 @@ ACL_FUNC_VISIBILITY uint32_t aclvencGetChannelDescSrcRate(const aclvencChannelDe */ ACL_FUNC_VISIBILITY uint32_t aclvencGetChannelDescMaxBitRate(const aclvencChannelDesc *channelDesc); +/** + * @ingroup AscendCL + * + * @brief Get venc parameter for venc channel desc. 
+ * + * @param channelDesc [IN] venc channel desc + * @param paramType [IN] parameter type + * @param length [IN] parameter length + * @param paramRetSize [OUT] pointer to parameter real length + * @param param [OUT] pointer to parameter value + * + * @retval ACL_SUCCESS for success, other for failure + */ +ACL_FUNC_VISIBILITY aclError aclvencGetChannelDescParam(const aclvencChannelDesc *channelDesc, + aclvencChannelDescParamType paramType, size_t length, + size_t *paramRetSize, void *param); + /** * @ingroup AscendCL * @brief get forced restart of I-frame interval from config diff --git a/inc/external/ge/ge_api_types.h b/inc/external/ge/ge_api_types.h index 374a816a..cce17f93 100644 --- a/inc/external/ge/ge_api_types.h +++ b/inc/external/ge/ge_api_types.h @@ -369,6 +369,7 @@ static const char *const OP_BANK_PATH = ge::OP_BANK_PATH_FLAG.c_str(); static const char *const OP_DEBUG_LEVEL = ge::OP_DEBUG_LEVEL.c_str(); // for interface: aclgrphBuildModel +#ifdef __GNUC__ const std::set ir_builder_suppported_options = {INPUT_FORMAT, INPUT_SHAPE, OP_NAME_MAP, @@ -424,6 +425,7 @@ const std::set global_options = {CORE_TYPE, DEBUG_DIR, OP_COMPILER_CACHE_DIR, OP_COMPILER_CACHE_MODE}; +#endif } // namespace ir_option } // namespace ge diff --git a/inc/external/ge/ge_ir_build.h b/inc/external/ge/ge_ir_build.h index 778ec21d..182c0444 100644 --- a/inc/external/ge/ge_ir_build.h +++ b/inc/external/ge/ge_ir_build.h @@ -24,9 +24,9 @@ #include "graph/ge_error_codes.h" namespace { -#define IR_MAJOR_VERSION (int(1)) -#define IR_MINOR_VERSION (int(0)) -#define IR_PATCH_VERSION (int(0)) +const int IR_MAJOR_VERSION = 1; +const int IR_MINOR_VERSION = 0; +const int IR_PATCH_VERSION = 0; } // namespace namespace ge { @@ -121,5 +121,20 @@ graphStatus aclgrphInferShapeAndType(ge::Graph &graph); * @retval OtherValues Failure */ graphStatus aclgrphDumpGraph(const ge::Graph &graph, const char *file, const size_t len); + +/** + * @ingroup AscendCL + * @brief create single op graph + * + * @param 
op_type[IN] the op_type + * @param inputs[IN] the inputdesc + * @param outputs[IN] the outputdesc + * @param graph[OUT] the graph + * @retval GRAPH_SUCCESS The function is successfully executed. + * @retval OtherValues Failure + */ +graphStatus aclgrphGenerateForOp(const AscendString &op_type, const std::vector &inputs, + const std::vector &outputs, Graph &graph); + }; // namespace ge #endif // INC_EXTERNAL_GE_IR_BUILD_H_ diff --git a/inc/external/ge/ge_prof.h b/inc/external/ge/ge_prof.h deleted file mode 100644 index 658cea76..00000000 --- a/inc/external/ge/ge_prof.h +++ /dev/null @@ -1,102 +0,0 @@ -/** - * Copyright 2019-2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef INC_EXTERNAL_GE_GE_PROF_H_ -#define INC_EXTERNAL_GE_GE_PROF_H_ - -#include -#include -#include - -#include "ge/ge_api_error_codes.h" - -namespace ge { -enum ProfDataTypeConfig { - kProfTaskTime = 0x0002, - kProfAiCoreMetrics = 0x0004, - kProfAicpuTrace = 0x0008, - kProfTrainingTrace = 0x0800, - kProfHcclTrace = 0x1000 -}; - -enum ProfilingAicoreMetrics { - kAicoreArithmaticThroughput = 0, - kAicorePipeline = 1, - kAicoreSynchronization = 2, - kAicoreMemory = 3, - kAicoreInternalMemory = 4, - kAicoreStall = 5 -}; - -typedef struct ProfAicoreEvents ProfAicoreEvents; -typedef struct aclgrphProfConfig aclgrphProfConfig; - -/// -/// @ingroup AscendCL -/// @brief Initialize the profiling and set profiling configuration path -/// @param [in] profiler_path: configuration path of profiling -/// @param [in] length: length of configuration path -/// @return Status result of function -/// -Status aclgrphProfInit(const char *profiler_path, uint32_t length); - -/// -/// @ingroup AscendCL -/// @brief Finalize profiling -/// @return Status result of function -/// -Status aclgrphProfFinalize(); - -/// -/// @ingroup AscendCL -/// @brief Create data of type aclgrphProfConfig -/// @param [in] deviceid_list: device id list -/// @param [in] device_nums: device numbers -/// @param [in] aicore_metrics: type of aicore metrics -/// @param [in] aicore_events: pointer to aicore events be reserved, only support NULL now -/// @param [in] data_type_config: modules need profiling -/// @return Status result of function -/// -aclgrphProfConfig *aclgrphProfCreateConfig(uint32_t *deviceid_list, uint32_t device_nums, - ProfilingAicoreMetrics aicore_metrics, ProfAicoreEvents *aicore_events, - uint64_t data_type_config); - -/// -/// @ingroup AscendCL -/// @brief Destroy data of type aclgrphProfConfig -/// @param [in] profiler_config: config of profiling -/// @return Status result of function -/// -Status aclgrphProfDestroyConfig(aclgrphProfConfig *profiler_config); - -/// -/// @ingroup 
AscendCL -/// @brief Start profiling of modules which is configured by profiler config -/// @param [in] profiler_config: config of profiling -/// @return Status result of function -/// -Status aclgrphProfStart(aclgrphProfConfig *profiler_config); - -/// -/// @ingroup AscendCL -/// @brief Stop profiling of modules which is configured by profiler config -/// @param [in] profiler_config: config of profiling -/// @return Status result of function -/// -Status aclgrphProfStop(aclgrphProfConfig *profiler_config); -} // namespace ge - -#endif // INC_EXTERNAL_GE_GE_PROF_H_ diff --git a/inc/external/runtime/rt_error_codes.h b/inc/external/runtime/rt_error_codes.h index 2dd2c70c..73d9564b 100644 --- a/inc/external/runtime/rt_error_codes.h +++ b/inc/external/runtime/rt_error_codes.h @@ -46,6 +46,7 @@ static const int32_t ACL_ERROR_RT_INVALID_MEMORY_TYPE = 107016; // invalid static const int32_t ACL_ERROR_RT_FEATURE_NOT_SUPPROT = 207000; // feature not support static const int32_t ACL_ERROR_RT_MEMORY_ALLOCATION = 207001; // memory allocation error static const int32_t ACL_ERROR_RT_MEMORY_FREE = 207002; // memory free error +static const int32_t ACL_ERROR_RT_AICORE_OVER_FLOW = 207003; // aicore over flow static const int32_t ACL_ERROR_RT_INTERNEL_ERROR = 507000; // runtime internel error static const int32_t ACL_ERROR_RT_TS_ERROR = 507001; // ts internel error diff --git a/inc/framework/common/fmk_error_codes.h b/inc/framework/common/fmk_error_codes.h index ec1f26d0..358fca04 100644 --- a/inc/framework/common/fmk_error_codes.h +++ b/inc/framework/common/fmk_error_codes.h @@ -23,10 +23,6 @@ #include "framework/common/fmk_types.h" #include "register/register_error_codes.h" -#define MODID_OMG 1 // OMG module ID -#define MODID_OME 2 // OME module ID -#define MODID_CALIBRATION 3 // Calibration module ID - // Each module uses the following four macros to define error codes: #define DECLARE_ERRORNO_OMG(name, value) DECLARE_ERRORNO(SYSID_FWK, MODID_OMG, name, value) #define 
DECLARE_ERRORNO_OME(name, value) DECLARE_ERRORNO(SYSID_FWK, MODID_OME, name, value) @@ -37,6 +33,10 @@ // Interface for Obtaining Error Code Description #define GET_ERRORNO_STR(value) domi::StatusFactory::Instance()->GetErrDesc(value) +const int MODID_OMG = 1; // OMG module ID +const int MODID_OME = 2; // OME module ID +const int MODID_CALIBRATION = 3; // Calibration module ID + namespace domi { class StatusFactory { public: diff --git a/inc/framework/common/helper/model_helper.h b/inc/framework/common/helper/model_helper.h index 949d8b4c..7867e63d 100644 --- a/inc/framework/common/helper/model_helper.h +++ b/inc/framework/common/helper/model_helper.h @@ -25,6 +25,7 @@ #include "common/types.h" #include "graph/model.h" #include "model/ge_model.h" +#include "model/ge_root_model.h" namespace ge { class ModelHelper { @@ -32,17 +33,22 @@ class ModelHelper { ModelHelper() = default; ~ModelHelper(); - Status SaveToOmModel(const GeModelPtr &ge_model, const SaveParam &save_param, - const std::string &output_file, ge::ModelBufferData &model); - Status SaveOriginalGraphToOmModel(const ge::Graph& graph, const std::string& output_file); + Status SaveToOmModel(const GeModelPtr &ge_model, const SaveParam &save_param, const std::string &output_file, + ge::ModelBufferData &model); + Status SaveToOmRootModel(const GeRootModelPtr &ge_root_model, const SaveParam &save_param, const string &output_file, + ModelBufferData &model, bool is_unknown_shape); + Status SaveOriginalGraphToOmModel(const ge::Graph &graph, const std::string &output_file); Status LoadModel(const ge::ModelData &model_data); - Status GetModelBufferData(ge::ModelBufferData& model); + Status LoadRootModel(const ge::ModelData &model_data); + Status GetModelBufferData(ge::ModelBufferData &model); - const ModelFileHeader* GetFileHeader() const { return file_header_; } + const ModelFileHeader *GetFileHeader() const { return file_header_; } GeModelPtr GetGeModel(); + GeRootModelPtr GetGeRootModel(); void SetSaveMode(bool 
val) { is_offline_ = val; } bool GetSaveMode(void) const { return is_offline_; } + bool GetModelType() const { return is_unknown_shape_model_; }; Status GetBaseNameFromFileName(const std::string &file_name, std::string &base_name); Status GetModelNameFromMergedGraphName(const std::string &graph_name, std::string &model_name); @@ -50,24 +56,46 @@ class ModelHelper { private: bool is_assign_model_ = false; bool is_offline_ = true; - ModelFileHeader* file_header_ = nullptr; + bool is_unknown_shape_model_ = false; + ModelFileHeader *file_header_ = nullptr; // Encrypted model need delete temp model and unencrypted model need not delete model uint8_t *model_addr_tmp_ = nullptr; uint32_t model_len_tmp_ = 0; GeModelPtr model_; + GeRootModelPtr root_model_; - ModelHelper(const ModelHelper&); - ModelHelper& operator=(const ModelHelper&); - Status GenerateGeModel(OmFileLoadHelper& om_load_helper); - Status LoadModelData(OmFileLoadHelper& om_load_helper); - void SetModelToGeModel(ge::Model& model); - Status LoadWeights(OmFileLoadHelper& om_load_helper); - Status LoadTask(OmFileLoadHelper& om_load_helper); - Status LoadTBEKernelStore(OmFileLoadHelper& om_load_helper); - Status LoadCustAICPUKernelStore(OmFileLoadHelper& om_load_helper); + ModelHelper(const ModelHelper &); + ModelHelper &operator=(const ModelHelper &); + Status GenerateGeModel(OmFileLoadHelper &om_load_helper); + Status GenerateGeRootModel(OmFileLoadHelper &om_load_helper); + Status LoadModelData(OmFileLoadHelper &om_load_helper); + void SetModelToGeModel(ge::Model &model); + Status LoadModelData(OmFileLoadHelper &om_load_helper, GeModelPtr &cur_model, size_t mode_index); + Status LoadWeights(OmFileLoadHelper &om_load_helper); + Status LoadWeights(OmFileLoadHelper &om_load_helper, GeModelPtr &cur_model, size_t mode_index); + Status LoadTask(OmFileLoadHelper &om_load_helper); + Status LoadTask(OmFileLoadHelper &om_load_helper, GeModelPtr &cur_model, size_t mode_index); + Status LoadTBEKernelStore(OmFileLoadHelper 
&om_load_helper); + Status LoadTBEKernelStore(OmFileLoadHelper &om_load_helper, GeModelPtr &cur_model, size_t mode_index); + Status LoadCustAICPUKernelStore(OmFileLoadHelper &om_load_helper); + Status LoadCustAICPUKernelStore(OmFileLoadHelper &om_load_helper, GeModelPtr &cur_model, size_t mode_index); Status ReleaseLocalModelData() noexcept; - Status SaveModelPartition(std::shared_ptr& om_file_save_helper, - ModelPartitionType type, const uint8_t* data, size_t size); + Status SaveModelPartition(std::shared_ptr &om_file_save_helper, ModelPartitionType type, + const uint8_t *data, size_t size, size_t model_index); + Status SaveModelDef(shared_ptr &om_file_save_helper, const GeModelPtr &ge_model, + Buffer &model_buffer, size_t model_index = 0); + Status SaveModelWeights(shared_ptr &om_file_save_helper, const GeModelPtr &ge_model, + size_t model_index = 0); + Status SaveModelTbeKernel(shared_ptr &om_file_save_helper, const GeModelPtr &ge_model, + size_t model_index = 0); + Status SaveModelCustAICPU(shared_ptr &om_file_save_helper, const GeModelPtr &ge_model, + size_t model_index = 0); + Status SaveModelTaskDef(shared_ptr &om_file_save_helper, const GeModelPtr &ge_model, + Buffer &task_buffer, size_t model_index = 0); + Status SaveModelHeader(shared_ptr &om_file_save_helper, const GeModelPtr &ge_model, + size_t model_num = 1); + Status SaveAllModelPartiton(shared_ptr &om_file_save_helper, const GeModelPtr &ge_model, + Buffer &model_buffer, Buffer &task_buffer, size_t model_index = 0); }; } // namespace ge #endif // INC_FRAMEWORK_COMMON_HELPER_MODEL_HELPER_H_ diff --git a/inc/framework/common/helper/om_file_helper.h b/inc/framework/common/helper/om_file_helper.h index 4ca54b50..98ad55d7 100644 --- a/inc/framework/common/helper/om_file_helper.h +++ b/inc/framework/common/helper/om_file_helper.h @@ -32,14 +32,14 @@ using std::vector; namespace ge { struct ModelPartition { ModelPartitionType type; - uint8_t* data = 0; + uint8_t *data = 0; uint32_t size = 0; }; struct 
OmFileContext { std::vector partition_datas_; std::vector partition_table_; - uint32_t model_data_len_; + uint32_t model_data_len_ = 0; }; struct SaveParam { @@ -57,15 +57,23 @@ class OmFileLoadHelper { Status Init(uint8_t *model_data, const uint32_t model_data_size); + Status Init(uint8_t *model_data, const uint32_t model_data_size, uint32_t model_num); + Status GetModelPartition(ModelPartitionType type, ModelPartition &partition); + Status GetModelPartition(ModelPartitionType type, ModelPartition &partition, size_t model_index); + OmFileContext context_; + vector model_contexts_; + private: Status CheckModelValid(const ge::ModelData &model) const; Status LoadModelPartitionTable(uint8_t *model_data, const uint32_t model_data_size); + Status LoadModelPartitionTable(uint8_t *model_data, const uint32_t model_data_size, uint32_t model_num); + bool is_inited_{false}; }; @@ -79,15 +87,23 @@ class OmFileSaveHelper { Status AddPartition(ModelPartition &partition); + Status AddPartition(ModelPartition &partition, size_t cur_index); + const std::vector &GetModelPartitions() const; - Status SaveModel(const SaveParam &save_param, const char *target_file, - ge::ModelBufferData& model, bool is_offline = true); + Status SaveModel(const SaveParam &save_param, const char *target_file, ge::ModelBufferData &model, + bool is_offline = true); Status SaveModelToFile(const char *output_file, ge::ModelBufferData &model, bool is_offline = true); + vector model_contexts_; + ModelFileHeader model_header_; OmFileContext context_; + + ModelPartitionTable *GetPartitionTable(size_t cur_ctx_index); + + Status SaveRootModel(const SaveParam &save_param, const char *output_file, ModelBufferData &model, bool is_offline); }; } // namespace ge #endif // INC_FRAMEWORK_COMMON_HELPER_OM_FILE_HELPER_H_ diff --git a/inc/framework/common/op/ge_op_utils.h b/inc/framework/common/op/ge_op_utils.h index 4718b180..aa50c8a1 100644 --- a/inc/framework/common/op/ge_op_utils.h +++ 
b/inc/framework/common/op/ge_op_utils.h @@ -17,7 +17,6 @@ #ifndef INC_FRAMEWORK_COMMON_OP_GE_OP_UTILS_H_ #define INC_FRAMEWORK_COMMON_OP_GE_OP_UTILS_H_ -#include #include #include @@ -32,7 +31,6 @@ #include "proto/insert_op.pb.h" namespace ge { -using namespace cce; using domi::Status; // Add Sub Mul @@ -76,18 +74,7 @@ class OpUtils { static inline bool CheckEnumValid(int32_t check_value, int32_t min_enum_value, int32_t max_enum_value) { return check_value < min_enum_value ? false : (check_value >= max_enum_value ? false : true); } - /// - /// @ingroup domi_omg - /// @brief Convert the dimension of array according to different format - /// @param [in] src_format src_shape format - /// @param [in] src Dimension array to be converted - /// @param [in] dst_format Target format after conversion - /// @param [out] dst Dimension array after conversion - /// @return SUCCESS success - /// @return FAILED fail - /// - static bool ConvertDim(ccTensorFormat_t src_format, const std::vector &src, ccTensorFormat_t dst_format, - std::vector &dst); + /// /// @ingroup domi_omg /// @brief Determine whether to manually calculate the tensor size based on the values of format and dim @@ -97,73 +84,6 @@ class OpUtils { /// @return false skip /// static bool IsComputDimsSize(const int32_t format, const uint32_t real_dim_cnt); - /// - /// @ingroup domi_ome - /// @brief Initialize the tensor description, which is used for input and output. - /// @param [in] model_tensor Tensor information defined by the offline model - /// @param [out] cc_tensor Tensor definition used by CC - /// @return SUCCESS success - /// @return FAILED fail - /// - static Status InitTensorDescriptor(const ge::GeTensorDesc &model_tensor, ccTensorDescriptor_t &cc_tensor); - /// - /// @ingroup domi_ome - /// @brief Initialize the tensor description, which is used for input and output. 
- /// @param [in] model_tensor Tensor information defined by the offline model - /// @param [in] dst_data_type data_type of the target cc_tensor - /// @param [out] cc_tensor Tensor definition used by CC - /// @return SUCCESS success - /// @return FAILED fail - /// - static Status InitTensorDescriptor(const ge::GeTensorDesc &model_tensor, int32_t dst_data_type, - ccTensorDescriptor_t &cc_tensor); - /// - /// @ingroup domi_ome - /// @brief Initialize the tensor description for bias. - /// @param [in] model_tensor Tensor information defined by the offline model - /// @param [out] cc_tensor Tensor definition used by CC - /// @return SUCCESS success - /// @return FAILED fail - /// - /// - static Status InitTensorDescriptor(const ge::GeTensor &model_tensor, ccTensorDescriptor_t &cc_tensor); - /// - /// @ingroup domi_ome - /// @brief Initialize the tensor description for bias. - /// @param [in] model_tensor Tensor information defined by the offline model - /// @param [in] dst_data_type data_type of the target cc_tensor - /// @param [out] cc_tensor Tensor definition used by CC - /// @return SUCCESS success - /// @return FAILED fail - /// - static Status InitTensorDescriptor(const ge::GeTensor &model_tensor, int32_t dst_data_type, - ccTensorDescriptor_t &cc_tensor); - - static Status InitTensorDescriptor(int32_t format, int32_t data_type, const std::vector &dim, - ccTensorDescriptor_t &cc_tensor, uint32_t real_dim_cnt = 4); - /// - /// @ingroup domi_ome - /// @brief Destroys a tensor - /// @param [inout] cc_tensor Tensor definition used by CC - /// - static void DestroyTensorDescriptor(ccTensorDescriptor_t &cc_tensor) noexcept; - - /// - /// @ingroup domi_ome - /// @brief Destroys a tensor - /// @param [inout] cc_filter cc_filter Definition of the filter used by CC - /// - static void DestroyFilterDescriptor(ccFilterDescriptor_t &cc_filter); - - /// - /// @ingroup domi_ome - /// @brief Initializing Filter Description - /// @param [in] model_filter Filter information defined 
in the offline model - /// @param [out] cc_filter Definition of the filter used by CC - /// @return SUCCESS success - /// @return FAILED fail - /// - static Status InitFilterDescriptor(const ge::GeTensor &model_filter, ccFilterDescriptor_t &cc_filter); /// /// @brief Extract AIPP parameters from AttrDefMap and splice them @@ -209,16 +129,7 @@ class OpUtils { /// @param [out] output Data pointer after conversion. The format is HWCK /// static void TransDataKCHW2HWCK(const void *input, int64_t K, int64_t C, int64_t H, int64_t W, void *output); - /// - /// @ingroup domi_omg - /// @brief Initialize the input and output description of the data node which is applied to filter weight in the - /// training network - /// @param [in] model_tensor input and output tensor information - /// @param [out] cc_tensor Tensor in CCE format after conversion - /// - static Status InitFilterTensorDescriptor(const ge::GeTensorDesc &model_tensor, ccFilterDescriptor_t &cc_tensor); - - static void SetTensorDescriptorAllOffsetQuantizeInfo(const GeTensorDesc &tensor, ccTensorDescriptor_t cc_tensor); + static vector GetWeights(const ge::Node &node); static vector GetWeights(ge::ConstNodePtr node); static vector MutableWeights(const ge::Node &node); @@ -228,69 +139,7 @@ class OpUtils { static Status GetShapeDataFromConstTensor(const ConstGeTensorPtr &tensor, DataType type, std::vector &dims); private: - friend class CceTensorDescriptor; static uint32_t GetRealDimCnt(const GeTensorDesc &tensor_desc); }; - -class CceTensorDescriptor; - -using CceTensorDescriptorPtr = std::shared_ptr; - -class CceTensorDescriptor { - public: - explicit CceTensorDescriptor(ccTensorDescriptor_t cc_tensor); - CceTensorDescriptor(const CceTensorDescriptor &) = delete; - CceTensorDescriptor &operator=(const CceTensorDescriptor &) = delete; - - ~CceTensorDescriptor(); - - ccTensorDescriptor_t GetPtr() { return cc_tensor_; } - - /// - /// @brief Initializes the tensor based on shape information. 
- /// @param[in] format data permutation format - /// @param[in] data_type Data Type - /// @param[in] dim dim information - /// @return return code - /// - Status InitTensor(int32_t format, int32_t data_type, const std::vector &dims); - - Status InitTensor(int32_t format, int32_t data_type, const ge::GeShape &shape); - - /// - /// @brief get format of tensor - /// @param[out] format format of the tensor - /// @return return code - /// - Status GetFormat(ccTensorFormat_t *format); - - /// - /// @brief Obtains the size of the tensor. - /// @param[out] size size of Tensor - /// @return return code - /// - Status GetTensorSizeInBytes(uint32_t *size); - - /// - /// @brief transform tensor between 4d(NCHW) and 5d(NC1HWC0) - /// @param [in] xDesc descriptor of input tensor - /// @param [in] x point to input data in host memory - /// @param [in] dataTypeTransmode mode of data type transform - /// @param [in] yDesc descriptor of output tensor - /// @param [in|out] y point to output data in host memory - /// @param [in] ySizeInBytes size of outputData - /// @return return code - /// - static Status TransTensor(const ccTensorDescriptor_t xDesc, const void *x, const CceTensorDescriptorPtr &yDesc, - void *y, uint32_t ySizeInBytes); - - /// - /// @brief CceTensorDescriptor Static Constructor - /// @return CceTensorDescriptor smart pointer - /// - static CceTensorDescriptorPtr Create(); - - ccTensorDescriptor_t cc_tensor_ = nullptr; -}; } // namespace ge #endif // INC_FRAMEWORK_COMMON_OP_GE_OP_UTILS_H_ diff --git a/inc/framework/common/op/op_parser_util.h b/inc/framework/common/op/op_parser_util.h index 49b4350a..43254ca9 100644 --- a/inc/framework/common/op/op_parser_util.h +++ b/inc/framework/common/op/op_parser_util.h @@ -17,7 +17,6 @@ #ifndef INC_FRAMEWORK_COMMON_OP_OP_PARSER_UTIL_H_ #define INC_FRAMEWORK_COMMON_OP_OP_PARSER_UTIL_H_ -#include #include #include #include @@ -31,10 +30,7 @@ const uint32_t NORMAL_OUTPUT_NUM = 1; const uint32_t NORMAL_WORKSPACE_NUM = 0; const 
int32_t NORMAL_1D_DIM_NUM = 1; const int32_t NORMAL_SCALE_DIM_NUM = 0; -const int NORMAL_TENSOR_FORMAT = static_cast(cce::CC_TENSOR_NC1HWC0); const int NORMAL_TENSOR_SIZE = 4; -const int NORMAL_DEVICE_DATA_TYPE = static_cast(cce::CC_DATA_HALF); -const int DEFAULT_POOLING_MODE = static_cast(cce::CC_POOLING_MAX); const uint32_t DEFAULT_REAL_DIM_CNT = 4; // const @@ -183,7 +179,6 @@ const int32_t SSD_DETECTIONOUTPUT_BACKGROUND_LABEL_ID_DEFAULT_VALUE = 0; const float SSD_DETECTIONOUTPUT_NMS_THRESHOLD_DEFAULT_VALUE = 0.3; const int32_t SSD_DETECTIONOUTPUT_TOP_K_DEFAULT_VALUE = 200; const float SSD_DETECTIONOUTPUT_ETA_DEFAULT_VALUE = 1.0; -const int SSD_DETECTIONOUTPUT_CODE_TYPE_DEFAULT_VALUE = static_cast(cce::CC_BOX_CENTER_SIZE); const int32_t SSD_DETECTIONOUTPUT_KEEP_TOP_K_DEFAULT_VALUE = 200; const bool SSD_DETECTIONOUTPUT_VARIANCE_ENCODED_IN_TARGET_DEFAULT_VALUE = false; const float SSD_DETECTIONOUTPUT_CONFIDENCE_THRESHOLD_DEFAULT_VALUE = 0.1; @@ -200,7 +195,6 @@ const float REFINEDET_DETECTIONOUTPUT_NMS_THRESHOLD_DEFAULT_VALUE = 0.3; const int32_t REFINEDET_DETECTIONOUTPUT_TOP_K_DEFAULT_VALUE = 200; const float REFINEDET_DETECTIONOUTPUT_ETA_DEFAULT_VALUE = 1.0; const bool REFINEDET_DETECTIONOUTPUT_VARIANCE_ENCODED_IN_TARGET_DEFAULT_VALUE = false; -const int REFINEDET_DETECTIONOUTPUT_CODE_TYPE_DEFAULT_VALUE = static_cast(cce::CC_BOX_CENTER_SIZE); const int32_t REFINEDET_DETECTIONOUTPUT_KEEP_TOP_K_DEFAULT_VALUE = 200; const float REFINEDET_DETECTIONOUTPUT_CONFIDENCE_THRESHOLD_DEFAULT_VALUE = 0.1; const float REFINEDET_DETECTIONOUTPUT_OBJECTNESS_SCORE_DEFAULT_VALUE = 0; diff --git a/inc/framework/common/profiling/ge_profiling.h b/inc/framework/common/profiling/ge_profiling.h new file mode 100644 index 00000000..e56411c9 --- /dev/null +++ b/inc/framework/common/profiling/ge_profiling.h @@ -0,0 +1,45 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in 
compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef INC_FRAMEWORK_COMMON_GE_PROFILING_H_ +#define INC_FRAMEWORK_COMMON_GE_PROFILING_H_ + +#include "ge/ge_api_error_codes.h" +#include "toolchain/prof_callback.h" + +#define MAX_DEV_NUM (64) +enum ProfCommandHandleType { + kProfCommandhandleInit = 0, + kProfCommandhandleStart, + kProfCommandhandleStop, + kProfCommandhandleFinalize, + kProfCommandhandleModelSubscribe, + kProfCommandhandleModelUnsubscribe +}; + +struct ProfCommandHandleData { + uint64_t profSwitch; + uint32_t devNums; // length of device id list + uint32_t devIdList[MAX_DEV_NUM]; + uint32_t modelId; +}; + +ge::Status RegProfCtrlCallback(MsprofCtrlCallback func); +ge::Status RegProfSetDeviceCallback(MsprofSetDeviceCallback func); +ge::Status RegProfReporterCallback(MsprofReporterCallback func); +ge::Status ProfCommandHandle(ProfCommandHandleType type, void *data, uint32_t len); + +#endif // INC_FRAMEWORK_COMMON_GE_PROFILING_H_ diff --git a/inc/framework/common/profiling/ge_runner_profiling.h b/inc/framework/common/profiling/ge_runner_profiling.h new file mode 100644 index 00000000..d2eff767 --- /dev/null +++ b/inc/framework/common/profiling/ge_runner_profiling.h @@ -0,0 +1,24 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef INC_FRAMEWORK_COMMON_GE_RUNNER_PROFILING_H_ +#define INC_FRAMEWORK_COMMON_GE_RUNNER_PROFILING_H_ + +#include "profiling/ge_profiling.h" + +bool IsInitialize(); + +#endif // INC_FRAMEWORK_COMMON_GE_RUNNER_PROFILING_H_ diff --git a/inc/framework/common/taskdown_common.h b/inc/framework/common/taskdown_common.h new file mode 100644 index 00000000..090e7e26 --- /dev/null +++ b/inc/framework/common/taskdown_common.h @@ -0,0 +1,71 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef INC_FRAMEWORK_COMMON_TASKDOWN_COMMON_H_ +#define INC_FRAMEWORK_COMMON_TASKDOWN_COMMON_H_ + +#include "runtime/rt.h" + +namespace ge { + +const int CC_FUSION_OP_MAX = 32; + +typedef enum tagCcStatus { + CC_STATUS_SUCCESS = 0, /**< succ */ + CC_STATUS_NOT_INITIALIZED = 1, /**< not init */ + CC_STATUS_ALLOC_FAILED = 2, /**< alloc mem failed */ + CC_STATUS_BAD_PARAM = 3, /**< para check failed */ + CC_STATUS_INTERNAL_ERROR = 4, /**< internal error */ + CC_STATUS_KERNEL_ERROR = 5, /**< kernel error */ + CC_STATUS_RUNTIME_ERROR = 6, /**< runtime error */ + CC_STATUS_NOT_SUPPORTED = 7, /**< unsupport error */ + CC_STATUS_INVALID_VALUE = 7, /**< invalid value error for blas*/ + CC_STATUS_RESERVED /**< just for check */ +} ccStatus_t; + +typedef enum tagccKernelType { + CCE_AI_CORE = 0, /* cce aicore */ + CCE_AI_CPU = 1, /* cce aicpu */ + TE = 2, /* te operator*/ + CUSTOMIZED = 3, /* customized operator */ + TE_AI_CORE = 4, /* te aicore operator*/ + TE_AI_CPU = 5, /* te aicpu operator */ + AI_CPU = 6, /* aicpu */ + CUST_AI_CPU = 7, /* custom aicpu*/ + INVALID = 8, /* unknown kernel type */ +} ccKernelType; + +typedef struct tagOpContext { + ccKernelType kernelType; + uint32_t opId; + uint32_t kernelFuncId; + uint32_t opIndex; + uint32_t opCount; + uint32_t opIndex2[CC_FUSION_OP_MAX]; + bool isFlowtable; + uint16_t *argsOffset; + uint32_t argsCount; + uint64_t genDataBaseAddr; + uint64_t genDataBaseSize; + uint64_t genWeightBaseAddr; + uint64_t genWeightBaseSize; + uint64_t genVariableBaseAddr; + uint64_t genVariableBaseSize; + uint64_t l2ctrlSize; +} ccOpContext; +} // namespace ge + +#endif // INC_FRAMEWORK_COMMON_TASKDOWN_COMMON_H_ diff --git a/inc/framework/common/types.h b/inc/framework/common/types.h index 441d0757..99c2ea03 100644 --- a/inc/framework/common/types.h +++ b/inc/framework/common/types.h @@ -529,7 +529,7 @@ REGISTER_OPTYPE_DECLARE(HVDWAIT, "HorovodWait"); // aicpu op for online_infer dynamic_dims REGISTER_OPTYPE_DECLARE(GETDYNAMICDIMS, 
"GetDynamicDims"); -enum InputMode { INPUT = 0, CONST_INPUT}; +enum InputMode { INPUT = 0, CONST_INPUT }; // Definition of the processing status enum of the process module enum ModelProcessState { @@ -605,7 +605,7 @@ static constexpr uint32_t MODEL_FILE_CHECKSUM_LENGTH = 64; /// /// @brief length of the reserved field in the model file header /// -static constexpr uint32_t MODEL_FILE_RESERVED_LENGTH = 79; +static constexpr uint32_t MODEL_FILE_RESERVED_LENGTH = 75; /// /// @ingroup domi_omg @@ -843,9 +843,10 @@ struct ModelFileHeader { uint32_t ops = 0; // Computing power (Kops) uint8_t userdefineinfo[USER_DEFINE_INFO_LENGTH] = {0}; // User-defined information. The value contains 32 characters uint32_t om_ir_version = 0; + uint32_t model_num = 0; uint8_t platform_version[PLATFORM_VERSION_LEN] = {0}; uint8_t platform_type = {0}; - uint8_t reserved[MODEL_FILE_RESERVED_LENGTH] = {0}; // Reserved field 79 + uint8_t reserved[MODEL_FILE_RESERVED_LENGTH] = {0}; // Reserved field 75 }; static constexpr uint8_t TARGET_TYPE_LTTE_8BIT = 0; diff --git a/inc/framework/generator/ge_generator.h b/inc/framework/generator/ge_generator.h index c446b983..e0904965 100644 --- a/inc/framework/generator/ge_generator.h +++ b/inc/framework/generator/ge_generator.h @@ -74,11 +74,22 @@ class GeGenerator { /// @param [in] op_desc: the OP description. /// @param [in] inputs: input tensors. /// @param [in] outputs: output tensors. - /// @param [in] engine_type: specific engine. - /// @param [out] model_buff: model buff of single op. + /// @param [in] engine_type: engine type. + /// @param [out] model_buff: model buff of op. /// @return SUCCESS or FAILED Status BuildSingleOpModel(OpDescPtr &op_desc, const vector &inputs, const vector &outputs, OpEngineType engine_type, ModelBufferData &model_buff); + /// + /// @ingroup ge + /// @brief: Build single Op into model buff. + /// @param [in] op_desc: the OP description. + /// @param [in] inputs: input tensors. + /// @param [in] outputs: output tensors. 
+ /// @param [in] graph_name: graph name. + /// @param [out] graph: graph of single op. + /// @return SUCCESS or FAILED + Status BuildSingleOpGraph(OpDescPtr &op_desc, const vector &inputs, const vector &outputs, + std::string graph_name, Graph &graph); private: Status GenerateModel(const Graph &graph, const string &file_name_prefix, const vector &inputs, diff --git a/metadef b/metadef index 4176fab0..dba83744 160000 --- a/metadef +++ b/metadef @@ -1 +1 @@ -Subproject commit 4176fab0cb2fd4f8794061916878983afb75c8da +Subproject commit dba83744a3ffe3d5f89496e69bb65c50f800c299 diff --git a/parser b/parser index 9e392045..ce574894 160000 --- a/parser +++ b/parser @@ -1 +1 @@ -Subproject commit 9e392045c26a57913b512d0686e1285650b62abe +Subproject commit ce574894f13cd94749d1a3964a13e8c97c20434a diff --git a/tests/depends/error_manager/src/error_manager_stub.cc b/tests/depends/error_manager/src/error_manager_stub.cc index 4f6b6b3d..edf5a487 100644 --- a/tests/depends/error_manager/src/error_manager_stub.cc +++ b/tests/depends/error_manager/src/error_manager_stub.cc @@ -58,7 +58,7 @@ /// @param [in] value: vector parameter value /// void ErrorManager::ATCReportErrMessage(std::string error_code, const std::vector &key, - const std::vector &value) { + const std::vector &value) { } /// diff --git a/tests/depends/hccl/src/hccl_stub.cc b/tests/depends/hccl/src/hccl_stub.cc index 1cc8fdb3..b9b9d4f6 100644 --- a/tests/depends/hccl/src/hccl_stub.cc +++ b/tests/depends/hccl/src/hccl_stub.cc @@ -19,26 +19,26 @@ #include "hccl/hcom.h" HcclResult hcom_all_gather(const char *tag, void *input_count_ptr, void *output_ptr, u64 input_count, - HcclDataType data_type, const char *group, rtStream_t stream) { + HcclDataType data_type, const char *group, rtStream_t stream) { return HCCL_SUCCESS; } HcclResult hcom_broadcast(const char *tag, void *ptr, u64 count, HcclDataType data_type, u32 root, - const char *group, rtStream_t stream) { + const char *group, rtStream_t stream) { return 
HCCL_SUCCESS; } HcclResult hcom_all_reduce(const char *tag, void *input_ptr, void *output_ptr, u64 count, HcclDataType data_type, - HcclReduceOp op, const char *group, rtStream_t stream) { + HcclReduceOp op, const char *group, rtStream_t stream) { return HCCL_SUCCESS; } HcclResult hcom_get_split_strategy(const char *group, const struct model_feature *feature, u32 max_segment_num, - u32 *segment_num, u32 *segment_idx) { + u32 *segment_num, u32 *segment_idx) { return HCCL_SUCCESS; } HcclResult hcom_reduce_scatter(const char *tag, void *input_ptr, void *output_ptr, u64 count, - HcclDataType data_type, HcclReduceOp op, const char *group, rtStream_t stream) { + HcclDataType data_type, HcclReduceOp op, const char *group, rtStream_t stream) { return HCCL_SUCCESS; } diff --git a/tests/depends/runtime/src/runtime_stub.cc b/tests/depends/runtime/src/runtime_stub.cc index 2ab6684d..75eefdd1 100644 --- a/tests/depends/runtime/src/runtime_stub.cc +++ b/tests/depends/runtime/src/runtime_stub.cc @@ -325,7 +325,7 @@ rtError_t rtSetTaskFailCallback(rtTaskFailCallback callback) } rtError_t rtMallocHostSharedMemory(rtMallocHostSharedMemoryIn *in, - rtMallocHostSharedMemoryOut *out) + rtMallocHostSharedMemoryOut *out) { out->ptr = new uint8_t[in->size]; out->devPtr = new uint8_t[in->size]; diff --git a/tests/st/CMakeLists.txt b/tests/st/CMakeLists.txt deleted file mode 100644 index 56babec1..00000000 --- a/tests/st/CMakeLists.txt +++ /dev/null @@ -1,42 +0,0 @@ -# Copyright 2019-2020 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ - -cmake_minimum_required(VERSION 3.0) -set(CMAKE_CXX_STANDARD 11) -project(ge_st CXX C) - -set(CMAKE_CXX_FLAGS "-O1 -fPIC -Wl,-unresolved-symbols=ignore-in-shared-libs") - - -file(GLOB_RECURSE RES50_TRAIN_SRCS RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} - "resnet50/resnet50_train.cc" - "resnet50/common.cc" -) - -include_directories(${GE_SOURCE_DIR}/inc) -include_directories(${GE_SOURCE_DIR}/inc/graph) -include_directories(${GE_SOURCE_DIR}/inc/framework) -include_directories(${GE_SOURCE_DIR}/inc/external) -include_directories(${GE_SOURCE_DIR}/inc/external/ge) -include_directories(${GE_SOURCE_DIR}/inc/external/graph) -include_directories(${GE_SOURCE_DIR}/third_party/fwkacllib/inc) -include_directories(${GE_SOURCE_DIR}/third_party/fwkacllib/inc/ops) -include_directories(/usr/local/HiAI/opp/op_proto/built-in/inc) - -add_executable(st_resnet50_train ${RES50_TRAIN_SRCS}) -target_link_libraries(st_resnet50_train - ${PROTOBUF_LIBRARY} - ge_client_train ge_memory -) \ No newline at end of file diff --git a/tests/st/resnet50/common.cc b/tests/st/resnet50/common.cc deleted file mode 100644 index 674ef926..00000000 --- a/tests/st/resnet50/common.cc +++ /dev/null @@ -1,768 +0,0 @@ -/** - * Copyright 2019-2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include -#include -#include -#include -#include - -#include "common.h" -#include "model.h" - -#define MAX_HEAD_SIZE 50 - -using namespace std; -using namespace ge; - -void update_op_format(Operator ops, Format format) { - printf("set format begin.........\n"); - ge::TensorDesc tensor_desc_x = ops.GetInputDesc("x"); - ge::TensorDesc tensor_desc_y = ops.GetOutputDesc("y"); - Format f_x0 = tensor_desc_x.GetFormat(); - Format f_y0 = tensor_desc_x.GetFormat(); - printf("before set x format:%d \n", f_x0); - printf("before set y format:%d \n", f_y0); - printf("format to be set is :%d \n", format); - tensor_desc_x.SetFormat(format); - tensor_desc_y.SetFormat(format); - ops.UpdateInputDesc("x", tensor_desc_x); - ops.UpdateOutputDesc("y", tensor_desc_y); - Format f_x = tensor_desc_x.GetFormat(); - Format f_y = tensor_desc_y.GetFormat(); - printf("after set x format:%d \n", f_x); - printf("after set y format:%d \n", f_y); -} - -/// getDimInfo: get dim info from data file -/// param: -/// fp: the testing datafile object -/// -/// return : -/// dim_info: array to store the info of the dim in datafile, like [4,3,3,6,3,162(3*3*6*3)],4 is dim size,3,3,6,3 is the -/// dim shape data_size: the size of the testing data including the data file -void getDimInfo(FILE *fp, std::vector &dim_info) { - // get dim info from hisi testing data file - uint32_t *dim_buffer = (uint32_t *)malloc(MAX_HEAD_SIZE * sizeof(uint32_t)); - fread(dim_buffer, sizeof(uint32_t), MAX_HEAD_SIZE, fp); - dim_info.push_back(*dim_buffer); // get dim size - - // get data shape to compute the datasize - uint64_t data_size = 1; - uint32_t i = 1; - for (; i <= dim_info[0]; i++) { - dim_info.push_back(*(dim_buffer + i)); - data_size *= *(dim_buffer + i); - } - dim_info.push_back(data_size); - - free(dim_buffer); -} - -/// readTestDataFile: read test date from hisi .t datafile -/// param: -/// infile: the path of hisi .t datafile -/// return: -/// dim_info: array to store the info of the dim in 
datafile, like [4,3,3,6,3],4 is dim size,3,3,6,3 is the dim shape -void *readTestDataFile(std::string infile, std::vector &dim_info) { - FILE *fp; - fp = fopen(infile.c_str(), "r"); - - if (fp == NULL) { - printf("ERROR: cant't open file %s\n", infile.c_str()); - return NULL; - } else { - getDimInfo(fp, dim_info); - uint64_t data_size = dim_info[dim_info.size() - 1]; - - fclose(fp); - - fp = fopen(infile.c_str(), "r"); - if (fp == NULL) { - printf("ERROR: cant't open file %s\n", infile.c_str()); - return NULL; - } - uint32_t *memory = (uint32_t *)malloc((dim_info[0] + 1 + data_size) * sizeof(uint32_t)); - fread(memory, sizeof(uint32_t), (dim_info[0] + 1 + data_size), fp); - fclose(fp); - return memory + (dim_info[0] + 1); - } -} - -void *readUint8TestDataFile(std::string infile, int size) { - FILE *fp; - fp = fopen(infile.c_str(), "r"); - - if (fp == NULL) { - printf("ERROR: cant't open file %s\n", infile.c_str()); - return NULL; - } - uint8_t *memory = (uint8_t *)malloc((size) * sizeof(uint8_t)); - fread(memory, sizeof(uint8_t), (size), fp); - fclose(fp); - return memory; -} - -/// allclose -/// param: -/// a:compared file a -/// b:compared file b -/// count: the count size which will compare -/// rtol: -/// atol: -/// return: -/// true or false -bool allclose(float *a, float *b, uint64_t count, float rtol = 1e-05, float atol = 1e-08) { - uint32_t i = 0; - - for (; i < count; ++i) { - if (fabs(a[i] - b[i]) > (atol + rtol * fabs(b[i]))) { - printf("compara failed: i= %d, a[i]=%f, b[i]=%f,atol=%f,rtol=%f\n", i, a[i], b[i], atol, rtol); - return false; - } - } - - return true; -} - -/// compFp32WithTData: compare the data with the data in hisi .t file -/// param: -/// actual_output_data: the result of ge -/// expected_data_file: the path of hisi .t result file -/// rtol: -/// atol: -/// return: -/// true of false -bool compFp32WithTData(float *actual_output_data, std::string expected_data_file, float rtol = 1e-05, float atol = 1e-08) { - std::vector dim_info; - float 
*expected_output_data = (float *)readTestDataFile(expected_data_file, dim_info); - - uint32_t i = 1; - uint64_t data_size = 1; - for (; i <= dim_info[0]; i++) { - data_size *= dim_info[i]; - } - return allclose(actual_output_data, expected_output_data, data_size, rtol, atol); -} - -int SwitchDatatype(DataType dt) { - int size = 1; - if (dt == ge::DT_FLOAT) size = 4; - if (dt == ge::DT_INT32) size = 4; - if (dt == ge::DT_FLOAT16) size = 2; - if (dt == ge::DT_INT64) size = 8; - return size; -} - -ge::Tensor genTensor(std::vector tensor_shape, Format format, DataType dt) { - int size = 1; - for (int i = 0; i < tensor_shape.size(); i++) { - size = size * tensor_shape[i]; - } - - int data_type_size = SwitchDatatype(dt); - - size = abs(size * data_type_size); - vector data_value; - - if (size == 0) { - TensorDesc input_tensor_desc = TensorDesc(ge::Shape(tensor_shape), format, dt); - input_tensor_desc.SetRealDimCnt(tensor_shape.size()); - Tensor gen_tensor = Tensor(input_tensor_desc, data_value); - return gen_tensor; - } - for (int i = 0; i < size; i++) { - data_value.push_back(1); - } - TensorDesc input_tensor_desc = TensorDesc(ge::Shape(tensor_shape), format, dt); - input_tensor_desc.SetRealDimCnt(tensor_shape.size()); - Tensor gen_tensor = Tensor(input_tensor_desc, data_value); - return gen_tensor; -} - -ge::Tensor genTensor_withVaule(std::vector tensor_shape, float value) { - int size = 1; - for (int i = 0; i < tensor_shape.size(); i++) { - size = size * tensor_shape[i]; - } - - float *data_value = new float[size]; - for (int i = 0; i < size; i++) { - *(data_value + i) = value; - } - Tensor gen_ge_tensor; - TensorDesc input_tensor_desc = TensorDesc(ge::Shape(tensor_shape), FORMAT_NCHW); - gen_ge_tensor.SetTensorDesc(input_tensor_desc); - gen_ge_tensor.SetData((uint8_t *)data_value, size * 4); - - return gen_ge_tensor; -} - -Tensor genTesnor_Shape_as_data(std::vector tensor_shape) { - Format format = FORMAT_NCHW; - DataType dt = DT_INT32; - int size = 
tensor_shape.size(); - int32_t *tensor_data = new int32_t[size]; - std::cout << "shape tensor size:" << size << endl; - for (int i = 0; i < size; i++) { - *(tensor_data + i) = tensor_shape[i]; - } - - Tensor gen_tensor; - TensorDesc input_tensor_desc = TensorDesc(ge::Shape({size}), FORMAT_NCHW, DT_INT32); - gen_tensor.SetData((uint8_t *)tensor_data, size * GetDatTypeSize(dt)); - gen_tensor.SetTensorDesc(input_tensor_desc); - - return gen_tensor; -} - -/// train_flag is 0 when infer; train_flag is 1 when train; train_flag is 0 default -/// run_mode_path is not 0,1,2 when TBE; run_mode_path is 1 when FE; run_mode_path is 0 default -/// run_mode_path is 2 now when AICPU, ge.enabledlocalFmkop is 1 -ge::Status GEInitialize_api(string train_flag, string run_mode_path) { - ge::Status ret; - if (run_mode_path == "0") { - const std::map config = { - {"device_id", "0,2,4,6"}, - {"rank_table_file", "hccl from csa/paas"}, - {"ge.graphRunMode", train_flag}, - {"ge.aicpuFlag", "1"}, - {"ge.feFlag", "1"}, - {DDK_VERSION_FLAG, "1.60.T17.B830"}, - {"ge.soLoadPath", - "/usr/local/HiAI/runtime/lib64/plugin/opskernel/libfe.so:/usr/local/HiAI/runtime/lib64/plugin/opskernel/" - "libaicpu_plugin.so"}}; - ret = ge::GEInitialize(config); - } else if (run_mode_path == "1") { - const std::map config = { - {"device_id", "0,2,4,6"}, - {"rank_table_file", "hccl from csa/paas"}, - {"ge.graphRunMode", train_flag}, - {"ge.feFlag", "1"}, - {DDK_VERSION_FLAG, "1.60.T17.B830"}, - {TBE_PLUGIN_PATH_FLAG, "/usr/local/HiAI/runtime/lib64/tbe_plugin/bert"}, - {"ge.soLoadPath", "/usr/local/HiAI/runtime/lib64/plugin/opskernel/libfe.so"}}; - ret = ge::GEInitialize(config); - } else if (run_mode_path == "2") { - const std::map config = {{"device_id", "0,2,4,6"}, - {"rank_table_file", "hccl from csa/paas"}, - {"ge.graphRunMode", train_flag}, - {LOCAL_FMKOP_FLAG, "1"}}; - ret = ge::GEInitialize(config); - } else { - const std::map config = { - {"device_id", "0,2,4,6"}, - {"rank_table_file", "hccl from 
csa/paas"}, - {"ge.graphRunMode", train_flag}, - {DDK_VERSION_FLAG, "1.60.T17.B830"}, - {TBE_PLUGIN_PATH_FLAG, "/usr/local/HiAI/runtime/lib64/tbe_plugin/" + run_mode_path}}; - ret = ge::GEInitialize(config); - } - std::cout << "GEInitialize_ret is " << ret << std::endl; - - return ret; -} - -/// train_flag is infer default -/// run_mode: is multi group of [fe,aicpu,bert,deeplabv3,mobilenetv2,single_path_nas,ssd] -/// but bert,deeplabv3,mobilenetv2,single_path_nas,ssd can only set one value from array -/// eg:"fe,aicpu,bert" or "fe", default is “fe” -/// "fe,aicpu,bert" remain open fe aicpu and bert -ge::Status GEInitialize_api_new(string train_flag, string run_mode) { - ge::Status ret; - vector modes; - - char *strs = new char[run_mode.length() + 1]; - strcpy(strs, run_mode.c_str()); - const char *delim = ","; - char *p = strtok(strs, delim); - while (p) { - string s = p; // transform substr to string - modes.push_back(s); // save to result array - p = strtok(NULL, delim); - } - - std::map config = { - {"device_id", "0,2,4,6"}, - {"rank_table_file", "hccl from csa/paas"}, - {DDK_VERSION_FLAG, "1.60.T17.B830"}, - {"ge.opsProtoLibPath", "/usr/local/HiAI/runtime/ops/op_proto/built-in/libopsproto.so"}}; - if (train_flag == "infer") - config.insert(pair("ge.graphRunMode", "0")); - else if (train_flag == "train") - config.insert(pair("ge.graphRunMode", "1")); - else - std::cout << "GeInitialize give the error param" << std::endl; - - for (int i = 0; i < modes.size(); i++) { - if (modes[i] == "fe") { - config.insert(pair("ge.feFlag", "1")); - if (config.find("ge.soLoadPath") != config.end()) { - config["ge.soLoadPath"] = - "/usr/local/HiAI/runtime/lib64/plugin/opskernel/libfe.so:/usr/local/HiAI/runtime/lib64/plugin/opskernel/" - "libaicpu_plugin.so:/usr/local/HiAI/runtime/lib64/plugin/opskernel/libge_local_engine.so:/usr/local/HiAI/" - "runtime/lib64/plugin/opskernel/librts_engine.so"; - } else { - config.insert(pair( - "ge.soLoadPath", - 
"/usr/local/HiAI/runtime/lib64/plugin/opskernel/libfe.so:/usr/local/HiAI/runtime/lib64/plugin/opskernel/" - "libge_local_engine.so:/usr/local/HiAI/runtime/lib64/plugin/opskernel/librts_engine.so")); - } - } else if (modes[i] == "aicpu") { - config.insert(pair("ge.aicpuFlag", "1")); - if (config.find("ge.soLoadPath") != config.end()) { - config["ge.soLoadPath"] = - "/usr/local/HiAI/runtime/lib64/plugin/opskernel/libfe.so:/usr/local/HiAI/runtime/lib64/plugin/opskernel/" - "libaicpu_plugin.so:/usr/local/HiAI/runtime/lib64/plugin/opskernel/libge_local_engine.so:/usr/local/HiAI/" - "runtime/lib64/plugin/opskernel/librts_engine.so"; - } else { - config.insert(pair( - "ge.soLoadPath", - "/usr/local/HiAI/runtime/lib64/plugin/opskernel/libaicpu_plugin.so:/usr/local/HiAI/runtime/lib64/plugin/" - "opskernel/libge_local_engine.so:/usr/local/HiAI/runtime/lib64/plugin/opskernel/librts_engine.so")); - } - } else if (modes[i] == "bert" || modes[i] == "deeplabv3" || modes[i] == "mobilenetv2" || - modes[i] == "single_path_nas" || modes[i] == "ssd") { - config.insert(pair(TBE_PLUGIN_PATH_FLAG, "/usr/local/HiAI/runtime/lib64/tbe_plugin/" + modes[i])); - } else if (modes[i] == "plugin") { - - } else - std::cout << "GeInitialize give the error param" << std::endl; - } - ret = ge::GEInitialize(config); - - std::cout << "GEInitialize_ret is " << ret << std::endl; - - return ret; -} - -ge::Status GEFinalize_api() { - ge::Status ret = ge::GEFinalize(); - std::cout << "GEFinalize ret is " << ret << std::endl; - - return ret; -} - -/// set train_flag -/// if run_mode_path is "fe" remain FE process; "fe,plugin" is FE and TBE plugin process -/// "aicpu" is open aicpu plugin -int RunGraph_initData(Graph &graph, string op_name, map> attr_test, string train_flag, - string run_mode_path) { - std::map options = {{RUN_FLAG, "1"}}; - uint32_t graph_id = 0; - - ge::Status ret = GEInitialize_api_new(train_flag, run_mode_path); - EXPECT_EQ(ret, ge::SUCCESS); - - ge::Session *session = new 
Session(options); - ASSERT_TRUE(session != NULL); - - std::vector input; - if (attr_test.find("input1") != attr_test.end()) { - Tensor input_tensor = genTensor(attr_test["input1"]); - input.push_back(input_tensor); - } - if (attr_test.find("input2") != attr_test.end()) { - Tensor input_tensor = genTensor(attr_test["input2"]); - input.push_back(input_tensor); - } - if (attr_test.find("input3") != attr_test.end()) { - Tensor input_tensor = genTensor(attr_test["input3"]); - input.push_back(input_tensor); - } - std::vector output; - - ret = session->AddGraph(graph_id, graph); - EXPECT_EQ(ret, ge::SUCCESS); - if (train_flag == "1") { - setenv("GE_TRAIN", "1", true); - ret = session->RunGraph(graph_id, input, output); - setenv("GE_TRAIN", "0", true); - } else { - ret = session->RunGraph(graph_id, input, output); - } - delete session; - GEFinalize_api(); - - if (ret != ge::SUCCESS) { - std::cout << " run graph failed" << std::endl; - return -1; - } else { - return 0; - } -} - -ge::Status session_add_and_run_graph(ge::Session *session, uint32_t graph_id, Graph &graph, std::vector inputs, - std::vector &outputs) { - ge::Status ret = session->AddGraph(graph_id, graph); - EXPECT_EQ(ret, ge::SUCCESS); - ret = session->RunGraph(graph_id, inputs, outputs); - - return ret; -} - -ge::Session *create_session() { - // Init session - std::map options = {{"a", "b"}, {TRAIN_FLAG, "1"}}; - ge::Session *session = new Session(options); - ASSERT_TRUE(session != NULL); - - return session; -} - -ge::Session *create_aipp_session() { - // Init session - std::map options = {{"a", "b"}, {TRAIN_FLAG, "1"}, {"ge.insertOpFile", "/root/host/ge/aipp.cfg"}}; - ge::Session *session = new Session(options); - ASSERT_TRUE(session != NULL); - - return session; -} - -int buildCheckPointGraph(Graph &graph, map variables) { - std::vector inputs{}; - std::vector outputs{}; - - for (map::iterator it = variables.begin(); it != variables.end(); ++it) { - auto var = op::Variable(string(it->first)); - 
var.update_output_desc_y(it->second); - inputs.push_back(var); - graph.AddOp(var); - } - - auto save = op::Save().create_dynamic_input_tensors(inputs.size()); - for (int i = 0; i < inputs.size(); i++) { - save.set_dynamic_input_tensors(i, inputs[i]); - } - - graph.SetInputs(inputs).SetOutputs(outputs); - return 0; -} - -int buildInitGraph(Graph &graph, std::vector desc_var, std::vector name_var, - std::vector values_var) { - std::vector inputs{}; - std::vector outputs{}; - - for (int i = 0; i < desc_var.size(); i++) { - desc_var[i].SetRealDimCnt(desc_var[i].GetShape().GetDimNum()); - auto tensor_data = genTensor_withVaule(desc_var[i].GetShape().GetDims(), values_var[i]); - auto var_constant = op::Constant().set_attr_value(tensor_data); - var_constant.update_output_desc_y(desc_var[i]); - - auto var_init = op::Variable(string(name_var[i])); - var_init.update_output_desc_y(desc_var[i]); - auto var_assign = op::Assign().set_input_ref(var_init).set_input_value(var_constant); - inputs.push_back(var_init); - } - graph.SetInputs(inputs).SetOutputs(outputs); - return 0; -} - -int buildInitGraph_other_dataType(Graph &graph, std::vector desc_var, std::vector name_var) { - std::vector inputs{}; - std::vector outputs{}; - - for (int i = 0; i < desc_var.size(); i++) { - desc_var[i].SetRealDimCnt(desc_var[i].GetShape().GetDimNum()); - auto tensor_data = genTensor(desc_var[i].GetShape().GetDims(), desc_var[i].GetFormat(), desc_var[i].GetDataType()); - auto var_constant = op::Constant().set_attr_value(tensor_data); - var_constant.update_output_desc_y(desc_var[i]); - - auto var_init = op::Variable(string(name_var[i])); - var_init.update_output_desc_y(desc_var[i]); - auto var_assign = op::Assign().set_input_ref(var_init).set_input_value(var_constant); - inputs.push_back(var_init); - - graph.AddOp(var_constant); - graph.AddOp(var_init); - graph.AddOp(var_assign); - } - graph.SetInputs(inputs).SetOutputs(outputs); - return 0; -} - -bool build_multi_input_multi_output_graph(Graph 
&graph) { - auto data1 = op::Data("Data1").set_attr_index(0); - auto data2 = op::Data("Data2").set_attr_index(1); - - vector dim_info; - - auto relu1 = op::Relu("Relu1").set_input_x(data1); - auto relu2 = op::Relu("Relu2").set_input_x(data2); - - auto eltwise = op::Eltwise("Eltwise") - .create_dynamic_input_x(2) - .set_dynamic_input_x(0, relu1) - .set_dynamic_input_x(1, relu2) - .set_attr_N(2) - .set_attr_mode(1) - .set_attr_coeff({1, 1}); - - auto eltwise1 = op::Eltwise("Eltwise1") - .create_dynamic_input_x(2) - .set_dynamic_input_x(0, eltwise) - .set_dynamic_input_x(1, eltwise) - .set_attr_N(2) - .set_attr_mode(1) - .set_attr_coeff({1, 1}); - - auto eltwise2 = op::Eltwise("Eltwise2") - .create_dynamic_input_x(2) - .set_dynamic_input_x(0, eltwise) - .set_dynamic_input_x(1, eltwise) - .set_attr_N(2) - .set_attr_mode(1) - .set_attr_coeff({1, 1}); - - std::vector inputs{data1, data2}; - std::vector outputs{eltwise1, eltwise2}; - graph.SetInputs(inputs).SetOutputs(outputs); - return true; -} - -void build_big_graph(Graph &graph, map> attr) { - auto data = op::Data("Data").set_attr_index(0); - auto weight = op::Const("weight1").set_attr_value(genTensor(attr["weight"])); - vector weight_shape(attr["weight"].begin(), attr["weight"].end()); - TensorDesc weight_desc(ge::Shape(weight_shape), FORMAT_NCHW, DT_FLOAT); - weight.update_output_desc_y(weight_desc); - auto conv_1 = op::Conv2D("conv1").set_input_x(data).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - - auto conv_2 = op::Conv2D("conv2").set_input_x(conv_1).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_3 = op::Conv2D("conv3").set_input_x(conv_2).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_4 = op::Conv2D("conv4").set_input_x(conv_3).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_5 = 
op::Conv2D("conv5").set_input_x(conv_4).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_6 = op::Conv2D("conv6").set_input_x(conv_5).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_7 = op::Conv2D("conv7").set_input_x(conv_6).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_8 = op::Conv2D("conv8").set_input_x(conv_7).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_9 = op::Conv2D("conv9").set_input_x(conv_8).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_10 = op::Conv2D("conv10").set_input_x(conv_9).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_11 = op::Conv2D("conv11").set_input_x(conv_10).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_12 = op::Conv2D("conv12").set_input_x(conv_11).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_13 = op::Conv2D("conv13").set_input_x(conv_12).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_14 = op::Conv2D("conv14").set_input_x(conv_13).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_15 = op::Conv2D("conv15").set_input_x(conv_14).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_16 = op::Conv2D("conv16").set_input_x(conv_15).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_17 = op::Conv2D("conv17").set_input_x(conv_16).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_18 = op::Conv2D("conv18").set_input_x(conv_17).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_19 = 
op::Conv2D("conv19").set_input_x(conv_18).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_20 = op::Conv2D("conv20").set_input_x(conv_19).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_21 = op::Conv2D("conv21").set_input_x(conv_20).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_22 = op::Conv2D("conv22").set_input_x(conv_21).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_23 = op::Conv2D("conv23").set_input_x(conv_22).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_24 = op::Conv2D("conv24").set_input_x(conv_23).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_25 = op::Conv2D("conv25").set_input_x(conv_24).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_26 = op::Conv2D("conv26").set_input_x(conv_25).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_27 = op::Conv2D("conv27").set_input_x(conv_26).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_28 = op::Conv2D("conv28").set_input_x(conv_27).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_29 = op::Conv2D("conv29").set_input_x(conv_28).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_30 = op::Conv2D("conv30").set_input_x(conv_29).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_31 = op::Conv2D("conv31").set_input_x(conv_30).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_32 = op::Conv2D("conv32").set_input_x(conv_31).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_33 = 
op::Conv2D("conv33").set_input_x(conv_32).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_34 = op::Conv2D("conv34").set_input_x(conv_33).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_35 = op::Conv2D("conv35").set_input_x(conv_34).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_36 = op::Conv2D("conv36").set_input_x(conv_35).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_37 = op::Conv2D("conv37").set_input_x(conv_36).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_38 = op::Conv2D("conv38").set_input_x(conv_37).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_39 = op::Conv2D("conv39").set_input_x(conv_38).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_40 = op::Conv2D("conv40").set_input_x(conv_39).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_41 = op::Conv2D("conv41").set_input_x(conv_40).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_42 = op::Conv2D("conv42").set_input_x(conv_41).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_43 = op::Conv2D("conv43").set_input_x(conv_42).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_44 = op::Conv2D("conv44").set_input_x(conv_43).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_45 = op::Conv2D("conv45").set_input_x(conv_44).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_46 = op::Conv2D("conv46").set_input_x(conv_45).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_47 = 
op::Conv2D("conv47").set_input_x(conv_46).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_48 = op::Conv2D("conv48").set_input_x(conv_47).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_49 = op::Conv2D("conv49").set_input_x(conv_48).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_50 = op::Conv2D("conv50").set_input_x(conv_49).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_51 = op::Conv2D("conv51").set_input_x(conv_50).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_52 = op::Conv2D("conv52").set_input_x(conv_51).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_53 = op::Conv2D("conv53").set_input_x(conv_52).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_54 = op::Conv2D("conv54").set_input_x(conv_53).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_55 = op::Conv2D("conv55").set_input_x(conv_54).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_56 = op::Conv2D("conv56").set_input_x(conv_55).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_57 = op::Conv2D("conv57").set_input_x(conv_56).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_58 = op::Conv2D("conv58").set_input_x(conv_57).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_59 = op::Conv2D("conv59").set_input_x(conv_58).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_60 = op::Conv2D("conv60").set_input_x(conv_59).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_61 = 
op::Conv2D("conv61").set_input_x(conv_60).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_62 = op::Conv2D("conv62").set_input_x(conv_61).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_63 = op::Conv2D("conv63").set_input_x(conv_62).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_64 = op::Conv2D("conv64").set_input_x(conv_63).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_65 = op::Conv2D("conv65").set_input_x(conv_64).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_66 = op::Conv2D("conv66").set_input_x(conv_65).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_67 = op::Conv2D("conv67").set_input_x(conv_66).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_68 = op::Conv2D("conv68").set_input_x(conv_67).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_69 = op::Conv2D("conv69").set_input_x(conv_68).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_70 = op::Conv2D("conv70").set_input_x(conv_69).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_71 = op::Conv2D("conv71").set_input_x(conv_70).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_72 = op::Conv2D("conv72").set_input_x(conv_71).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_73 = op::Conv2D("conv73").set_input_x(conv_72).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_74 = op::Conv2D("conv74").set_input_x(conv_73).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_75 = 
op::Conv2D("conv75").set_input_x(conv_74).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_76 = op::Conv2D("conv76").set_input_x(conv_75).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_77 = op::Conv2D("conv77").set_input_x(conv_76).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_78 = op::Conv2D("conv78").set_input_x(conv_77).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_79 = op::Conv2D("conv79").set_input_x(conv_78).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_80 = op::Conv2D("conv80").set_input_x(conv_79).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_81 = op::Conv2D("conv81").set_input_x(conv_80).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_82 = op::Conv2D("conv82").set_input_x(conv_81).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_83 = op::Conv2D("conv83").set_input_x(conv_82).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_84 = op::Conv2D("conv84").set_input_x(conv_83).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_85 = op::Conv2D("conv85").set_input_x(conv_84).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_86 = op::Conv2D("conv86").set_input_x(conv_85).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_87 = op::Conv2D("conv87").set_input_x(conv_86).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_88 = op::Conv2D("conv88").set_input_x(conv_87).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_89 = 
op::Conv2D("conv89").set_input_x(conv_88).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_90 = op::Conv2D("conv90").set_input_x(conv_89).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_91 = op::Conv2D("conv91").set_input_x(conv_80).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_92 = op::Conv2D("conv92").set_input_x(conv_91).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_93 = op::Conv2D("conv93").set_input_x(conv_92).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_94 = op::Conv2D("conv94").set_input_x(conv_93).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_95 = op::Conv2D("conv95").set_input_x(conv_94).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_96 = op::Conv2D("conv96").set_input_x(conv_95).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_97 = op::Conv2D("conv97").set_input_x(conv_96).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_98 = op::Conv2D("conv98").set_input_x(conv_97).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_99 = op::Conv2D("conv99").set_input_x(conv_98).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_100 = op::Conv2D("conv100").set_input_x(conv_99).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_101 = op::Conv2D("conv101").set_input_x(conv_100).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_102 = op::Conv2D("conv102").set_input_x(conv_101).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_103 = 
op::Conv2D("conv103").set_input_x(conv_102).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_104 = op::Conv2D("conv104").set_input_x(conv_103).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_105 = op::Conv2D("conv105").set_input_x(conv_104).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_106 = op::Conv2D("conv106").set_input_x(conv_105).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_107 = op::Conv2D("conv107").set_input_x(conv_106).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_108 = op::Conv2D("conv108").set_input_x(conv_107).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_109 = op::Conv2D("conv109").set_input_x(conv_108).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_110 = op::Conv2D("conv110").set_input_x(conv_109).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_111 = op::Conv2D("conv111").set_input_x(conv_110).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_112 = op::Conv2D("conv112").set_input_x(conv_111).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_113 = op::Conv2D("conv113").set_input_x(conv_112).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_114 = op::Conv2D("conv114").set_input_x(conv_113).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_115 = op::Conv2D("conv115").set_input_x(conv_114).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_116 = op::Conv2D("conv116").set_input_x(conv_115).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_117 = 
op::Conv2D("conv117").set_input_x(conv_116).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_118 = op::Conv2D("conv118").set_input_x(conv_117).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_119 = op::Conv2D("conv119").set_input_x(conv_118).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_120 = op::Conv2D("conv120").set_input_x(conv_119).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_121 = op::Conv2D("conv121").set_input_x(conv_120).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_122 = op::Conv2D("conv122").set_input_x(conv_121).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_123 = op::Conv2D("conv123").set_input_x(conv_122).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_124 = op::Conv2D("conv124").set_input_x(conv_123).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_125 = op::Conv2D("conv125").set_input_x(conv_124).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_126 = op::Conv2D("conv126").set_input_x(conv_125).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_127 = op::Conv2D("conv127").set_input_x(conv_126).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_128 = op::Conv2D("conv128").set_input_x(conv_127).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_129 = op::Conv2D("conv129").set_input_x(conv_128).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - auto conv_130 = op::Conv2D("conv130").set_input_x(conv_129).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); - - std::vector inputs{data}; - std::vector 
outputs{conv_130}; - graph.SetInputs(inputs).SetOutputs(outputs); -} - -int GetDatTypeSize(DataType dt) { - int dailation = 1; - if (dt == ge::DT_FLOAT) - dailation = 4; - else if (dt == ge::DT_FLOAT16) - dailation = 2; - else if (dt == ge::DT_INT16) - dailation = 2; - else if (dt == ge::DT_UINT16) - dailation = 2; - else if (dt == ge::DT_INT32) - dailation = 4; - else if (dt == ge::DT_UINT32) - dailation = 4; - else if (dt == ge::DT_INT64) - dailation = 8; - else if (dt == ge::DT_UINT64) - dailation = 8; - else if (dt == ge::DT_INT8) - dailation = 1; - - return dailation; -} - -int buildConvGraph_new(Graph &graph, std::vector desc_var, std::vector name_var, int flag, - Format format) { - auto data_x_shape = op::Data("xShape").set_attr_index(0); - auto var = op::Variable(name_var[0]); - auto var1 = op::Variable(name_var[1]); //add one seat of ApplyMomentum() - auto label1 = op::Variable(name_var[2]); //add one seat of ApplyMomentum() - auto conv2dgrad = op::Conv2DBackpropFilterD("output_1"); - auto test2 = op::ApplyMomentum(); - - var.update_output_desc_y(desc_var[0]); - var1.update_output_desc_y(desc_var[1]); - label1.update_output_desc_y(desc_var[2]); - - graph.AddOp(var); - graph.AddOp(var1); - graph.AddOp(label1); - - auto conv2d = op::Conv2D().set_input_x(data_x_shape).set_input_filter(var).set_attr_strides({1, 1, 1, 1}).set_attr_pads({0,0,0,0}); - update_op_format(conv2d, format); - ge::TensorDesc tensor_desc_w = conv2d.GetInputDesc("filter"); - tensor_desc_w.SetFormat(format); - conv2d.UpdateInputDesc("filter", tensor_desc_w); - - if (flag >= 1) { - conv2dgrad.set_input_x(data_x_shape) - .set_attr_filter_size(desc_var[0].GetShape().GetDims()) - .set_input_out_backprop(conv2d) - .set_attr_strides({1, 1, 1, 1}) - .set_attr_pads({0, 0, 0, 0}); - update_op_format(conv2dgrad, format); - graph.AddOp(conv2dgrad); - } - if (flag >= 2) { - // set conv2dgrad var - test2.set_input_accum(var1) - .set_input_grad(conv2dgrad) - .set_input_lr(label1) - 
.set_input_momentum(label1) - .set_input_var(var); - graph.AddOp(test2); - } - - std::vector inputs{data_x_shape}; // set all val - std::vector outputs{conv2d}; - graph.SetInputs(inputs).SetOutputs(outputs); - graph.AddOp(conv2d); - - return 0; -} - -/// load bin data_fail -/// input_path: path of bin data_file -/// shapes: the shape of Tensor -/// ft: the format of Tensor -/// dt: the dataType of Tensor -Tensor load_variable_input_data(string input_path, std::vector shapes, Format ft, DataType dt) { - vector dim_info1; - - uint8_t *input_data = (uint8_t *)readTestDataFile(input_path, dim_info1); // common.h - TensorDesc input_tensor_desc = TensorDesc(ge::Shape(shapes), ft, dt); - input_tensor_desc.SetRealDimCnt(shapes.size()); - Tensor input_tensor = Tensor(input_tensor_desc, input_data, GetDatTypeSize(dt) * dim_info1[dim_info1[0] + 1]); - return input_tensor; -} diff --git a/tests/st/resnet50/common.h b/tests/st/resnet50/common.h deleted file mode 100644 index 75805db7..00000000 --- a/tests/st/resnet50/common.h +++ /dev/null @@ -1,102 +0,0 @@ -/** - * Copyright 2019-2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef ST_RESNET50_GE_COMMON_H_ -#define ST_RESNET50_GE_COMMON_H_ -#include "common/ge_inner_error_codes.h" -#include "utils/tensor_utils.h" - -#define MY_USER_GE_LOGI(...) GE_LOG_INFO(1, __VA_ARGS__) -#define MY_USER_GE_LOGW(...) GE_LOG_WARN(1, __VA_ARGS__) -#define MY_USER_GE_LOGE(...) 
GE_LOG_ERROR(1, 3, __VA_ARGS__) - -#ifndef USER_GE_LOGI -#define USER_GE_LOGI MY_USER_GE_LOGI -#endif // USER_GE_LOGI - -#ifndef USER_GE_LOGW -#define USER_GE_LOGW MY_USER_GE_LOGW -#endif // USER_GE_LOGW - -#ifndef USER_GE_LOGE -#define USER_GE_LOGE MY_USER_GE_LOGE -#endif // USER_GE_LOGE - -/// train_flag is 0 when infer, train_flag is 1 when train.this param is set for RunGranph_readData() and -/// RunGraph_initData() -#define TRAIN_FLAG_INFER "infer" -#define TRAIN_FLAG_TRAIN "train" - -#include -#include -#include -#include -#include -#include -#include - -#include "ge_api.h" -#include "graph.h" -#include "ptest.h" -#include "ops/all_ops.h" -using namespace std; -using namespace ge; - -// read bin file and compile result -void update_op_format(Operator ops, Format format = ge::FORMAT_NCHW); -void getDimInfo(FILE *fp, std::vector &dim_info); -void *readTestDataFile(std::string infile, std::vector &dim_info); -void *readUint8TestDataFile(std::string infile, int size); -bool allclose(float *a, float *b, uint64_t count, float rtol, float atol); -bool compFp32WithTData(float *actual_output_data, std::string expected_data_file, float rtol, float atol); -Tensor load_variable_input_data(string input_path, std::vector shapes, Format ft = ge::FORMAT_NCHW, - DataType dt = ge::DT_FLOAT); -// constructor Tensor -int GetDatTypeSize(DataType dt); -ge::Tensor genTensor(std::vector tensor_shape, Format format = ge::FORMAT_NCHW, DataType dt = ge::DT_FLOAT); -ge::Tensor genTensor_withVaule(std::vector tensor_shape, float value = 1); -Tensor genTesnor_Shape_as_data(std::vector tensor_shape); -// Init GE -ge::Status GEInitialize_api(string train_flag = "0", string run_mode_path = "0"); -ge::Status GEInitialize_api_new(string train_flag = "infer", string run_mode = "fe"); -ge::Status GEFinalize_api(); -// constructor session and build graph -ge::Session *create_aipp_session(); -ge::Session *create_session(); -ge::Status session_add_and_run_graph(ge::Session *session, uint32_t 
graphId, Graph &graph, std::vector inputs, - std::vector &outputs); - -// common interface for infer -int RunGraph_initData(Graph &graph, string op_name, map> attr_test, - string train_flag = "infer", string run_mode_path = "fe"); -void Inputs_load_Data(string op_name, std::vector &input, map> attr_test, - Format format = ge::FORMAT_NCHW, DataType dt = ge::DT_FLOAT); -bool comparaData(std::vector &output, string op_name, map> attr_test); -int RunGraph_readData(Graph &graph, string op_name, map> attr_test, - string train_flag = "infer", string run_mode_path = "fe", Format format = ge::FORMAT_NCHW, - DataType dt = ge::DT_FLOAT); - -// common interface for train -int buildCheckPointGraph(Graph &graph, map variables); -int buildInitGraph(Graph &graph, std::vector desc_var, std::vector name_var, - std::vector values_var); -int buildInitGraph_other_dataType(Graph &graph, std::vector desc_var, std::vector name_var); - -bool build_multi_input_multi_output_graph(Graph &graph); -void build_big_graph(Graph &graph, map> attr); -int buildConvGraph_new(Graph &graph, std::vector desc_var, std::vector name_var, int flag = 2); - -#endif // ST_RESNET50_GE_COMMON_H_ diff --git a/tests/st/resnet50/ptest.h b/tests/st/resnet50/ptest.h deleted file mode 100644 index 568969f8..00000000 --- a/tests/st/resnet50/ptest.h +++ /dev/null @@ -1,225 +0,0 @@ -/** - * Copyright 2019-2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef ST_RESNET50_PTEST_H_ -#define ST_RESNET50_PTEST_H_ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace ptest { -class assertion_error : public std::exception { - public: - const char *what() const throw() { return "Assertion Exception"; } -}; - -class TestFixture { - public: - virtual void SetUp() {} - virtual void TearDown() {} - void Run() { _func(); } - void BindFunction(std::function function) { _func = function; } - void SetName(const std::string &name) { _name = name; } - std::string Name() const { return _name; } - virtual ~TestFixture() {} - - private: - std::function _func; - std::string _name; -}; - -enum TestResult { SUCCESS, FAILED, UNAVAILABLE, UNKNOWN, NOCASEFOUND }; - -class TestManager { - public: - static TestManager &GetSingleton() { - static TestManager instance; - return instance; - } - void RegisterTest(const std::string &name, TestFixture *fixture) { _testfixtures[name] = fixture; } - - const std::string GetRunningTestcaseName() const { return _running_testcase_name; } - - const std::list GetAllTestNames() const { - std::list result; - for (auto &t : _testfixtures) { - result.push_back(t.first); - } - return result; - } - - TestResult RunTest(const std::string &name) { - if (_testfixtures.find(name) == _testfixtures.end()) { - return NOCASEFOUND; - } - - _running_testcase_name = name; - - do { - SetTestResult(name, UNKNOWN); - _testfixtures[name]->SetUp(); - if (_testresults[name] == FAILED) { - _testresults[name] = UNAVAILABLE; - break; - } - SetTestResult(name, SUCCESS); - try { - _testfixtures[name]->Run(); - } catch (assertion_error &e) { - // Do nothing as the error has been handled by the TestManager. 
- } - _testfixtures[name]->TearDown(); - } while (0); - - return _testresults[name]; - } - void SetTestResult(const std::string &name, TestResult result) { _testresults[name] = result; } - TestResult GetTestResult(const std::string &name) { return _testresults[name]; } - - private: - std::map _testfixtures; - std::map _testresults; - std::string _running_testcase_name; -}; - -class TestFixtureRegister { - public: - TestFixtureRegister(const std::string &name, TestFixture *fixture, std::function function) { - fixture->BindFunction(function); - fixture->SetName(name); - TestManager::GetSingleton().RegisterTest(name, fixture); - } -}; -} // namespace ptest - -#define _STR(x) #x -#define _EMPTY_NAMESPACE - -#define _TEST(NAMESPACE, FIXTURECLASS, TESTNAME, CASENAME) \ - void g_func_##TESTNAME##_##CASENAME(void); \ - NAMESPACE::FIXTURECLASS g_fixture_##TESTNAME##_##CASENAME; \ - ptest::TestFixtureRegister g_register_##TESTNAME##_##CASENAME( \ - _STR(TESTNAME##_##CASENAME), &g_fixture_##TESTNAME##_##CASENAME, g_func_##TESTNAME##_##CASENAME); \ - void g_func_##TESTNAME##_##CASENAME(void) - -#define TEST(TESTNAME, CASENAME) _TEST(ptest, TestFixture, TESTNAME, CASENAME) - -#define TEST_F(TESTFIXTURE, CASENAME) _TEST(_EMPTY_NAMESPACE, TESTFIXTURE, TESTFIXTURE, CASENAME) - -#define EXPECT_TRUE(X) \ - do { \ - if (!(X)) { \ - std::string test_name = ptest::TestManager::GetSingleton().GetRunningTestcaseName(); \ - ptest::TestManager::GetSingleton().SetTestResult(test_name, ptest::FAILED); \ - std::cerr << #X << "Expectation Failed\n" \ - << "Testcase Name: " << test_name << "\n" \ - << "File: " __FILE__ << "\tLine:" << __LINE__ << std::endl; \ - } \ - } while (0); - -// With the macro definition ensures that the compiler can detect compiler warning. -#define Max_Log_Len 1024 -#define PRINT_ERR(lpszFormat, ...) 
\ - do { \ - char szTmpBuf[Max_Log_Len + 1] = {0}; \ - snprintf(szTmpBuf, Max_Log_Len, lpszFormat, ##__VA_ARGS__); \ - std::cerr << szTmpBuf << std::endl; \ - } while (0) - -// Increase the content of print error messages and error to facilitate rapid analysis -#define EXPECT_TRUE_C(X, ERR_TYPE, format, ...) \ - do { \ - if (!(X)) { \ - std::string test_name = ptest::TestManager::GetSingleton().GetRunningTestcaseName(); \ - ptest::TestManager::GetSingleton().SetTestResult(test_name, ptest::FAILED); \ - std::cerr << #X << " Expectation Failed." \ - << "Testcase Name: " << test_name << " File:" __FILE__ << " Line:" << __LINE__ << std::endl; \ - PRINT_ERR("[" ERR_TYPE "]" format, ##__VA_ARGS__); \ - } \ - } while (0) - -#define ASSERT_TRUE(X) \ - do { \ - if (!(X)) { \ - std::string test_name = ptest::TestManager::GetSingleton().GetRunningTestcaseName(); \ - ptest::TestManager::GetSingleton().SetTestResult(test_name, ptest::FAILED); \ - std::cerr << #X << "Assertion Failed\n" \ - << "Testcase Name: " << test_name << "\n" \ - << "File: " __FILE__ << "\tLine:" << __LINE__ << std::endl; \ - throw ptest::assertion_error(); \ - } \ - } while (0); - -// Add printing error information and error line content for quick analysis -#define ASSERT_TRUE_C(X, ERR_TYPE, format, ...) \ - do { \ - if (!(X)) { \ - std::string test_name = ptest::TestManager::GetSingleton().GetRunningTestcaseName(); \ - ptest::TestManager::GetSingleton().SetTestResult(test_name, ptest::FAILED); \ - std::cerr << #X << " Assertion Failed." 
\ - << "Testcase Name: " << test_name << " File:" __FILE__ << " Line:" << __LINE__ << std::endl; \ - PRINT_ERR("[" ERR_TYPE "]" format, ##__VA_ARGS__); \ - throw ptest::assertion_error(); \ - } \ - } while (0); - -#define CONFIG_ERR "CONFIG_ERR" -#define LOAD_MODEL_ERR "LOAD_MODEL_ERR" -#define FILE_READ_ERR "FILE_READ_ERR" -#define RUN_ERROR "RUN_ERROR" -#define MEM_ERROR "MEM_ERROR" -#define RESULT_ERR "RESULT_ERR" - -#define EXPECT_FALSE(X) EXPECT_TRUE(!(X)) -#define EXPECT_EQ(X, Y) EXPECT_TRUE(((X) == (Y))) -#define EXPECT_NE(X, Y) EXPECT_TRUE(((X) != (Y))) -#define EXPECT_GT(X, Y) EXPECT_TRUE(((X) > (Y))) -#define EXPECT_GE(X, Y) EXPECT_TRUE(((X) >= (Y))) -#define EXPECT_LT(X, Y) EXPECT_TRUE(((X) < (Y))) -#define EXPECT_LE(X, Y) EXPECT_TRUE(((X) <= (Y))) - -#define EXPECT_FALSE_C(X, ERR_TYPE, format, ...) EXPECT_TRUE_C(!(X), ERR_TYPE, format, ##__VA_ARGS__) -#define EXPECT_EQ_C(X, Y, ERR_TYPE, format, ...) EXPECT_TRUE_C(((X) == (Y)), ERR_TYPE, format, ##__VA_ARGS__) -#define EXPECT_NE_C(X, Y, ERR_TYPE, format, ...) EXPECT_TRUE_C(((X) != (Y)), ERR_TYPE, format, ##__VA_ARGS__) -#define EXPECT_GT_C(X, Y, ERR_TYPE, format, ...) EXPECT_TRUE_C(((X) > (Y)), ERR_TYPE, format, ##__VA_ARGS__) -#define EXPECT_GE_C(X, Y, ERR_TYPE, format, ...) EXPECT_TRUE_C(((X) >= (Y)), ERR_TYPE, format, ##__VA_ARGS__) -#define EXPECT_LT_C(X, Y, ERR_TYPE, format, ...) EXPECT_TRUE_C(((X) < (Y)), ERR_TYPE, format, ##__VA_ARGS__) -#define EXPECT_LE_C(X, Y, ERR_TYPE, format, ...) EXPECT_TRUE_C(((X) <= (Y)), ERR_TYPE, format, ##__VA_ARGS__) - -#define ASSERT_FALSE(X) ASSERT_TRUE(!(X)) -#define ASSERT_EQ(X, Y) ASSERT_TRUE(((X) == (Y))) -#define ASSERT_NE(X, Y) ASSERT_TRUE(((X) != (Y))) -#define ASSERT_GT(X, Y) ASSERT_TRUE(((X) > (Y))) -#define ASSERT_GE(X, Y) ASSERT_TRUE(((X) >= (Y))) -#define ASSERT_LT(X, Y) ASSERT_TRUE(((X) < (Y))) -#define ASSERT_LE(X, Y) ASSERT_TRUE(((X) <= (Y))) - -#define ASSERT_FALSE_C(X, ERR_TYPE, format, ...) 
ASSERT_TRUE_C(!(X), ERR_TYPE, format, ##__VA_ARGS__) -#define ASSERT_EQ_C(X, Y, ERR_TYPE, format, ...) ASSERT_TRUE_C(((X) == (Y)), ERR_TYPE, format, ##__VA_ARGS__) -#define ASSERT_NE_C(X, Y, ERR_TYPE, format, ...) ASSERT_TRUE_C(((X) != (Y)), ERR_TYPE, format, ##__VA_ARGS__) -#define ASSERT_GT_C(X, Y, ERR_TYPE, format, ...) ASSERT_TRUE_C(((X) > (Y)), ERR_TYPE, format, ##__VA_ARGS__) -#define ASSERT_GE_C(X, Y, ERR_TYPE, format, ...) ASSERT_TRUE_C(((X) >= (Y)), ERR_TYPE, format, ##__VA_ARGS__) -#define ASSERT_LT_C(X, Y, ERR_TYPE, format, ...) ASSERT_TRUE_C(((X) < (Y)), ERR_TYPE, format, ##__VA_ARGS__) -#define ASSERT_LE_C(X, Y, ERR_TYPE, format, ...) ASSERT_TRUE_C(((X) <= (Y)), ERR_TYPE, format, ##__VA_ARGS__) - -#endif // ST_RESNET50_PTEST_H_ diff --git a/tests/st/resnet50/resnet50_train.cc b/tests/st/resnet50/resnet50_train.cc deleted file mode 100644 index f1d1e58d..00000000 --- a/tests/st/resnet50/resnet50_train.cc +++ /dev/null @@ -1,852 +0,0 @@ -/** - * Copyright 2019-2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include -#include -#include -#include -#include -#include - -#include "common.h" -#include "ge_api.h" -#include "graph.h" -#include "ops/all_ops.h" -#include "types.h" -#include "utils/tensor_utils.h" - -using namespace std; -using namespace ge; -using namespace op; - -typedef bool (*Func)(Graph &graph); - -#define PADDING_MODE 6 -#define GRAD_PADDING_MODE 3 -vector pad_1{1, 1, 1, 1}; -vector pad_0{0, 0, 0, 0}; -vector stride_1{1, 1}; -vector stride_2{2, 2}; - -// (int out_channels, int h, int w, vector stride{1,1}, vector pad{1,1,1,1}, op::Data() input) -#define GENERATE_CONV_VAR(LAYER, BLK, OPNUM, in_channels, out_channels, h, w, stride, pad, input) \ - auto &LAYER##_##BLK##_##OPNUM##_input = input; \ - \ - TensorDesc LAYER##_##BLK##_##OPNUM##_desc(ge::Shape({out_channels, in_channels, h, w}), FORMAT_NCHW, DT_FLOAT); \ - auto LAYER##_##BLK##_##OPNUM##_weight = op::Variable(string(#LAYER) + string(#BLK) + string(#OPNUM) + "_weight"); \ - LAYER##_##BLK##_##OPNUM##_weight.update_output_desc_y(LAYER##_##BLK##_##OPNUM##_desc); \ - \ - auto LAYER##_##BLK##_##OPNUM##_mom_weight = \ - op::Variable(string(#LAYER) + string(#BLK) + string(#OPNUM) + "_mom_weight"); \ - LAYER##_##BLK##_##OPNUM##_mom_weight.update_output_desc_y(LAYER##_##BLK##_##OPNUM##_desc); \ - LAYER##_##BLK##_##OPNUM##_mom_weight.update_input_desc_x(LAYER##_##BLK##_##OPNUM##_desc); \ - \ - cout << string(#LAYER) + string(#BLK) + string(#OPNUM) << "'s weight shape is:" << in_channels << out_channels << h \ - << w << endl; \ - cout << string(#LAYER) + string(#BLK) + string(#OPNUM) \ - << "'s input_x op's shape is:" << input.GetOutputDesc("y").GetShape().GetDim(2) << endl; \ - auto LAYER##_##BLK##_##OPNUM##_tmp_dims = input.GetOutputDesc("y").GetShape().GetDims(); \ - for (auto LAYER##_##BLK##_##OPNUM##_tmp_it = LAYER##_##BLK##_##OPNUM##_tmp_dims.begin(); \ - LAYER##_##BLK##_##OPNUM##_tmp_it != LAYER##_##BLK##_##OPNUM##_tmp_dims.end(); \ - LAYER##_##BLK##_##OPNUM##_tmp_it++) { \ - cout << 
*LAYER##_##BLK##_##OPNUM##_tmp_it; \ - } \ - cout << endl; \ - \ - auto LAYER##_##BLK##_##OPNUM = op::Conv2D(string(#LAYER) + string(#BLK) + string(#OPNUM)) \ - .set_input_x(input, "y") \ - .set_input_filter(LAYER##_##BLK##_##OPNUM##_weight) \ - .set_attr_strides({1, 1, stride[0], stride[1]}) \ - .set_attr_pads(pad) \ - .set_attr_data_format("NCHW"); \ - update_op_format(LAYER##_##BLK##_##OPNUM); - -#define GENERATE_CONSTANT(LAYER, BLK, OPNUM, CONSTNAME) \ - Tensor LAYER##_##BLK##_##OPNUM##_##CONSTNAME##_tensor; \ - float *LAYER##_##BLK##_##OPNUM##_##CONSTNAME##_data = new float[LAYER##_##BLK##_##OPNUM##_size]; \ - for (int i = 0; i < (int)LAYER##_##BLK##_##OPNUM##_size; i++) { \ - *(LAYER##_##BLK##_##OPNUM##_##CONSTNAME##_data + i) = 0.01; \ - } \ - LAYER##_##BLK##_##OPNUM##_##CONSTNAME##_tensor.SetData((uint8_t *)LAYER##_##BLK##_##OPNUM##_##CONSTNAME##_data, \ - LAYER##_##BLK##_##OPNUM##_size * sizeof(float)); \ - LAYER##_##BLK##_##OPNUM##_##CONSTNAME##_tensor.SetTensorDesc(LAYER##_##BLK##_##OPNUM##_desc); \ - \ - auto LAYER##_##BLK##_##OPNUM##_##CONSTNAME##_constant = \ - op::Constant().set_attr_value(LAYER##_##BLK##_##OPNUM##_##CONSTNAME##_tensor); \ - LAYER##_##BLK##_##OPNUM##_##CONSTNAME##_constant.update_output_desc_y(LAYER##_##BLK##_##OPNUM##_desc); \ - delete[] LAYER##_##BLK##_##OPNUM##_##CONSTNAME##_data; - -#define GENERATE_CONV_VAR_VAR(LAYER, BLK, OPNUM, in_channels, out_channels, h, w, stride, pad, input) \ - TensorDesc LAYER##_##BLK##_##OPNUM##_desc(ge::Shape({out_channels, in_channels, h, w}), FORMAT_NCHW, DT_FLOAT); \ - uint32_t LAYER##_##BLK##_##OPNUM##_size = LAYER##_##BLK##_##OPNUM##_desc.GetShape().GetShapeSize(); \ - auto LAYER##_##BLK##_##OPNUM##_weight = op::Variable(string(#LAYER) + string(#BLK) + string(#OPNUM) + "_weight"); \ - LAYER##_##BLK##_##OPNUM##_weight.update_output_desc_y(LAYER##_##BLK##_##OPNUM##_desc); \ - \ - auto LAYER##_##BLK##_##OPNUM##_mom_weight = \ - op::Variable(string(#LAYER) + string(#BLK) + string(#OPNUM) + 
"_mom_weight"); \ - LAYER##_##BLK##_##OPNUM##_mom_weight.update_output_desc_y(LAYER##_##BLK##_##OPNUM##_desc); \ - \ - GENERATE_CONSTANT(LAYER, BLK, OPNUM, weight); \ - auto LAYER##_##BLK##_##OPNUM##_weight_assign = op::Assign() \ - .set_input_ref(LAYER##_##BLK##_##OPNUM##_weight) \ - .set_input_value(LAYER##_##BLK##_##OPNUM##_weight_constant); \ - \ - GENERATE_CONSTANT(LAYER, BLK, OPNUM, mom_weight); \ - auto LAYER##_##BLK##_##OPNUM##_mom_weight_assign = \ - op::Assign() \ - .set_input_ref(LAYER##_##BLK##_##OPNUM##_mom_weight) \ - .set_input_value(LAYER##_##BLK##_##OPNUM##_mom_weight_constant); \ - \ - input.push_back(LAYER##_##BLK##_##OPNUM##_weight); \ - input.push_back(LAYER##_##BLK##_##OPNUM##_mom_weight); - -// (int out_channels, Operator& input) -#define GENERATE_BN_VAR(LAYER, BLK, OPNUM, out_channels, input) \ - auto &LAYER##_##BLK##_##OPNUM##_input = input; \ - \ - TensorDesc LAYER##_##BLK##_##OPNUM##_desc(ge::Shape({1, out_channels, 1, 1}), FORMAT_NCHW, DT_FLOAT); \ - auto LAYER##_##BLK##_##OPNUM##_scale = op::Variable(string(#LAYER) + string(#BLK) + string(#OPNUM) + "_scale"); \ - LAYER##_##BLK##_##OPNUM##_scale.update_output_desc_y(LAYER##_##BLK##_##OPNUM##_desc); \ - \ - auto LAYER##_##BLK##_##OPNUM##_mom_scale = \ - op::Variable(string(#LAYER) + string(#BLK) + string(#OPNUM) + "_mom_scale"); \ - LAYER##_##BLK##_##OPNUM##_mom_scale.update_output_desc_y(LAYER##_##BLK##_##OPNUM##_desc); \ - \ - auto LAYER##_##BLK##_##OPNUM##_b = op::Variable(string(#LAYER) + string(#BLK) + string(#OPNUM) + "_b"); \ - LAYER##_##BLK##_##OPNUM##_b.update_output_desc_y(LAYER##_##BLK##_##OPNUM##_desc); \ - \ - auto LAYER##_##BLK##_##OPNUM##_mom_b = op::Variable(string(#LAYER) + string(#BLK) + string(#OPNUM) + "_mom_b"); \ - LAYER##_##BLK##_##OPNUM##_mom_b.update_output_desc_y(LAYER##_##BLK##_##OPNUM##_desc); \ - \ - auto LAYER##_##BLK##_##OPNUM##_mean = op::Variable(string(#LAYER) + string(#BLK) + string(#OPNUM) + "_mean"); \ - 
LAYER##_##BLK##_##OPNUM##_mean.update_output_desc_y(LAYER##_##BLK##_##OPNUM##_desc); \ - auto LAYER##_##BLK##_##OPNUM##_variance = \ - op::Variable(string(#LAYER) + string(#BLK) + string(#OPNUM) + "_variance"); \ - LAYER##_##BLK##_##OPNUM##_variance.update_output_desc_y(LAYER##_##BLK##_##OPNUM##_desc); \ - \ - auto LAYER##_##BLK##_##OPNUM = op::FusedBatchNorm(string(#LAYER) + string(#BLK) + string(#OPNUM)) \ - .set_input_x(input, "y") \ - .set_input_scale(LAYER##_##BLK##_##OPNUM##_scale) \ - .set_input_b(LAYER##_##BLK##_##OPNUM##_b) \ - .set_input_mean(LAYER##_##BLK##_##OPNUM##_mean) \ - .set_input_variance(LAYER##_##BLK##_##OPNUM##_variance) \ - .set_attr_mode(1) \ - .set_attr_epsilon(1e-5) \ - .set_attr_is_training(true); - -#define GENERATE_BN_VAR_VAR(LAYER, BLK, OPNUM, out_channels, input) \ - TensorDesc LAYER##_##BLK##_##OPNUM##_desc(ge::Shape({1, out_channels, 1, 1}), FORMAT_NCHW, DT_FLOAT); \ - uint32_t LAYER##_##BLK##_##OPNUM##_size = LAYER##_##BLK##_##OPNUM##_desc.GetShape().GetShapeSize(); \ - auto LAYER##_##BLK##_##OPNUM##_scale = op::Variable(string(#LAYER) + string(#BLK) + string(#OPNUM) + "_scale"); \ - LAYER##_##BLK##_##OPNUM##_scale.update_output_desc_y(LAYER##_##BLK##_##OPNUM##_desc); \ - \ - auto LAYER##_##BLK##_##OPNUM##_mom_scale = \ - op::Variable(string(#LAYER) + string(#BLK) + string(#OPNUM) + "_mom_scale"); \ - LAYER##_##BLK##_##OPNUM##_mom_scale.update_output_desc_y(LAYER##_##BLK##_##OPNUM##_desc); \ - \ - auto LAYER##_##BLK##_##OPNUM##_b = op::Variable(string(#LAYER) + string(#BLK) + string(#OPNUM) + "_b"); \ - LAYER##_##BLK##_##OPNUM##_b.update_output_desc_y(LAYER##_##BLK##_##OPNUM##_desc); \ - \ - auto LAYER##_##BLK##_##OPNUM##_mom_b = op::Variable(string(#LAYER) + string(#BLK) + string(#OPNUM) + "_mom_b"); \ - LAYER##_##BLK##_##OPNUM##_mom_b.update_output_desc_y(LAYER##_##BLK##_##OPNUM##_desc); \ - \ - auto LAYER##_##BLK##_##OPNUM##_mean = op::Variable(string(#LAYER) + string(#BLK) + string(#OPNUM) + "_mean"); \ - 
LAYER##_##BLK##_##OPNUM##_mean.update_output_desc_y(LAYER##_##BLK##_##OPNUM##_desc); \ - auto LAYER##_##BLK##_##OPNUM##_variance = \ - op::Variable(string(#LAYER) + string(#BLK) + string(#OPNUM) + "_variance"); \ - LAYER##_##BLK##_##OPNUM##_variance.update_output_desc_y(LAYER##_##BLK##_##OPNUM##_desc); \ - \ - GENERATE_CONSTANT(LAYER, BLK, OPNUM, scale); \ - \ - auto LAYER##_##BLK##_##OPNUM##_scale_assign = op::Assign() \ - .set_input_ref(LAYER##_##BLK##_##OPNUM##_scale) \ - .set_input_value(LAYER##_##BLK##_##OPNUM##_scale_constant); \ - GENERATE_CONSTANT(LAYER, BLK, OPNUM, mom_scale); \ - \ - auto LAYER##_##BLK##_##OPNUM##_mom_scale_assign = \ - op::Assign() \ - .set_input_ref(LAYER##_##BLK##_##OPNUM##_mom_scale) \ - .set_input_value(LAYER##_##BLK##_##OPNUM##_mom_scale_constant); \ - \ - GENERATE_CONSTANT(LAYER, BLK, OPNUM, b); \ - \ - auto LAYER##_##BLK##_##OPNUM##_b_assign = \ - op::Assign().set_input_ref(LAYER##_##BLK##_##OPNUM##_b).set_input_value(LAYER##_##BLK##_##OPNUM##_b_constant); \ - \ - GENERATE_CONSTANT(LAYER, BLK, OPNUM, mom_b); \ - \ - auto LAYER##_##BLK##_##OPNUM##_mom_b_assign = op::Assign() \ - .set_input_ref(LAYER##_##BLK##_##OPNUM##_mom_b) \ - .set_input_value(LAYER##_##BLK##_##OPNUM##_mom_b_constant); \ - GENERATE_CONSTANT(LAYER, BLK, OPNUM, mean); \ - \ - auto LAYER##_##BLK##_##OPNUM##_mean_assign = op::Assign() \ - .set_input_ref(LAYER##_##BLK##_##OPNUM##_mean) \ - .set_input_value(LAYER##_##BLK##_##OPNUM##_mean_constant); \ - \ - GENERATE_CONSTANT(LAYER, BLK, OPNUM, variance); \ - \ - auto LAYER##_##BLK##_##OPNUM##_variance_assign = op::Assign() \ - .set_input_ref(LAYER##_##BLK##_##OPNUM##_variance) \ - .set_input_value(LAYER##_##BLK##_##OPNUM##_variance_constant); \ - \ - input.push_back(LAYER##_##BLK##_##OPNUM##_scale); \ - input.push_back(LAYER##_##BLK##_##OPNUM##_mom_scale); \ - input.push_back(LAYER##_##BLK##_##OPNUM##_b); \ - input.push_back(LAYER##_##BLK##_##OPNUM##_mom_b); \ - input.push_back(LAYER##_##BLK##_##OPNUM##_mean); \ - 
input.push_back(LAYER##_##BLK##_##OPNUM##_variance); - -// (int out_channels, Operator& input) -#define GENERATE_RELU_VAR(LAYER, BLK, OPNUM, input) \ - auto &LAYER##_##BLK##_##OPNUM##_input = input; \ - auto LAYER##_##BLK##_##OPNUM = op::Relu(string(#LAYER) + string(#BLK) + string(#OPNUM)).set_input_x(input, "y"); - -// (int out_channels, Operator& input) -#define GENERATE_MAXPOOL_VAR(LAYER, BLK, OPNUM, input) \ - auto &LAYER##_##BLK##_##OPNUM##_input = input; \ - \ - auto LAYER##_##BLK##_##OPNUM = op::MaxPoolWithArgmax(string(#LAYER) + string(#BLK) + string(#OPNUM)) \ - .set_input_x(input, "y") \ - .set_attr_ksize({1, 3, 3, 1}) \ - .set_attr_padding("SAME") \ - .set_attr_strides({1, 2, 2, 1}); - -// (int out_channels, Operator& input) -#define GENERATE_ADD_VAR(LAYER, BLK, OPNUM, input_x1, input_x2) \ - auto LAYER##_##BLK##_##OPNUM = \ - op::Add(string(#LAYER) + string(#BLK) + string(#OPNUM)).set_input_x1(input_x1, "y").set_input_x2(input_x2, "y"); - -// (int in_channels, int out_channels,vector stride{1,1}, Operator& input) -#define MAKE_RESIDUAL_BLOCK(LAYER, BLK, in_channels, out_channels, stride, input) \ - auto &LAYER##_##BLK##_input = input; \ - auto &LAYER##_##BLK##_stride = stride; \ - int LAYER##_##BLK##_out_chls = out_channels / 4; \ - \ - GENERATE_CONV_VAR(LAYER, BLK, conv1, in_channels, LAYER##_##BLK##_out_chls, 1, 1, stride, pad_0, input); \ - GENERATE_BN_VAR(LAYER, BLK, bn1, LAYER##_##BLK##_out_chls, LAYER##_##BLK##_conv1); \ - GENERATE_RELU_VAR(LAYER, BLK, relu1, LAYER##_##BLK##_bn1); \ - \ - GENERATE_CONV_VAR(LAYER, BLK, conv2, LAYER##_##BLK##_out_chls, LAYER##_##BLK##_out_chls, 3, 3, stride_1, pad_1, \ - LAYER##_##BLK##_relu1); \ - GENERATE_BN_VAR(LAYER, BLK, bn2, LAYER##_##BLK##_out_chls, LAYER##_##BLK##_conv2); \ - GENERATE_RELU_VAR(LAYER, BLK, relu2, LAYER##_##BLK##_bn2); \ - \ - GENERATE_CONV_VAR(LAYER, BLK, conv3, LAYER##_##BLK##_out_chls, out_channels, 1, 1, stride_1, pad_0, \ - LAYER##_##BLK##_relu2); \ - GENERATE_BN_VAR(LAYER, BLK, bn3, 
out_channels, LAYER##_##BLK##_conv3); \ - \ - GENERATE_CONV_VAR(LAYER, BLK, conv4, in_channels, out_channels, 1, 1, stride, pad_0, input); \ - GENERATE_BN_VAR(LAYER, BLK, bn4, out_channels, LAYER##_##BLK##_conv4); \ - \ - GENERATE_ADD_VAR(LAYER, BLK, add5, LAYER##_##BLK##_bn3, LAYER##_##BLK##_bn4); \ - GENERATE_RELU_VAR(LAYER, BLK, relu5, LAYER##_##BLK##_add5); \ - \ - auto &LAYER##_##BLK##_output = LAYER##_##BLK##_relu5; \ - auto &LAYER##_##BLK##_output_label = "y"; - -#define MAKE_RESIDUAL_BLOCK_VAR(LAYER, BLK, in_channels, out_channels, stride, input) \ - int LAYER##_##BLK##_out_chls = out_channels / 4; \ - GENERATE_CONV_VAR_VAR(LAYER, BLK, conv1, in_channels, LAYER##_##BLK##_out_chls, 1, 1, stride, pad_0, input); \ - GENERATE_BN_VAR_VAR(LAYER, BLK, bn1, LAYER##_##BLK##_out_chls, input); \ - \ - GENERATE_CONV_VAR_VAR(LAYER, BLK, conv2, LAYER##_##BLK##_out_chls, LAYER##_##BLK##_out_chls, 3, 3, stride_1, pad_1, \ - input); \ - GENERATE_BN_VAR_VAR(LAYER, BLK, bn2, LAYER##_##BLK##_out_chls, input); \ - \ - GENERATE_CONV_VAR_VAR(LAYER, BLK, conv3, LAYER##_##BLK##_out_chls, out_channels, 1, 1, stride_1, pad_0, input); \ - GENERATE_BN_VAR_VAR(LAYER, BLK, bn3, out_channels, input); \ - \ - GENERATE_CONV_VAR_VAR(LAYER, BLK, conv4, in_channels, out_channels, 1, 1, stride, pad_0, input); \ - GENERATE_BN_VAR_VAR(LAYER, BLK, bn4, out_channels, input); - -// (int in_channels, int out_channels,vector stride{1,1}, Operator& input) -#define MAKE_NORMAL_BLOCK(LAYER, BLK, in_channels, out_channels, stride, input) \ - auto &LAYER##_##BLK##_input = input; \ - auto &LAYER##_##BLK##_stride = stride; \ - int LAYER##_##BLK##_out_chls = out_channels / 4; \ - \ - GENERATE_CONV_VAR(LAYER, BLK, conv1, in_channels, LAYER##_##BLK##_out_chls, 1, 1, stride, pad_0, input); \ - GENERATE_BN_VAR(LAYER, BLK, bn1, LAYER##_##BLK##_out_chls, LAYER##_##BLK##_conv1); \ - GENERATE_RELU_VAR(LAYER, BLK, relu1, LAYER##_##BLK##_bn1); \ - \ - GENERATE_CONV_VAR(LAYER, BLK, conv2, LAYER##_##BLK##_out_chls, 
LAYER##_##BLK##_out_chls, 3, 3, stride_1, pad_1, \ - LAYER##_##BLK##_relu1); \ - GENERATE_BN_VAR(LAYER, BLK, bn2, LAYER##_##BLK##_out_chls, LAYER##_##BLK##_conv2); \ - GENERATE_RELU_VAR(LAYER, BLK, relu2, LAYER##_##BLK##_bn2); \ - \ - GENERATE_CONV_VAR(LAYER, BLK, conv3, LAYER##_##BLK##_out_chls, out_channels, 1, 1, stride_1, pad_0, \ - LAYER##_##BLK##_relu2); \ - GENERATE_BN_VAR(LAYER, BLK, bn3, out_channels, LAYER##_##BLK##_conv3); \ - \ - GENERATE_ADD_VAR(LAYER, BLK, add5, LAYER##_##BLK##_bn3, input); \ - GENERATE_RELU_VAR(LAYER, BLK, relu5, LAYER##_##BLK##_add5); \ - \ - auto &LAYER##_##BLK##_output = LAYER##_##BLK##_relu5; \ - auto &LAYER##_##BLK##_output_label = "y"; - -#define MAKE_NORMAL_BLOCK_VAR(LAYER, BLK, in_channels, out_channels, stride, input) \ - int LAYER##_##BLK##_out_chls = out_channels / 4; \ - GENERATE_CONV_VAR_VAR(LAYER, BLK, conv1, in_channels, LAYER##_##BLK##_out_chls, 1, 1, stride, pad_0, input); \ - GENERATE_BN_VAR_VAR(LAYER, BLK, bn1, LAYER##_##BLK##_out_chls, input); \ - \ - GENERATE_CONV_VAR_VAR(LAYER, BLK, conv2, LAYER##_##BLK##_out_chls, LAYER##_##BLK##_out_chls, 3, 3, stride_1, pad_1, \ - input); \ - GENERATE_BN_VAR_VAR(LAYER, BLK, bn2, LAYER##_##BLK##_out_chls, input); \ - \ - GENERATE_CONV_VAR_VAR(LAYER, BLK, conv3, LAYER##_##BLK##_out_chls, out_channels, 1, 1, stride_1, pad_0, input); \ - GENERATE_BN_VAR_VAR(LAYER, BLK, bn3, out_channels, input); - -// (int in_channels, int out_channels,vector stride{1,1}, Operator& input) -#define MAKE_RESIDUAL_LAYER(LAYER, in_channels, out_channels, stride, input) \ - MAKE_RESIDUAL_BLOCK(LAYER, blk1, in_channels, out_channels, stride, input); \ - \ - auto &LAYER##_output = LAYER##_blk1_output; \ - auto &LAYER##_output_label = LAYER##_blk1_output_label; - -#define MAKE_RESIDUAL_LAYER_VAR(LAYER, in_channels, out_channels, stride, input) \ - MAKE_RESIDUAL_BLOCK_VAR(LAYER, blk1, in_channels, out_channels, stride, input); - -// (int in_channels, int out_channels,vector stride{1,1}, Operator& input) 
-#define MAKE_NORMAL_LAYER(LAYER, in_channels, out_channels, stride, input) \ - MAKE_NORMAL_BLOCK(LAYER, blk1, in_channels, out_channels, stride, input); \ - \ - auto &LAYER##_output = LAYER##_blk1_output; \ - auto &LAYER##_output_label = LAYER##_blk1_output_label; - -#define MAKE_NORMAL_LAYER_VAR(LAYER, in_channels, out_channels, stride, input) \ - MAKE_NORMAL_BLOCK_VAR(LAYER, blk1, in_channels, out_channels, stride, input); - -#define MAKE_RESNET50(input) \ - MAKE_RESIDUAL_LAYER(layer1, 64, 256, stride_1, input) \ - MAKE_NORMAL_LAYER(layer2, 256, 256, stride_1, layer1_output) \ - MAKE_NORMAL_LAYER(layer3, 256, 256, stride_1, layer2_output) \ - MAKE_RESIDUAL_LAYER(layer4, 256, 512, stride_2, layer3_output) \ - MAKE_NORMAL_LAYER(layer5, 512, 512, stride_1, layer4_output) \ - MAKE_NORMAL_LAYER(layer6, 512, 512, stride_1, layer5_output) \ - MAKE_NORMAL_LAYER(layer7, 512, 512, stride_1, layer6_output) \ - MAKE_RESIDUAL_LAYER(layer8, 512, 1024, stride_2, layer7_output) \ - MAKE_NORMAL_LAYER(layer9, 1024, 1024, stride_1, layer8_output) \ - MAKE_NORMAL_LAYER(layer10, 1024, 1024, stride_1, layer9_output) \ - MAKE_NORMAL_LAYER(layer11, 1024, 1024, stride_1, layer10_output) \ - MAKE_NORMAL_LAYER(layer12, 1024, 1024, stride_1, layer11_output) \ - MAKE_NORMAL_LAYER(layer13, 1024, 1024, stride_1, layer12_output) \ - MAKE_RESIDUAL_LAYER(layer14, 1024, 2048, stride_2, layer13_output) \ - MAKE_NORMAL_LAYER(layer15, 2048, 2048, stride_1, layer14_output) \ - MAKE_NORMAL_LAYER(layer16, 2048, 2048, stride_1, layer15_output) \ - \ - auto &resnet50_output = layer16_output; \ - auto &resnet50_output_label = layer16_output_label; - -#define MAKE_RESNET50_VAR(inputs) \ - MAKE_RESIDUAL_LAYER_VAR(layer1, 64, 256, stride_1, inputs) \ - MAKE_NORMAL_LAYER_VAR(layer2, 256, 256, stride_1, inputs) \ - MAKE_NORMAL_LAYER_VAR(layer3, 256, 256, stride_1, inputs) \ - MAKE_RESIDUAL_LAYER_VAR(layer4, 256, 512, stride_2, inputs) \ - MAKE_NORMAL_LAYER_VAR(layer5, 512, 512, stride_1, inputs) \ - 
MAKE_NORMAL_LAYER_VAR(layer6, 512, 512, stride_1, inputs) \ - MAKE_NORMAL_LAYER_VAR(layer7, 512, 512, stride_1, inputs) \ - MAKE_RESIDUAL_LAYER_VAR(layer8, 512, 1024, stride_2, inputs) \ - MAKE_NORMAL_LAYER_VAR(layer9, 1024, 1024, stride_1, inputs) \ - MAKE_NORMAL_LAYER_VAR(layer10, 1024, 1024, stride_1, inputs) \ - MAKE_NORMAL_LAYER_VAR(layer11, 1024, 1024, stride_1, inputs) \ - MAKE_NORMAL_LAYER_VAR(layer12, 1024, 1024, stride_1, inputs) \ - MAKE_NORMAL_LAYER_VAR(layer13, 1024, 1024, stride_1, inputs) \ - MAKE_RESIDUAL_LAYER_VAR(layer14, 1024, 2048, stride_2, inputs) \ - MAKE_NORMAL_LAYER_VAR(layer15, 2048, 2048, stride_1, inputs) \ - MAKE_NORMAL_LAYER_VAR(layer16, 2048, 2048, stride_1, inputs) \ -//--------------------------------------------------------------------------------------------- - -// (Operator& input) -#define GENERATE_BIASADD_GRAD(LAYER, BLK, OPNUM, input) \ - auto LAYER##_##BLK##_##OPNUM##_grad = \ - op::BiasAddGrad(string(#LAYER) + string(#BLK) + string(#OPNUM) + string("grad")) \ - .set_input_x(input, input.name_out_dx()); - -// (Operator& input) -#define GENERATE_MATMUL_GRAD(LAYER, BLK, OPNUM, input) \ - auto LAYER##_##BLK##_##OPNUM##_grad = \ - op::MatMul(string(#LAYER) + string(#BLK) + string(#OPNUM) + string("grad")).set_input_x1(input); - -// (Operator& input) -#define GENERATE_RESHAPE_GRAD(LAYER, BLK, OPNUM, input) \ - auto LAYER##_##BLK##_##OPNUM##_grad = \ - op::Reshape(string(#LAYER) + string(#BLK) + string(#OPNUM) + string("grad")).set_input_tensor(input); - -// (Operator& input_grad, Operator& input_maxpool) -#define GENERATE_MAXPOOL_GRAD(LAYER, BLK, OPNUM, input_grad, input_maxpool) \ - auto LAYER##_##BLK##_##OPNUM##_grad = \ - op::MaxPoolGradWithArgmax(string(#LAYER) + string(#BLK) + string(#OPNUM) + string("grad")) \ - .set_input_x(LAYER##_##BLK##_##OPNUM##_input, "y") \ - .set_input_grad(input_grad) \ - .set_input_argmax(input_maxpool, input_maxpool.name_out_argmax()) \ - .set_attr_ksize({1, 1, 3, 3}) \ - .set_attr_strides({1, 1, 
2, 2}) \ - .set_attr_padding("SAME"); - -// (Operator& input_dy) -#define GENERATE_RELU_GRAD(LAYER, BLK, OPNUM, input_dy, dy_label) \ - auto LAYER##_##BLK##_##OPNUM##_grad = op::ReluGrad(string(#LAYER) + string(#BLK) + string(#OPNUM) + string("grad")) \ - .set_input_gradients(input_dy, dy_label) \ - .set_input_features(LAYER##_##BLK##_##OPNUM, "y"); - -// (Operator& input_dy) -#define GENERATE_BN_GRAD(LAYER, BLK, OPNUM, input_dy) \ - auto LAYER##_##BLK##_##OPNUM##_grad = \ - op::FusedBatchNormGrad(string(#LAYER) + string(#BLK) + string(#OPNUM) + string("grad")) \ - .set_input_dy(input_dy, "backprops") \ - .set_input_x(LAYER##_##BLK##_##OPNUM##_input, "y") \ - .set_input_scale(LAYER##_##BLK##_##OPNUM##_scale) \ - .set_input_save_mean(LAYER##_##BLK##_##OPNUM, "save_mean") \ - .set_input_save_inv_variance(LAYER##_##BLK##_##OPNUM, "save_inv_variance") \ - .set_attr_epsilon(0.0001); \ - \ - auto LAYER##_##BLK##_##OPNUM##_momentum_scale = \ - op::ApplyMomentum() \ - .set_input_accum(LAYER##_##BLK##_##OPNUM##_mom_scale) \ - .set_input_grad(LAYER##_##BLK##_##OPNUM##_grad, LAYER##_##BLK##_##OPNUM##_grad.name_out_bn_scale()) \ - .set_input_lr(label1) \ - .set_input_momentum(label1) \ - .set_input_var(LAYER##_##BLK##_##OPNUM##_scale); \ - \ - auto LAYER##_##BLK##_##OPNUM##_momentum_b = \ - op::ApplyMomentum() \ - .set_input_accum(LAYER##_##BLK##_##OPNUM##_mom_b) \ - .set_input_grad(LAYER##_##BLK##_##OPNUM##_grad, LAYER##_##BLK##_##OPNUM##_grad.name_out_bn_bias()) \ - .set_input_lr(label1) \ - .set_input_momentum(label1) \ - .set_input_var(LAYER##_##BLK##_##OPNUM##_b); - -// (Operator& input) -#define GENERATE_CONV_PROP_FILTER(LAYER, BLK, OPNUM, input_bngrad, stride) \ - auto LAYER##_##BLK##_##OPNUM##_propfilter = \ - op::Conv2DBackpropFilterD(string(#LAYER) + string(#BLK) + string(#OPNUM) + string("_propfilter")) \ - .set_input_x(LAYER##_##BLK##_##OPNUM##_input, "y") \ - .set_attr_filter_size(LAYER##_##BLK##_##OPNUM##_desc.GetShape().GetDims()) \ - 
.set_input_out_backprop(input_bngrad, input_bngrad.name_out_dx()) \ - .set_attr_strides(stride) \ - .set_attr_pads({1, 1, 1, 1}); \ - \ - update_op_format(LAYER##_##BLK##_##OPNUM##_propfilter); \ - auto LAYER##_##BLK##_##OPNUM##_momentum_weight = op::ApplyMomentum() \ - .set_input_accum(LAYER##_##BLK##_##OPNUM##_mom_weight) \ - .set_input_grad(LAYER##_##BLK##_##OPNUM##_propfilter) \ - .set_input_lr(label1) \ - .set_input_momentum(label1) \ - .set_input_var(LAYER##_##BLK##_##OPNUM##_weight); - -///.set_attr_input_size({input_bngrad.name_out_dx().GetOutputDesc().GetShape().GetDim(0),LAYER##_##BLK##_##OPNUM##_weight.GetOutputDesc().GetShape().GetDim(1), -///input_bngrad.name_out_dx().GetOutputDesc().GetShape().GetDim(2)*stride[2], -///input_bngrad.name_out_dx().GetOutputDesc().GetShape().GetDim(3)*stride[3]}) -#define GENERATE_CONV_PROP_INPUT(LAYER, BLK, OPNUM, input_bngrad, stride) \ - auto LAYER##_##BLK##_##OPNUM##_propinput = \ - op::Conv2DBackpropInputD(string(#LAYER) + string(#BLK) + string(#OPNUM) + string("_propinput")) \ - .set_attr_input_size(LAYER##_##BLK##_##OPNUM##_input.GetOutputDesc("y").GetShape().GetDims()) \ - .set_input_filter(LAYER##_##BLK##_##OPNUM##_weight) \ - .set_input_out_backprop(input_bngrad, input_bngrad.name_out_dx()) \ - .set_attr_strides(stride) \ - .set_attr_pads({1, 1, 1, 1}); \ - cout << string(#LAYER) + string(#BLK) + string(#OPNUM) + "_propinput" \ - << "'s input_x op's shape is:" << input_bngrad.GetOutputDesc("dx").GetShape().GetDim(3) * stride[3] << endl; \ - cout << string(#LAYER) + string(#BLK) + string(#OPNUM) + "_propinput" \ - << "'s input_x op's shape is:" << input_bngrad.GetOutputDesc("dx").GetShape().GetDim(2) * stride[2] << endl; \ - \ - update_op_format(LAYER##_##BLK##_##OPNUM##_propinput); \ - auto &LAYER##_##BLK##_##OPNUM##_propinput_label = "y" - -// (int out_channels, Operator& input) -#define GENERATE_ADD_GRAD(LAYER, BLK, OPNUM, input_x1, input_x1_label, input_x2, input_x2_label) \ - auto 
LAYER##_##BLK##_##OPNUM##_grad = op::Add(string(#LAYER) + string(#BLK) + string(#OPNUM) + string("grad")) \ - .set_input_x1(input_x1, input_x1_label) \ - .set_input_x2(input_x2, input_x2_label); - -// (Operator& input) -#define MAKE_RESIDUAL_BLOCK_GRAD(LAYER, BLK, input_dy, dy_label) \ - GENERATE_RELU_GRAD(LAYER, BLK, relu5, input_dy, dy_label); \ - \ - GENERATE_BN_GRAD(LAYER, BLK, bn4, LAYER##_##BLK##_relu5_grad); \ - GENERATE_CONV_PROP_FILTER(LAYER, BLK, conv4, LAYER##_##BLK##_bn4_grad, LAYER##_##BLK##_stride); \ - GENERATE_CONV_PROP_INPUT(LAYER, BLK, conv4, LAYER##_##BLK##_bn4_grad, LAYER##_##BLK##_stride); \ - \ - GENERATE_BN_GRAD(LAYER, BLK, bn3, LAYER##_##BLK##_relu5_grad); \ - GENERATE_CONV_PROP_FILTER(LAYER, BLK, conv3, LAYER##_##BLK##_bn3_grad, stride_1); \ - GENERATE_CONV_PROP_INPUT(LAYER, BLK, conv3, LAYER##_##BLK##_bn3_grad, stride_1); \ - \ - GENERATE_RELU_GRAD(LAYER, BLK, relu2, LAYER##_##BLK##_conv3_propinput, "y"); \ - GENERATE_BN_GRAD(LAYER, BLK, bn2, LAYER##_##BLK##_relu2_grad); \ - GENERATE_CONV_PROP_FILTER(LAYER, BLK, conv2, LAYER##_##BLK##_bn2_grad, stride_1); \ - GENERATE_CONV_PROP_INPUT(LAYER, BLK, conv2, LAYER##_##BLK##_bn2_grad, stride_1); \ - \ - GENERATE_RELU_GRAD(LAYER, BLK, relu1, LAYER##_##BLK##_conv2_propinput, "y"); \ - GENERATE_BN_GRAD(LAYER, BLK, bn1, LAYER##_##BLK##_relu1_grad); \ - GENERATE_CONV_PROP_FILTER(LAYER, BLK, conv1, LAYER##_##BLK##_bn1_grad, LAYER##_##BLK##_stride); \ - GENERATE_CONV_PROP_INPUT(LAYER, BLK, conv1, LAYER##_##BLK##_bn1_grad, LAYER##_##BLK##_stride); \ - \ - GENERATE_ADD_GRAD(LAYER, BLK, add5, LAYER##_##BLK##_conv1_propinput, LAYER##_##BLK##_conv1_propinput_label, \ - LAYER##_##BLK##_conv4_propinput, LAYER##_##BLK##_conv4_propinput_label); \ - \ - auto &LAYER##_##BLK##_grad_output = LAYER##_##BLK##_add5_grad; \ - auto &LAYER##_##BLK##_grad_output_label = "y" - -// (Operator& input) -#define MAKE_NORMAL_BLOCK_GRAD(LAYER, BLK, input_dy, dy_label) \ - GENERATE_RELU_GRAD(LAYER, BLK, relu5, input_dy, dy_label); 
\ - \ - GENERATE_BN_GRAD(LAYER, BLK, bn3, LAYER##_##BLK##_relu5_grad); \ - GENERATE_CONV_PROP_FILTER(LAYER, BLK, conv3, LAYER##_##BLK##_bn3_grad, stride_1); \ - GENERATE_CONV_PROP_INPUT(LAYER, BLK, conv3, LAYER##_##BLK##_bn3_grad, stride_1); \ - \ - GENERATE_RELU_GRAD(LAYER, BLK, relu2, LAYER##_##BLK##_conv3_propinput, "y"); \ - GENERATE_BN_GRAD(LAYER, BLK, bn2, LAYER##_##BLK##_relu2_grad); \ - GENERATE_CONV_PROP_FILTER(LAYER, BLK, conv2, LAYER##_##BLK##_bn2_grad, stride_1); \ - GENERATE_CONV_PROP_INPUT(LAYER, BLK, conv2, LAYER##_##BLK##_bn2_grad, stride_1); \ - \ - GENERATE_RELU_GRAD(LAYER, BLK, relu1, LAYER##_##BLK##_conv2_propinput, "y"); \ - GENERATE_BN_GRAD(LAYER, BLK, bn1, LAYER##_##BLK##_relu1_grad); \ - GENERATE_CONV_PROP_FILTER(LAYER, BLK, conv1, LAYER##_##BLK##_bn1_grad, LAYER##_##BLK##_stride); \ - GENERATE_CONV_PROP_INPUT(LAYER, BLK, conv1, LAYER##_##BLK##_bn1_grad, LAYER##_##BLK##_stride); \ - \ - GENERATE_ADD_GRAD(LAYER, BLK, add5, LAYER##_##BLK##_conv1_propinput, LAYER##_##BLK##_conv1_propinput_label, \ - input_dy, dy_label); \ - \ - auto &LAYER##_##BLK##_grad_output = LAYER##_##BLK##_add5_grad; \ - auto &LAYER##_##BLK##_grad_output_label = "y" - -// (Operator& input_dy) -#define MAKE_RESIDUAL_LAYER_GRAD(LAYER, input_dy, dy_label) \ - MAKE_RESIDUAL_BLOCK_GRAD(LAYER, blk1, input_dy, dy_label); \ - \ - auto &LAYER##_grad_output = LAYER##_blk1_grad_output; \ - auto &LAYER##_grad_output_label = LAYER##_blk1_grad_output_label; - -// (Operator& input_dy) -#define MAKE_NORMAL_LAYER_GRAD(LAYER, input_dy, dy_label) \ - MAKE_NORMAL_BLOCK_GRAD(LAYER, blk1, input_dy, dy_label); \ - \ - auto &LAYER##_grad_output = LAYER##_blk1_grad_output; \ - auto &LAYER##_grad_output_label = LAYER##_blk1_grad_output_label; - -#define MAKE_RESNET50_GRAD(input_dy, dy_label) \ - MAKE_NORMAL_LAYER_GRAD(layer16, input_dy, dy_label) \ - MAKE_NORMAL_LAYER_GRAD(layer15, layer16_grad_output, layer16_grad_output_label) \ - MAKE_RESIDUAL_LAYER_GRAD(layer14, layer15_grad_output, 
layer15_grad_output_label) \ - MAKE_NORMAL_LAYER_GRAD(layer13, layer14_grad_output, layer14_grad_output_label) \ - MAKE_NORMAL_LAYER_GRAD(layer12, layer13_grad_output, layer13_grad_output_label) \ - MAKE_NORMAL_LAYER_GRAD(layer11, layer12_grad_output, layer12_grad_output_label) \ - MAKE_NORMAL_LAYER_GRAD(layer10, layer11_grad_output, layer11_grad_output_label) \ - MAKE_NORMAL_LAYER_GRAD(layer9, layer10_grad_output, layer10_grad_output_label) \ - MAKE_RESIDUAL_LAYER_GRAD(layer8, layer9_grad_output, layer9_grad_output_label) \ - MAKE_NORMAL_LAYER_GRAD(layer7, layer8_grad_output, layer8_grad_output_label) \ - MAKE_NORMAL_LAYER_GRAD(layer6, layer7_grad_output, layer7_grad_output_label) \ - MAKE_NORMAL_LAYER_GRAD(layer5, layer6_grad_output, layer6_grad_output_label) \ - MAKE_RESIDUAL_LAYER_GRAD(layer4, layer5_grad_output, layer5_grad_output_label) \ - MAKE_NORMAL_LAYER_GRAD(layer3, layer4_grad_output, layer4_grad_output_label) \ - MAKE_NORMAL_LAYER_GRAD(layer2, layer3_grad_output, layer3_grad_output_label) \ - MAKE_RESIDUAL_LAYER_GRAD(layer1, layer2_grad_output, layer2_grad_output_label) \ - \ - auto &resnet50_grad_output = layer1_grad_output; \ - auto &resnet50_grad_output_label = layer1_grad_output_label; - -bool resnet50(Graph &graph) { - auto data = op::Data().set_attr_index(0); - auto data1 = op::Data().set_attr_index(1); - TensorDesc shape_desc(ge::Shape({32, 3, 224, 224}), FORMAT_NCHW, DT_FLOAT); - data.update_output_desc_y(shape_desc); - - TensorDesc desc(ge::Shape({64, 3, 7, 7}), FORMAT_NCHW, DT_FLOAT); - - auto var = op::Variable("conv2d_var"); - var.update_output_desc_y(desc); - var.update_input_desc_x(desc); - - auto varw1 = op::Variable("conv2d_varw1"); - varw1.update_output_desc_y(desc); - - auto conv2d = op::Conv2D("Translate") - .set_input_x(data) - .set_input_filter(var) - .set_attr_strides({1, 1, 2, 2}) - .set_attr_pads({2, 3, 2, 3}) - .set_attr_data_format("NCHW"); - TensorDesc desc_y; - desc_y.SetFormat(FORMAT_NCHW); // shape: 32 64 112 112 - 
conv2d.update_output_desc_y(desc_y); - - TensorDesc desc1(ge::Shape({1, 64, 1, 1}), FORMAT_NCHW, DT_FLOAT); - auto var1 = op::Variable("bn_var1"); - var1.update_output_desc_y(desc1); - - auto var2 = op::Variable("bn_var2"); - var2.update_output_desc_y(desc1); - - auto var3 = op::Variable("bn_var3"); - var3.update_output_desc_y(desc1); - - auto var4 = op::Variable("bn_var4"); - var4.update_output_desc_y(desc1); - - TensorDesc desc2(ge::Shape({2048, 1001}), FORMAT_NCHW, DT_FLOAT); - - auto var5 = op::Variable("var5"); - var5.update_output_desc_y(desc2); - - auto var6 = op::Variable("var6"); - var6.update_output_desc_y(desc2); - - TensorDesc desclabel(ge::Shape({1, 1001, 1, 1}), FORMAT_NCHW, DT_FLOAT); - - auto label1 = op::Variable("label1"); - label1.update_output_desc_y(desclabel); - - TensorDesc descmatlabel(ge::Shape({1, 1001, 1, 1}), FORMAT_NCHW, DT_FLOAT); - auto matvar = op::Variable("matvar"); - matvar.update_output_desc_y(descmatlabel); - - auto matvar1 = op::Variable("matvar1"); - matvar1.update_output_desc_y(descmatlabel); - - auto bn = op::FusedBatchNorm() - .set_input_x(conv2d, "y") - .set_input_scale(var1) - .set_input_b(var2) - .set_input_mean(var3) - .set_input_variance(var4) - .set_attr_mode(1) - .set_attr_epsilon(1e-5) - .set_attr_is_training(true) - .set_attr_is_training_fusion(true) - .set_attr_moving_average_fraction(994352128); - - auto relu = op::Relu().set_input_x(bn, "y"); - - auto maxpool = op::MaxPoolWithArgmax() - .set_input_x(relu, "y") - .set_attr_ksize({1, 3, 3, 1}) - .set_attr_padding("SAME") - .set_attr_strides({1, 2, 2, 1}); - - MAKE_RESNET50(maxpool); - std::vector inputs{data}; //,var,var1,layer1_blk1_bn1_b,var3,var4}; - std::vector outputs{}; - - graph.SetInputs(inputs).SetOutputs(outputs); - return true; -} - -#define GENERATE_CONSTANT_USE_DESC(OPNUM, desc, val) \ - uint32_t OPNUM##_size = desc.GetShape().GetShapeSize(); \ - Tensor OPNUM##_tensor; \ - OPNUM##_tensor.SetTensorDesc(desc); \ - if (desc.GetDataType() == DT_FLOAT) { \ 
- float *OPNUM##_data = new float[OPNUM##_size]; \ - for (int i = 0; i < (int)OPNUM##_size; i++) { \ - *(OPNUM##_data + i) = val; \ - } \ - OPNUM##_tensor.SetData((uint8_t *)OPNUM##_data, OPNUM##_size * sizeof(float)); \ - delete[] OPNUM##_data; \ - } \ - if (desc.GetDataType() == DT_INT64) { \ - int64_t *OPNUM##_data = new int64_t[OPNUM##_size]; \ - for (int i = 0; i < (int)OPNUM##_size; i++) { \ - *(OPNUM##_data + i) = val; \ - } \ - OPNUM##_tensor.SetData((uint8_t *)OPNUM##_data, OPNUM##_size * sizeof(int64_t)); \ - delete[] OPNUM##_data; \ - } \ - auto OPNUM##_constant = op::Constant().set_attr_value(OPNUM##_tensor); \ - OPNUM##_constant.update_output_desc_y(desc); - -#define GENERATE_VAR_LAYER(OPNUM, desc, input) \ - auto OPNUM##_weight = op::Variable(string(#OPNUM)); \ - OPNUM##_weight.update_output_desc_y(desc); \ - auto OPNUM##_assign = op::Assign().set_input_ref(OPNUM##_weight).set_input_value(OPNUM##_constant); \ - \ - input.push_back(OPNUM##_weight); - -#define GENERATE_VAR_LAYER_1(OPNUM, desc, var_format, input, name) \ - auto OPNUM##_weight = op::Variable(string(name)); \ - OPNUM##_weight.update_output_desc_y(desc); \ - auto OPNUM##_assign = op::Assign().set_input_ref(OPNUM##_weight).set_input_value(OPNUM##_constant); \ - \ - input.push_back(OPNUM##_weight); - -int BuildInitVarGraph(Graph &graph) { - std::vector inputs{}; - std::vector outputs{}; - - TensorDesc desc(ge::Shape({64, 3, 7, 7}), FORMAT_NCHW, DT_FLOAT); - GENERATE_CONSTANT_USE_DESC(conv2d_var, desc, 0.01); - GENERATE_VAR_LAYER(conv2d_var, desc, inputs); - - GENERATE_CONSTANT_USE_DESC(conv2d_varw1, desc, 0.01); - GENERATE_VAR_LAYER(conv2d_varw1, desc, inputs); - - TensorDesc desc1(ge::Shape({1, 64, 1, 1}), FORMAT_NCHW, DT_FLOAT); - GENERATE_CONSTANT_USE_DESC(bn_var1, desc1, 0.01); - GENERATE_VAR_LAYER(bn_var1, desc1, inputs); - GENERATE_CONSTANT_USE_DESC(bn_var2, desc1, 0.01); - GENERATE_VAR_LAYER(bn_var2, desc1, inputs); - GENERATE_CONSTANT_USE_DESC(bn_var3, desc1, 0.01); - 
GENERATE_VAR_LAYER(bn_var3, desc1, inputs); - GENERATE_CONSTANT_USE_DESC(bn_var4, desc1, 0.01); - GENERATE_VAR_LAYER(bn_var4, desc1, inputs); - - TensorDesc desc2(ge::Shape({2048, 1001}), FORMAT_NCHW, DT_FLOAT); - GENERATE_CONSTANT_USE_DESC(var5, desc2, 0.01); - GENERATE_VAR_LAYER(var5, desc2, inputs); - GENERATE_CONSTANT_USE_DESC(var6, desc2, 0.01); - GENERATE_VAR_LAYER(var6, desc2, inputs); - - TensorDesc desclabel(ge::Shape({1, 1001, 1, 1}), FORMAT_NCHW, DT_FLOAT); - GENERATE_CONSTANT_USE_DESC(label1, desclabel, 0.1); - GENERATE_VAR_LAYER(label1, desclabel, inputs); - - TensorDesc descmatlabel(ge::Shape({1, 1001, 1, 1}), FORMAT_NCHW, DT_FLOAT); - GENERATE_CONSTANT_USE_DESC(matvar, descmatlabel, 0.01); - GENERATE_VAR_LAYER(matvar, descmatlabel, inputs); - GENERATE_CONSTANT_USE_DESC(matvar1, descmatlabel, 0.01); - GENERATE_VAR_LAYER(matvar1, descmatlabel, inputs); - - MAKE_RESNET50_VAR(inputs); - - TensorDesc ctrl(ge::Shape({1, 1, 1, 1}), FORMAT_NCHW, DT_INT64); - - GENERATE_CONSTANT_USE_DESC(iterations_per_loop, ctrl, 100); - GENERATE_VAR_LAYER_1(iterations_per_loop, ctrl, "4D", inputs, "npu_runconfig/iterations_per_loop"); - GENERATE_CONSTANT_USE_DESC(loop_cond, ctrl, 0); - GENERATE_VAR_LAYER_1(loop_cond, ctrl, "4D", inputs, "npu_runconfig/loop_cond"); - GENERATE_CONSTANT_USE_DESC(one, ctrl, 1); - GENERATE_VAR_LAYER_1(one, ctrl, "4D", inputs, "npu_runconfig/one"); - GENERATE_CONSTANT_USE_DESC(zero, ctrl, 0); - GENERATE_VAR_LAYER_1(zero, ctrl, "4D", inputs, "npu_runconfig/zero"); - - graph.SetInputs(inputs).SetOutputs(outputs); - return 0; -} -int TestBuildGraphTest(Func fun, Graph &graph, vector &inputs, vector &outputs) { - bool graph_ret = fun(graph); - ge::Tensor shapeTensor; - TensorDesc shape_desc(ge::Shape({32, 3, 224, 224}), FORMAT_NCHW, DT_FLOAT); - uint32_t sizeshape = shape_desc.GetShape().GetShapeSize(); - printf("[test] desc size filter shape:%u\n", sizeshape); - shapeTensor.SetTensorDesc(shape_desc); - vector dataValuec; - for (int i = 0; i < 
sizeshape; i++) { - dataValuec.push_back(1); - } - - shapeTensor.SetData((uint8_t *)dataValuec.data(), 4 * sizeshape); - inputs.push_back(shapeTensor); - - ge::Tensor shapeTensor1; - TensorDesc shape_desc1(ge::Shape({1, 32, 1, 1}), FORMAT_NCHW, DT_FLOAT); - uint32_t sizeshape1 = shape_desc1.GetShape().GetShapeSize(); - printf("[test] desc size filter shape:%u\n", sizeshape1); - shapeTensor1.SetTensorDesc(shape_desc1); - vector dataValuec1; - for (int i = 0; i < sizeshape1; i++) { - dataValuec1.push_back(1); - } - - shapeTensor1.SetData((uint8_t *)dataValuec1.data(), 4 * sizeshape1); - - return 0; -} -int runTrainGraph(Func fun, int loopCount) { - printf("GE BBIT begin...\n"); - std::chrono::system_clock::time_point start = std::chrono::system_clock::now(); - - std::map ge_options = { - {"device_id", "0"}, {"rank_table_file", ""}, {"graphType", "1"}, {"ge.graphRunMode", "2"}}; - - std::map session_options = {{"a", "b"}, {TRAIN_FLAG, "1"}}; - - ge::Status ret; - - // init ge - ret = GEInitialize_api_new("train", "fe,plugin"); - printf("ge::GEInitialize ret:%d\n", ret); - - // init session - ge::Session session(session_options); - - int graphId_initvar = 1; - ge::Graph graph_initvar("initVarGraph"); - bool graph_ret = BuildInitVarGraph(graph_initvar); - - // session addgraph - int graphId = 0; - - // build graph - ge::Graph graph("bigGraph"); - std::vector inputs; - ge::Tensor outputTensor; - std::vector outputs; - graph_ret = TestBuildGraphTest(fun, graph, inputs, outputs); - printf("TestReluGrad ret:%d\n", graph_ret); - - ret = session.AddGraph(graphId_initvar, graph_initvar); - printf("session.AddVarGraph ret:%d\n", ret); - if (ret) return ret; - - ret = session.AddGraph(graphId, graph); - printf("session.AddGraph ret:%d\n", ret); - if (ret) return ret; - - std::vector inputs1; - std::vector outputs1; - ret = session.RunGraph(graphId_initvar, inputs1, outputs1); - - if (ret != SUCCESS) { - return ret; - } - // add loop for test of stabilty: - for (int i = 0; i < 
loopCount; i++) { - // session rungraph - printf("loopCount:%d\n", loopCount); - ret = session.RunGraph(graphId, inputs, outputs); - printf("session.RunGraph ret:%d\n", ret); - if (ret) return ret; - - // define 99999 as loop forever - if (loopCount == 99999) i = 0; - } - std::chrono::system_clock::time_point end = std::chrono::system_clock::now(); - auto millisecondsduration = std::chrono::duration_cast(end - start); - auto ms = millisecondsduration.count(); - std::stringstream ss; - ss << ms << "ms"; - std::string run_time = ss.str(); - printf("run time is : %s \n", run_time.c_str()); - - return 0; -} - -int main(int argc, char *argv[]) { - // add loop for test of stabilty: - int loopCount = 1; - if (argc >= 2) loopCount = atoi(argv[1]); - - Status ret = SUCCESS; - ret = runTrainGraph(resnet50, loopCount); - if (ret == SUCCESS) { - std::cout << "[train resnet50 success]" << std::endl; - } else { - std::cout << "!!! train resnet50 fail !!!" << std::endl; - } - return ret; -} diff --git a/tests/st/test_ge_st.py b/tests/st/test_ge_st.py deleted file mode 100644 index b5479cfc..00000000 --- a/tests/st/test_ge_st.py +++ /dev/null @@ -1,56 +0,0 @@ -# Copyright 2019-2020 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ - -""" -ge st test. 
-""" -import pytest -import subprocess -import os - -@pytest.mark.level0 -@pytest.mark.platform_arm_ascend_training -@pytest.mark.platform_x86_ascend_training -@pytest.mark.env_card -@pytest.mark.component_ge -def test_resnet50_train(): - ge_st_dir=os.environ.get('GE_ST_DIR', - '/home/jenkins/workspace/release_pkg/gate/graphengine_lib') - ge_lib_dir=os.environ.get('GRAPHENGINE_LIB', '/home/jenkins/workspace/release_pkg/gate/graphengine_lib') - - real_pythonpath=os.environ.get('REAL_PYTHONPATH') - pythonpath=os.environ.get('PYTHONPATH') - if real_pythonpath: - if pythonpath: - os.environ['PYTHONPATH']=real_pythonpath+':'+pythonpath - else: - os.environ['PYTHONPATH']=real_pythonpath - print('PYTHONPATH: '+os.environ.get('PYTHONPATH')) - - os.environ['ASCEND_OPP_PATH']='/usr/local/Ascend/opp' - os.environ['ASCEND_ENGINE_PATH']='/usr/local/Ascend/fwkacllib/lib64/plugin/opskernel/libaicpu_engine.so:' \ - '/usr/local/Ascend/fwkacllib/lib64/plugin/opskernel/libfe.so:' \ - '/usr/local/Ascend/fwkacllib/lib64/plugin/opskernel/librts_engine.so:'+ \ - ge_lib_dir + '/libge_local_engine.so' - print('ASCEND_OPP_PATH: '+os.environ.get('ASCEND_OPP_PATH')) - print('ASCEND_ENGINE_PATH: '+os.environ.get('ASCEND_ENGINE_PATH')) - print('LD_LIBRARY_PATH: '+os.environ.get('LD_LIBRARY_PATH')) - - cmd=ge_st_dir + '/st_resnet50_train' - print('cmd: '+cmd) - os.environ['SLOG_PRINT_TO_STDOUT']="1" - ret=subprocess.call([cmd], shell=True) - assert ret==0 - diff --git a/tests/ut/ge/graph/build/logical_stream_allocator_unittest.cc b/tests/ut/ge/graph/build/logical_stream_allocator_unittest.cc index 68416409..5b87939f 100644 --- a/tests/ut/ge/graph/build/logical_stream_allocator_unittest.cc +++ b/tests/ut/ge/graph/build/logical_stream_allocator_unittest.cc @@ -306,8 +306,8 @@ class UtestLogicalStreamAllocator : public testing::Test { max_parallel_num["aicpu"] = parallel_num; Status status = AssignLogicalStreams({const1, const2, get_next, genmask1, genmask2, domask, subgraph4, subgraph5, - 
subgraph6, allreduce1, allreduce2, apply1, apply2}, - confs, max_parallel_num); + subgraph6, allreduce1, allreduce2, apply1, apply2}, + confs, max_parallel_num); EXPECT_EQ(status, ge::SUCCESS); EXPECT_EQ(GetStream(get_next), 0); diff --git a/tests/ut/ge/graph/load/new_op_test_utils.h b/tests/ut/ge/graph/load/new_op_test_utils.h index 325a3f1f..4cbc78ac 100644 --- a/tests/ut/ge/graph/load/new_op_test_utils.h +++ b/tests/ut/ge/graph/load/new_op_test_utils.h @@ -154,7 +154,7 @@ class OmeTestOpUtils { if (model->HasAttr(MODEL_ATTR_TASKS)) { ge::Buffer task_buffer; GE_CHK_BOOL_RET_STATUS(ge::AttrUtils::GetZeroCopyBytes(model, MODEL_ATTR_TASKS, task_buffer), FAILED, - "Get bytes failed."); + "Get bytes failed."); std::shared_ptr task = ge::MakeShared(); GE_CHECK_NOTNULL(task); GE_IF_BOOL_EXEC(task_buffer.GetData() == nullptr, GELOGE(FAILED, "Get data fail"); return FAILED); diff --git a/third_party/fwkacllib/inc/aicpu/aicpu_schedule/aicpu_op_type_list.h b/third_party/fwkacllib/inc/aicpu/aicpu_schedule/aicpu_op_type_list.h new file mode 100644 index 00000000..7e0f94a8 --- /dev/null +++ b/third_party/fwkacllib/inc/aicpu/aicpu_schedule/aicpu_op_type_list.h @@ -0,0 +1,60 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef AICPU_OP_TYPE_LIST_H_ +#define AICPU_OP_TYPE_LIST_H_ + +enum OpKernelType { + TF_KERNEL, + CPU_KERNEL +}; + +enum ReturnCode { + OP_TYPE_NOT_SUPPORT, + FORMAT_NOT_SUPPORT, + DTYPE_NOT_SUPPORT +}; + +#pragma pack(push, 1) +//One byte alignment +struct SysOpInfo { + uint64_t opLen; + uint64_t opType; + OpKernelType kernelsType; +}; + +struct OpParamInfo { + uint64_t num; + uint64_t dtypeList; + uint64_t formatList; +}; + +struct SysOpCheckInfo { + uint64_t opListNum; + uint64_t offSetLen; + uint64_t sysOpInfoList; + uint64_t opParamInfoList; +}; + +struct SysOpCheckResp { + uint64_t opListNum; + bool isWithoutJson; + uint64_t returnCodeList; + uint64_t sysOpInfoList; + uint64_t opParamInfoList; +}; +#pragma pack(pop) +#endif // AICPU_OP_TYPE_LIST_H_ diff --git a/third_party/fwkacllib/inc/cce/aicpu_engine_struct.h b/third_party/fwkacllib/inc/cce/aicpu_engine_struct.h index a5f43be9..8c0c1847 100644 --- a/third_party/fwkacllib/inc/cce/aicpu_engine_struct.h +++ b/third_party/fwkacllib/inc/cce/aicpu_engine_struct.h @@ -33,18 +33,22 @@ typedef enum { FMK_KERNEL_TYPE_RESERVED } FwkkernelType_t; +#pragma pack(push, 1) typedef struct { uint32_t fwkKernelType; // FwkkernelType_t union { ::aicpu::FWKAdapter::FWKOperateParam fwk_kernel; } fwkKernelBase; -} __attribute__((packed)) STR_FWK_OP_KERNEL; +} STR_FWK_OP_KERNEL; +#pragma pack(pop) +#pragma pack(push, 1) struct SessionInfo { uint64_t sessionId; uint64_t kernelId; bool sessFlag; -} __attribute__((packed)); +}; +#pragma pack(pop) #ifdef __cplusplus } diff --git a/third_party/fwkacllib/inc/cce/fwk_adpt_struct.h b/third_party/fwkacllib/inc/cce/fwk_adpt_struct.h index 79d94023..50b39d91 100644 --- a/third_party/fwkacllib/inc/cce/fwk_adpt_struct.h +++ b/third_party/fwkacllib/inc/cce/fwk_adpt_struct.h @@ -70,6 +70,7 @@ enum FWKExtUpdateAddrType { FWK_ADPT_UPDATE_INPUT_OUTPUT }; +#pragma pack(push, 1) // API Parameter Structure struct StrFWKKernel { FWKOperateType opType; @@ -89,31 +90,39 @@ struct StrFWKKernel { 
uint64_t extInfoLen; // extend info total length uint64_t extInfoAddr; // extend info addr, ExtInfo structure -} __attribute__((packed)); +}; +#pragma pack(pop) typedef StrFWKKernel FWKOperateParam; // Extent info ShapeAndType const uint32_t kMaxShapeDims = 8; +#pragma pack(push, 1) struct ShapeAndType { int32_t type; int64_t dims[kMaxShapeDims]; -} __attribute__((packed)); +}; +#pragma pack(pop) // Extend info structure for extInfoAddr const uint32_t kExtInfoHeadSize = 8; + +#pragma pack(push, 1) struct ExtInfo { int32_t infoType; // extend type uint32_t infoLen; // length for infoMsg char infoMsg[0]; // extend value -} __attribute__((packed)); +}; +#pragma pack(pop) +#pragma pack(push, 1) struct ResultSummary { uint64_t shape_data_ptr; // shape data addr, need convert to void* uint64_t shape_data_size; // num of dims uint64_t raw_data_ptr; // raw data addr, need convert to void* uint64_t raw_data_size; // size of raw data -} __attribute__((packed)); +}; +#pragma pack(pop) } // end namespace FWKAdapter } // namespace aicpu diff --git a/third_party/fwkacllib/inc/hccl/base.h b/third_party/fwkacllib/inc/hccl/base.h index 8194097e..9facd20c 100644 --- a/third_party/fwkacllib/inc/hccl/base.h +++ b/third_party/fwkacllib/inc/hccl/base.h @@ -22,7 +22,8 @@ #ifndef HCCL_BASE_H_ #define HCCL_BASE_H_ - +#include +#include #ifdef __cplusplus extern "C" { #endif // __cplusplus @@ -95,6 +96,33 @@ typedef void *rtStream_t; */ typedef void *rtModel_t; +struct HcomOperation { + std::string hcclType; + void *inputPtr; + void *outputPtr; + u64 count; + HcclDataType dataType; + HcclReduceOp opType; + u32 root; + + HcomOperation() + { + inputPtr = nullptr; + outputPtr = nullptr; + count = 0; + dataType = HCCL_DATA_TYPE_RESERVED; + opType = HCCL_REDUCE_RESERVED; + root = 0; + } +}; + +struct HcomRemoteAccessAddrInfo { + u32 remotetRankID; + u64 remoteAddr; // host embedding table address + u64 localAddr; // device HBM address + u64 length; // Memory Length in Bytes +}; + #ifdef 
__cplusplus } #endif // __cplusplus diff --git a/third_party/fwkacllib/inc/hccl/hccl_types.h b/third_party/fwkacllib/inc/hccl/hccl_types.h new file mode 100644 index 00000000..50a64795 --- /dev/null +++ b/third_party/fwkacllib/inc/hccl/hccl_types.h @@ -0,0 +1,101 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file hccl_types.h + * @brief HCCL data type definition + * + */ + +#ifndef HCCL_TYPES_H_ +#define HCCL_TYPES_H_ + +#include + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +/** + * @brief HCCL functions return value definition + */ +typedef enum { + HCCL_SUCCESS = 0, /**< success */ + HCCL_E_PARA = 1, /**< parameter error */ + HCCL_E_PTR = 2, /**< empty pointer */ + HCCL_E_MEMORY = 3, /**< memory error */ + HCCL_E_INTERNAL = 4, /**< internal error */ + HCCL_E_NOT_SUPPORT = 5, /**< not support feature */ + HCCL_E_NOT_FOUND = 6, /**< not found specific resource */ + HCCL_E_UNAVAIL = 7, /**< resource unavailable */ + HCCL_E_SYSCALL = 8, /**< call system interface error */ + HCCL_E_TIMEOUT = 9, /**< timeout */ + HCCL_E_OPEN_FILE_FAILURE = 10, /**< open file fail */ + HCCL_E_TCP_CONNECT = 11, /**< tcp connect fail */ + HCCL_E_ROCE_CONNECT = 12, /**< roce connect fail */ + HCCL_E_TCP_TRANSFER = 13, /**< tcp transfer fail */ + HCCL_E_ROCE_TRANSFER = 14, /**< roce transfer fail */ + HCCL_E_RUNTIME = 15, /**< call runtime api fail */ + HCCL_E_DRV = 16, /**< call driver 
api fail */ + HCCL_E_PROFILING = 17, /**< call profiling api fail */ + HCCL_E_CCE = 18, /**< call cce api fail */ + HCCL_E_NETWORK = 19, /**< call network api fail */ + HCCL_E_RESERVED /**< reserved */ +} HcclResult; + +/** + * @brief handle to HCCL communicator + */ +typedef void *HcclComm; + +/** + * @brief HCCL Reduction opperation + */ +typedef enum { + HCCL_REDUCE_SUM = 0, /**< sum */ + HCCL_REDUCE_PROD = 1, /**< prod */ + HCCL_REDUCE_MAX = 2, /**< max */ + HCCL_REDUCE_MIN = 3, /**< min */ + HCCL_REDUCE_RESERVED /**< reserved */ +} HcclReduceOp; + +/** + * @brief HCCL data type + */ +typedef enum { + HCCL_DATA_TYPE_INT8 = 0, /**< int8 */ + HCCL_DATA_TYPE_INT16 = 1, /**< int16 */ + HCCL_DATA_TYPE_INT32 = 2, /**< int32 */ + HCCL_DATA_TYPE_FP16 = 3, /**< fp16 */ + HCCL_DATA_TYPE_FP32 = 4, /**< fp32 */ + HCCL_DATA_TYPE_INT64 = 5, /**< int64 */ + HCCL_DATA_TYPE_UINT64 = 6, /**< uint64 */ + HCCL_DATA_TYPE_RESERVED /**< reserved */ +} HcclDataType; + +const uint32_t HCCL_ROOT_INFO_BYTES = 4108; // 4108: root info length + +/** + * @brief HCCL root info + */ +typedef struct HcclRootInfoDef { + char internal[HCCL_ROOT_INFO_BYTES]; +} HcclRootInfo; + +#ifdef __cplusplus +} +#endif // __cplusplus +#endif // HCCL_TYPES_H_ diff --git a/third_party/fwkacllib/inc/hccl/hcom.h b/third_party/fwkacllib/inc/hccl/hcom.h index de140b4b..e491d43f 100644 --- a/third_party/fwkacllib/inc/hccl/hcom.h +++ b/third_party/fwkacllib/inc/hccl/hcom.h @@ -24,6 +24,8 @@ #include #include +#include +#include #ifdef __cplusplus extern "C" { @@ -40,6 +42,15 @@ extern "C" { */ HcclResult hcom_get_rank_size(const char *group, u32 *rankSize); +/** + * @brief Get the rank number in the group. + * + * @param group A string identifying the group name. + * @param rankSize A pointer identifying the rank number. + * @return HcclResult + */ +HcclResult HcomGetRankSize(const char *group, u32 *rankSize); + /** * @brief Get the rank number of this rank's server within the group. 
* @@ -49,6 +60,15 @@ HcclResult hcom_get_rank_size(const char *group, u32 *rankSize); */ HcclResult hcom_get_local_rank_size(const char *group, u32 *localRankSize); +/** + * @brief Get the rank number of this rank's server within the group. + * + * @param group A string identifying the group name. + * @param localRankSize A pointer identifying the rank number. + * @return HcclResult + */ +HcclResult HcomGetLocalRankSize(const char *group, u32 *localRankSize); + /** * @brief Get the rank id of this rank. * @@ -58,6 +78,15 @@ HcclResult hcom_get_local_rank_size(const char *group, u32 *localRankSize); */ HcclResult hcom_get_rank_id(const char *group, u32 *rankId); +/** + * @brief Get the rank id of this rank. + * + * @param group A string identifying the group name. + * @param rankId A pointer identifying the rank id. + * @return HcclResult + */ +HcclResult HcomGetRankId(const char *group, u32 *rankId); + /** * @brief Get the local rank id of this rank's server within the group. * @@ -67,6 +96,15 @@ HcclResult hcom_get_rank_id(const char *group, u32 *rankId); */ HcclResult hcom_get_local_rank_id(const char *group, u32 *localRankId); +/** + * @brief Get the local rank id of this rank's server within the group. + * + * @param group A string identifying the group name. + * @param localRankId A pointer identifying the local rank id. + * @return HcclResult + */ +HcclResult HcomGetLocalRankId(const char *group, u32 *localRankId); + /** * @brief Get the world rank id according to the group rank id. * @@ -77,6 +115,16 @@ HcclResult hcom_get_local_rank_id(const char *group, u32 *localRankId); */ HcclResult hcom_get_world_rank_from_group_rank(const char *group, u32 groupRank, u32 *worldRank); +/** + * @brief Get the world rank id according to the group rank id. + * + * @param group A string identifying the group name. + * @param groupRank An integer(u32) identifying the group rank id. + * @param worldRank A pointer identifying the world rank id. 
+ * @return HcclResult + */ +HcclResult HcomGetWorldRankFromGroupRank(const char *group, u32 groupRank, u32 *worldRank); + /** * @brief Get the group rank id according to the world rank id. * @@ -87,6 +135,16 @@ HcclResult hcom_get_world_rank_from_group_rank(const char *group, u32 groupRank, */ HcclResult hcom_get_group_rank_from_world_rank(u32 worldRank, const char *group, u32 *groupRank); +/** + * @brief Get the group rank id according to the world rank id. + * + * @param worldRank An integer(u32) identifying the world rank id. + * @param group A string identifying the group name. + * @param groupRank A pointer identifying the group rank id. + * @return HcclResult + */ +HcclResult HcomGetGroupRankFromWorldRank(u32 worldRank, const char *group, u32 *groupRank); + /** * @brief Create group. * @@ -97,6 +155,16 @@ HcclResult hcom_get_group_rank_from_world_rank(u32 worldRank, const char *group, */ HcclResult hcom_create_group(const char *group, u32 rankNum, u32 *rankIds); +/** + * @brief Create group. + * + * @param group A string identifying the group name. + * @param rankNum An integer(u32) identifying the number of ranks in the group. + * @param rankIds A list identifying the ranks in the group. + * @return HcclResult + */ +HcclResult HcomCreateGroup(const char *group, u32 rankNum, u32 *rankIds); + /** * @brief Destroy group * @@ -105,6 +173,14 @@ HcclResult hcom_create_group(const char *group, u32 rankNum, u32 *rankIds); */ HcclResult hcom_destroy_group(const char *group); +/** + * @brief Destroy group + * + * @param group A string identifying the group name. + * @return HcclResult + */ +HcclResult HcomDestroyGroup(const char *group); + /** * @brief Set the gradient split strategy with in the group, according to gradient index. 
* @@ -115,6 +191,16 @@ HcclResult hcom_destroy_group(const char *group); */ extern HcclResult hcom_set_split_strategy_by_index(const char *group, u32 segmentNum, const u32 *IdxList); +/** + * @brief Set the gradient split strategy with in the group, according to gradient index. + * + * @param group A string identifying the group name. + * @param segmentNum An integer(u32) identifying the segments number of gradients. + * @param IdxList A list identifying the index of end gradient in each segment. + * @return HcclResult + */ +extern HcclResult HcomSetGradFusionByIndex(const char *group, u32 segmentNum, const u32 *IdxList); + /** * @brief Set the gradient split strategy with in the group, according to gradient data size. * @@ -125,6 +211,16 @@ extern HcclResult hcom_set_split_strategy_by_index(const char *group, u32 segmen */ extern HcclResult hcom_set_split_strategy_by_size(const char *group, u32 segmentNum, const float *sizeList); +/** + * @brief Set the gradient split strategy with in the group, according to gradient data size. + * + * @param group A string identifying the group name. + * @param segmentNum An integer(u32) identifying the segments number of gradients. + * @param sizeList A list identifying the percent of each segment. + * @return HcclResult + */ +extern HcclResult HcomSetGradFusionBySize(const char *group, u32 segmentNum, const float *sizeList); + /** * @brief Register memories and init resources for remote access. * @@ -134,6 +230,25 @@ extern HcclResult hcom_set_split_strategy_by_size(const char *group, u32 segment */ extern HcclResult hcom_remote_access_mem_register(const MemRegisterAddr* addrList, u32 count); +/** + * @brief Register memories and init resources for remote access. + * + * @param addrList memory addresses for remote access. + * @param count number of remote memory addresses. 
+ * @return HcclResult + */ +extern HcclResult HcomRegRemoteAccessMem(const MemRegisterAddr* addrList, u32 count); + +HcclResult HcomExecInitialize(); + +HcclResult HcomExecFinalize(); + +HcclResult HcomExecEnqueueOperation(HcomOperation opInfo, std::function callback); + +HcclResult HcomExecEnqueueRemoteAccess(const std::string& remoteAccessType, + const std::vector& addrInfos, + std::function callback); + #ifdef __cplusplus } #endif // __cplusplus diff --git a/third_party/fwkacllib/inc/mmpa/sub_inc/mmpa_linux.h b/third_party/fwkacllib/inc/mmpa/sub_inc/mmpa_linux.h index c74f95ac..66638bbb 100644 --- a/third_party/fwkacllib/inc/mmpa/sub_inc/mmpa_linux.h +++ b/third_party/fwkacllib/inc/mmpa/sub_inc/mmpa_linux.h @@ -50,7 +50,7 @@ typedef int (*mmFilter)(const mmDirent *entry); typedef int (*mmFilter2)(const mmDirent2 *entry); typedef int (*mmSort)(const mmDirent **a, const mmDirent **b); typedef int (*mmSort2)(const mmDirent2 **a, const mmDirent2 **b); -typedef size_t mmSize_t; +typedef size_t mmSize_t; //lint !e410 !e1051 typedef off_t mmOfft_t; typedef pid_t mmPid_t; typedef long MM_LONG; @@ -215,6 +215,10 @@ typedef struct { #define S_IWRITE S_IWUSR #endif +#define mm_no_argument no_argument +#define mm_required_argument required_argument +#define mm_optional_argument optional_argument + #define M_FILE_RDONLY O_RDONLY #define M_FILE_WRONLY O_WRONLY #define M_FILE_RDWR O_RDWR @@ -412,8 +416,12 @@ MMPA_FUNC_VISIBILITY VOID mmClosePipe(mmPipeHandle pipe[], UINT32 pipeCount); // Poll related interface MMPA_FUNC_VISIBILITY mmCompletionHandle mmCreateCompletionPort(); MMPA_FUNC_VISIBILITY VOID mmCloseCompletionPort(mmCompletionHandle handle); -MMPA_FUNC_VISIBILITY INT32 mmPoll(mmPollfd *fds, INT32 fdCount, INT32 timeout, mmCompletionHandle handleIOCP, - pmmPollData polledData, mmPollBack pollBack); +MMPA_FUNC_VISIBILITY INT32 mmPoll(mmPollfd *fds, + INT32 fdCount, + INT32 timeout, + mmCompletionHandle handleIOCP, + pmmPollData polledData, + mmPollBack pollBack); 
MMPA_FUNC_VISIBILITY INT32 mmGetErrorCode(); MMPA_FUNC_VISIBILITY CHAR *mmGetErrorFormatMessage(mmErrorMsg errnum, CHAR *buf, mmSize size); MMPA_FUNC_VISIBILITY INT32 mmGetTimeOfDay(mmTimeval *timeVal, mmTimezone *timeZone); diff --git a/third_party/fwkacllib/inc/mmpa/sub_inc/mmpa_win.h b/third_party/fwkacllib/inc/mmpa/sub_inc/mmpa_win.h index a5a22b4f..aa58e722 100644 --- a/third_party/fwkacllib/inc/mmpa/sub_inc/mmpa_win.h +++ b/third_party/fwkacllib/inc/mmpa/sub_inc/mmpa_win.h @@ -237,6 +237,11 @@ typedef struct { } mmThreadAttr; typedef VOID (*mmPf)(VOID); + +#define mm_no_argument 0 +#define mm_required_argument 1 +#define mm_optional_argument 2 + #define M_FILE_RDONLY GENERIC_READ #define M_FILE_WRONLY GENERIC_WRITE #define M_FILE_RDWR (GENERIC_READ | GENERIC_WRITE) diff --git a/third_party/fwkacllib/inc/runtime/base.h b/third_party/fwkacllib/inc/runtime/base.h index 85f16cc5..aa8263f9 100644 --- a/third_party/fwkacllib/inc/runtime/base.h +++ b/third_party/fwkacllib/inc/runtime/base.h @@ -18,6 +18,7 @@ #define __CCE_RUNTIME_BASE_H__ #include +#include "toolchain/prof_callback.h" #if defined(__cplusplus) && !defined(COMPILE_OMG_PACKAGE) extern "C" { @@ -86,10 +87,20 @@ typedef struct rtExceptionInfo { uint32_t deviceid; } rtExceptionInfo; +typedef struct rtTaskFailInfo { + uint32_t taskid; + uint32_t streamid; + uint32_t tid; + uint32_t deviceid; + uint32_t retcode; +} rtTaskFailInfo; + typedef void (*rtErrorCallback)(rtExceptionType); typedef void (*rtTaskFailCallback)(rtExceptionInfo *exceptionInfo); +typedef void (*rtTaskFailCallbackByModule)(rtTaskFailInfo *exceptionInfo); + typedef void (*rtDeviceStateCallback)(uint32_t devId, bool isOpen); /** @@ -146,6 +157,12 @@ RTS_API rtError_t rtProfilerStop(uint64_t profConfig, int32_t numsDev, uint32_t* */ RTS_API rtError_t rtProfilerTrace(uint64_t id, bool notify, uint32_t flags, rtStream_t stream); +/** + * @ingroup profiling_base + * @brief ts set profiling reporter callback. 
+ */ +RTS_API rtError_t rtSetMsprofReporterCallback(MsprofReporterCallback callback); + /** * @ingroup dvrt_base * @brief Returns the last error from a runtime call. @@ -184,6 +201,16 @@ RTS_API rtError_t rtSetTaskFailCallback(rtTaskFailCallback callback); */ RTS_API rtError_t rtRegDeviceStateCallback(const char *regName, rtDeviceStateCallback callback); +/** + * @ingroup dvrt_base + * @brief register callback for fail task + * @param [in] uniName unique register name, can't be null + * @param [in] callback fail task callback function + * @param [out] NA + * @return RT_ERROR_NONE for ok + */ +RTS_API rtError_t rtRegTaskFailCallbackByModule(const char *moduleName, rtTaskFailCallbackByModule callback); + /** * @ingroup dvrt_base * @brief notify handle. diff --git a/third_party/fwkacllib/inc/runtime/config.h b/third_party/fwkacllib/inc/runtime/config.h index c471f128..c35a1278 100644 --- a/third_party/fwkacllib/inc/runtime/config.h +++ b/third_party/fwkacllib/inc/runtime/config.h @@ -121,14 +121,6 @@ typedef struct tagRtMemoryConfig { typedef struct tagRtPlatformConfig { uint32_t platformConfig; } rtPlatformConfig_t; -/** - * @ingroup - * @brief get platform - * @param [in] platForm - * @return platForm - */ -RTS_API rtError_t rtGetPlatformConfig(rtPlatformConfig_t *platForm); - /** * @ingroup * @brief get AI core count @@ -169,13 +161,6 @@ RTS_API rtError_t rtGetAiCoreMemoryRates(rtAiCoreMemoryRates_t *aiCoreMemoryRate */ RTS_API rtError_t rtGetMemoryConfig(rtMemoryConfig_t *memoryConfig); -/** - * @ingroup - * @brief set platform in gen ctx - * @param [in] platForm - * @return RT_ERROR_NONE for ok, errno for failed - */ -RTS_API rtError_t rtSetPlatformType(rtPlatformType_t platformType); /** * @ingroup diff --git a/third_party/fwkacllib/inc/tdt/tsd_client.h b/third_party/fwkacllib/inc/tdt/tsd_client.h index 6066a12e..665c8b82 100644 --- a/third_party/fwkacllib/inc/tdt/tsd_client.h +++ b/third_party/fwkacllib/inc/tdt/tsd_client.h @@ -23,6 +23,7 @@ #include #include 
"tdt/status.h" #include "tdt/data_common.h" +#include "toolchain/prof_callback.h" #ifdef __cplusplus extern "C" { @@ -37,7 +38,7 @@ extern "C" { * Used for the Framework process to communicate with the TSDDaemon process, * and notify TSD to complete the initialization of other processes * -* @param phyDeviceId [IN] type #unsigned int. Physical device ID +* @param logicDeviceId [IN] type #unsigned int. Logic device ID * @param rankSize [IN] type #unsigned int. The rankSize of the training. * The default value is 1. When rankSize is greater than 1, * HCCP will be pulled to perform set communication related operations. @@ -49,7 +50,7 @@ extern "C" { * @li tsd_client.h: Header file where the interface declaration is located. * @li data_common.h: Header file where 'TDT_StatusT' defined */ -TDT_LIB_EXPORT TDT_StatusT TsdOpen(const uint32_t phyDeviceId, const uint32_t rankSize); +TDT_LIB_EXPORT TDT_StatusT TsdOpen(const uint32_t logicDeviceId, const uint32_t rankSize); /** * @ingroup Close @@ -67,7 +68,7 @@ TDT_LIB_EXPORT TDT_StatusT TsdOpen(const uint32_t phyDeviceId, const uint32_t ra * @li tsd_client.h: Header file where the interface declaration is located. * @li data_common.h: Header file where 'TDT_StatusT' defined */ -TDT_LIB_EXPORT TDT_StatusT TsdClose(const uint32_t phyDeviceId); +TDT_LIB_EXPORT TDT_StatusT TsdClose(const uint32_t logicDeviceId); /** * @ingroup UpdateProfilingMode @@ -85,7 +86,26 @@ TDT_LIB_EXPORT TDT_StatusT TsdClose(const uint32_t phyDeviceId); * @li tsd_client.h: Header file where the interface declaration is located. 
* @li data_common.h: Header file where 'TDT_StatusT' defined */ -TDT_LIB_EXPORT TDT_StatusT UpdateProfilingMode(const uint32_t phyDeviceId, const uint32_t flag); +TDT_LIB_EXPORT TDT_StatusT UpdateProfilingMode(const uint32_t logicDeviceId, const uint32_t flag); + +/** +* @ingroup TsdSetMsprofReporterCallback +* @brief 用于推理场景下设置aicpu的profilng的callback函数 +* +* @par Function +* 设置offline模式下aicpu_sd进程的profiling的callback函数 +* +* @param callback [IN] type #MsprofReporterCallback. 回调函数 +* @retval TDT_OK Success +* @retval OtherValues Failure +* +* @par Dependency +* @li libtsdclient.so: Library to which the interface belongs. +* @li tsd_client.h: Header file where the interface declaration is located. +* @li data_common.h: Header file where 'TDT_StatusT' defined +* @li prof_callback.h: Headerfile where 'MsprofReporterCallback' defined +*/ +TDT_LIB_EXPORT TDT_StatusT TsdSetMsprofReporterCallback(MsprofReporterCallback callback); /** * @ingroup CreateCmdParameterObj diff --git a/third_party/fwkacllib/inc/toolchain/prof_acl_api.h b/third_party/fwkacllib/inc/toolchain/prof_acl_api.h index 430ed14d..efb37cfb 100644 --- a/third_party/fwkacllib/inc/toolchain/prof_acl_api.h +++ b/third_party/fwkacllib/inc/toolchain/prof_acl_api.h @@ -17,380 +17,76 @@ #ifndef MSPROFILER_API_PROF_ACL_API_H_ #define MSPROFILER_API_PROF_ACL_API_H_ -#define MSVP_MAX_DEV_NUM 64 -#ifndef OS_TYPE -#define OS_TYPE 0 -#endif // OS_TYPE - - -#if (OS_TYPE != LINUX) -#define MSVP_PROF_API __declspec(dllexport) -#else -#define MSVP_PROF_API __attribute__((visibility("default"))) -#endif - // DataTypeConfig -#define PROF_ACL_API 0x0001 -#define PROF_TASK_TIME 0x0002 -#define PROF_AICORE_METRICS 0x0004 -#define PROF_AICPU_TRACE 0x0008 -#define PROF_MODEL_EXECUTE 0x0010 -#define PROF_RUNTIME_API 0x0020 -#define PROF_RUNTIME_TRACE 0x0040 -#define PROF_SCHEDULE_TIMELINE 0x0080 -#define PROF_SCHEDULE_TRACE 0x0100 -#define PROF_AIVECTORCORE_METRICS 0x0200 -#define PROF_SUBTASK_TIME 0x0400 - -#define 
PROF_TRAINING_TRACE 0x0800 -#define PROF_HCCL_TRACE 0x1000 -#define PROF_DATA_PROCESS 0x2000 -#define PROF_TASK_TRACE 0x3842 +#define PROF_ACL_API 0x00000001 +#define PROF_TASK_TIME 0x00000002 +#define PROF_AICORE_METRICS 0x00000004 +#define PROF_AICPU_TRACE 0x00000008 +#define PROF_MODEL_EXECUTE 0x00000010 +#define PROF_RUNTIME_API 0x00000020 +#define PROF_RUNTIME_TRACE 0x00000040 +#define PROF_SCHEDULE_TIMELINE 0x00000080 +#define PROF_SCHEDULE_TRACE 0x00000100 +#define PROF_AIVECTORCORE_METRICS 0x00000200 +#define PROF_SUBTASK_TIME 0x00000400 + +#define PROF_TRAINING_TRACE 0x00000800 +#define PROF_HCCL_TRACE 0x00001000 + +#define PROF_TASK_TRACE 0x00001852 + +// system profilinig switch +#define PROF_CPU 0x00010000 +#define PROF_HARDWARE_MEMORY 0x00020000 +#define PROF_IO 0x00040000 +#define PROF_INTER_CONNECTION 0x00080000 +#define PROF_DVPP 0x00100000 +#define PROF_SYS_AICORE_SAMPLE 0x00200000 +#define PROF_AIVECTORCORE_SAMPLE 0x00400000 #define PROF_MODEL_LOAD 0x8000000000000000 // DataTypeConfig MASK -#define PROF_ACL_API_MASK 0x0001 -#define PROF_TASK_TIME_MASK 0x0002 -#define PROF_AICORE_METRICS_MASK 0x0004 -#define PROF_AICPU_TRACE_MASK 0x0008 -#define PROF_MODEL_EXECUTE_MASK 0x0010 -#define PROF_RUNTIME_API_MASK 0x0020 -#define PROF_RUNTIME_TRACE_MASK 0x0040 -#define PROF_SCHEDULE_TIMELINE_MASK 0x0080 -#define PROF_SCHEDULE_TRACE_MASK 0x0100 -#define PROF_AIVECTORCORE_METRICS_MASK 0x0200 -#define PROF_SUBTASK_TIME_MASK 0x0400 - -#define PROF_TRAINING_TRACE_MASK 0x0800 -#define PROF_HCCL_TRACE_MASK 0x1000 -#define PROF_DATA_PROCESS_MASK 0x2000 +#define PROF_ACL_API_MASK 0x00000001 +#define PROF_TASK_TIME_MASK 0x00000002 +#define PROF_AICORE_METRICS_MASK 0x00000004 +#define PROF_AICPU_TRACE_MASK 0x00000008 +#define PROF_MODEL_EXECUTE_MASK 0x00000010 +#define PROF_RUNTIME_API_MASK 0x00000020 +#define PROF_RUNTIME_TRACE_MASK 0x00000040 +#define PROF_SCHEDULE_TIMELINE_MASK 0x00000080 +#define PROF_SCHEDULE_TRACE_MASK 0x00000100 +#define 
PROF_AIVECTORCORE_METRICS_MASK 0x00000200 +#define PROF_SUBTASK_TIME_MASK 0x00000400 + +#define PROF_TRAINING_TRACE_MASK 0x00000800 +#define PROF_HCCL_TRACE_MASK 0x00001000 + +// system profilinig mask +#define PROF_CPU_MASK 0x00010000 +#define PROF_HARDWARE_MEMORY_MASK 0x00020000 +#define PROF_IO_MASK 0x00040000 +#define PROF_INTER_CONNECTION_MASK 0x00080000 +#define PROF_DVPP_MASK 0x00100000 +#define PROF_SYS_AICORE_SAMPLE_MASK 0x00200000 +#define PROF_AIVECTORCORE_SAMPLE_MASK 0x00400000 #define PROF_MODEL_LOAD_MASK 0x8000000000000000 #include -#include - -/** - * @name ProrErrorCode - * @brief error code enum of prof_acl_apis - */ -enum ProfErrorCode { - PROF_ERROR_NONE = 0, // ok - PROF_ERROR_PARAM_INVALID, // param invalid, for example nullptr - PROF_ERROR_REPEAT_INIT, // profiling has already been inited - PROF_ERROR_CONFIG_INVALID, // config invalid, for example invalid json string - PROF_ERROR_DIR_NO_ACCESS, // dir is not accessable - PROF_ERROR_FAILURE, // failed to init or start profiling - PROF_ERROR_NOT_INITED, // profiling has not been inited - PROF_ERROR_DEVICE_INVALID, // device id invalid - PROF_ERROR_UNSUPPORTED, // unsupported data type or ai core metrics - PROF_ERROR_REPEAT_START, // profiilng has already been started - PROF_ERROR_NOT_STARTED, // profiling has not been started - PROF_ERROR_REPEAT_SUBSCRIBE, // same model id has already been subscribed - PROF_ERROR_MODEL_ID_INVALID, // model id does not exist or has not been subscribed - PROF_ERROR_API_CONFLICT, // prof ctrl api mode conflicts with subscribe mode -}; - -/** - * @brief transfer profiling config in acl.json to sample config - * @param aclCfg [IN] profiling json string from acl.json as {"switch":"on", "result_path":"/home",...} - * @param sampleCfg [OUT] json string for GE as {"startCfg":[{"deviceID":"all","jobID":"1234",...}]} - * @return ProfErrorCode - */ -MSVP_PROF_API int32_t ProfAclCfgToSampleCfg(const std::string &aclCfg, std::string &sampleCfg); - -/** - * @name ProfInit - * 
@brief init profiling - * @param profInitCfg [IN] config of init profiling of json format - * @return ProfErrorCode - */ -MSVP_PROF_API int32_t ProfInit(const std::string &profInitCfg); - -/** - * @name ProfAicoreMetrics - * @brief aicore metrics enum - */ -enum ProfAicoreMetrics { - PROF_AICORE_ARITHMATIC_THROUGHPUT = 0, - PROF_AICORE_PIPELINE = 1, - PROF_AICORE_SYNCHRONIZATION = 2, - PROF_AICORE_MEMORY = 3, - PROF_AICORE_INTERNAL_MEMORY = 4, - PROF_AICORE_STALL = 5, - PROF_AICORE_METRICS_COUNT, - PROF_AICORE_NONE = 0xff, -}; - -/** - * @name ProfConfig - * @brief struct of ProfStart - */ -struct ProfConfig { - uint32_t devNums; // length of device id list - uint32_t devIdList[MSVP_MAX_DEV_NUM]; // physical device id list - ProfAicoreMetrics aicoreMetrics; // aicore metric - uint64_t dataTypeConfig; // data type to start profiling -}; - -/** - * @name ProfStartProfiling - * @brief start profiling - * @param profStartCfg [IN] config to start profiling - * @return ProfErrorCode - */ -MSVP_PROF_API int32_t ProfStartProfiling(const ProfConfig *profStartCfg); - -/** - * @name ProfStopProfiling - * @brief stop profiling - * @param profStopCfg [IN] config to stop profiling - * @return ProfErrorCode - */ -MSVP_PROF_API int32_t ProfStopProfiling(const ProfConfig *profStopCfg); - -/** - * @name ProfFinalize - * @brief finalize profiling task - * @return ProfErrorCode - */ -MSVP_PROF_API int32_t ProfFinalize(); - -/** - * @name ProfGetDataTypeConfig - * @brief get dataTypeConfig started with of one device - * @param deviceId [IN] deviceId to get dataTypeConfig - * @param dataTypeConfig [OUT] result get - * @return ProfErrorCode - */ -MSVP_PROF_API int32_t ProfGetDataTypeConfig(uint32_t deviceId, uint64_t &dataTypeConfig); namespace Msprofiler { namespace Api { -/** - * @brief transfer profiling config in acl.json to sample config - * @param aclCfg [IN] profiling json string from acl.json as {"switch":"on", "result_path":"/home",...} - * @param sampleCfg [OUT] json string for 
GE as {"startCfg":[{"deviceID":"all","jobID":"1234",...}]} - * @return ProfErrorCode - */ -MSVP_PROF_API int32_t ProfAclCfgToSampleCfg(const std::string &aclCfg, std::string &sampleCfg); - -/** - * @name ProfInit - * @brief init profiling - * @param profInitCfg [IN] config of init profiling of json format - * @return ProfErrorCode - */ -MSVP_PROF_API int32_t ProfInit(const std::string &profInitCfg); - -/** - * @name ProfStartProfiling - * @brief start profiling - * @param profStartCfg [IN] config to start profiling - * @return ProfErrorCode - */ -MSVP_PROF_API int32_t ProfStartProfiling(const ProfConfig *profStartCfg); - -/** - * @name ProfStopProfiling - * @brief stop profiling - * @param profStopCfg [IN] config to stop profiling - * @return ProfErrorCode - */ -MSVP_PROF_API int32_t ProfStopProfiling(const ProfConfig *profStopCfg); - -/** - * @name ProfFinalize - * @brief finalize profiling task - * @return ProfErrorCode - */ -MSVP_PROF_API int32_t ProfFinalize(); - -/** - * @name ProfGetDataTypeConfig - * @brief get dataTypeConfig started with of one device - * @param deviceId [IN] deviceId to get dataTypeConfig - * @param dataTypeConfig [OUT] result get - * @return ProfErrorCode - */ -MSVP_PROF_API int32_t ProfGetDataTypeConfig(uint32_t deviceId, uint64_t &dataTypeConfig); - -/** - * @name WorkMode - * @brief profiling api work mode - */ -enum WorkMode { - WORK_MODE_OFF, // profiling not at work - WORK_MODE_API_CTRL, // profiling work on api ctrl mode, (ProfInit) - WORK_MODE_SUBSCRIBE, // profiling work on subscribe mode -}; - -/** - * @name ProfGetApiWorkMode - * @brief get profiling api work mode - * @return WorkMode - */ -MSVP_PROF_API WorkMode ProfGetApiWorkMode(); - -/** - * @name ProfSubscribeConfig - * @brief config of subscribe api - */ -struct ProfSubscribeConfig { - bool timeInfo; // subscribe op time - ProfAicoreMetrics aicoreMetrics; // subscribe ai core metrics - void* fd; // pipe fd -}; - -/** - * @name ProfGetDataTypeConfig - * @brief get 
DataTypeConfig of subscribe - * @param profSubscribeConfig [IN] config to subscribe data - * @return DataTypeConfig - */ -MSVP_PROF_API uint64_t ProfGetDataTypeConfig(const ProfSubscribeConfig *profSubscribeConfig); - -/** - * @name ProfModelSubscribe - * @brief subscribe data of one model id - * @param modelId [IN] model id to subscribe data - * @param devId [IN] device id of model - * @param profSubscribeConfig [IN] config to subscribe data - * @return ProfErrorCode - */ -MSVP_PROF_API int32_t ProfModelSubscribe(uint32_t modelId, uint32_t devId, - const ProfSubscribeConfig *profSubscribeConfig); - -/** - * @name ProfIsModelSubscribed - * @brief check if a model id is subscribed - * @param modeiId [IN] modei id to check - * @return true: subscribed, false: not - */ -MSVP_PROF_API bool ProfIsModelSubscribed(uint32_t modelId); - -/** - * @name ProfModelUnSubscribe - * @brief unsubscribe a model id - * @param modeiId [IN] modei id to unsubscribe - * @return ProfErrorCode - */ -MSVP_PROF_API int32_t ProfModelUnSubscribe(uint32_t modelId); - -/** - * @name ProfGetOpDescSize - * @brief get profiling data struct size - * @param opDescSize [OUT] bytes of profiling subscribe data struct - * @return ProfErrorCode - */ -MSVP_PROF_API int32_t ProfGetOpDescSize(uint32_t *opDescSize); - -/** - * @name ProfGetOpNum - * @brief get how many op data there are in data - * @param data [IN] data read from pipe - * @param len [IN] data length - * @param opNum [OUT] number of op in data - * @return ProfErrorCode - */ -MSVP_PROF_API int32_t ProfGetOpNum(const void *data, uint32_t len, uint32_t *opNum); - -/** - * @name ProfGetModelId - * @brief get model id of specific part of data - * @param data [IN] data read from pipe - * @param len [IN] data length - * @param index [IN] index of part(op) - * @return model id - */ -MSVP_PROF_API uint32_t ProfGetModelId(const void *data, uint32_t len, uint32_t index); - -/** - * @name ProfGetOpType - * @brief get op type of specific part of data - * 
@param data [IN] data read from pipe - * @param len [IN] data length - * @param opType [OUT] op type buffer - * @param opTypeLen [IN] buffer size of param opType - * @param index [IN] index of part(op) - * @return ProfErrorCode - */ -MSVP_PROF_API int32_t ProfGetOpType(const void *data, uint32_t len, char *opType, uint32_t opTypeLen, uint32_t index); - -/** - * @name ProfGetOpName - * @brief get op name of specific part of data - * @param data [IN] data read from pipe - * @param len [IN] data length - * @param opType [OUT] op name buffer - * @param opTypeLen [IN] buffer size of param opName - * @param index [IN] index of part(op) - * @return ProfErrorCode - */ -MSVP_PROF_API int32_t ProfGetOpName(const void *data, uint32_t len, char *opName, uint32_t opNameLen, uint32_t index); - -/** - * @name ProfGetOpStart - * @brief get op start timestamp of specific part of data - * @param data [IN] data read from pipe - * @param len [IN] data length - * @param index [IN] index of part(op) - * @return op start timestamp (us) - */ -MSVP_PROF_API uint64_t ProfGetOpStart(const void *data, uint32_t len, uint32_t index); - -/** - * @name ProfGetOpEnd - * @brief get op end timestamp of specific part of data - * @param data [IN] data read from pipe - * @param len [IN] data length - * @param index [IN] index of part(op) - * @return op end timestamp (us) - */ -MSVP_PROF_API uint64_t ProfGetOpEnd(const void *data, uint32_t len, uint32_t index); - -/** - * @name ProfGetOpDuration - * @brief get op duration of specific part of data - * @param data [IN] data read from pipe - * @param len [IN] data length - * @param index [IN] index of part(op) - * @return op duration (us) - */ -MSVP_PROF_API uint64_t ProfGetOpDuration(const void *data, uint32_t len, uint32_t index); - /** * @name ProfGetOpExecutionTime * @brief get op execution time of specific part of data - * @param data [IN] data read from pipe - * @param len [IN] data length + * @param data [IN] data read from pipe + * @param len [IN] 
data length * @param index [IN] index of part(op) * @return op execution time (us) */ -MSVP_PROF_API uint64_t ProfGetOpExecutionTime(const void *data, uint32_t len, uint32_t index); - -/** - * @name ProfGetOpCubeOps - * @brief get op cube fops of specific part of data - * @param data [IN] data read from pipe - * @param len [IN] data length - * @param index [IN] index of part(op) - * @return op cube fops - */ -MSVP_PROF_API uint64_t ProfGetOpCubeOps(const void *data, uint32_t len, uint32_t index); - -/** - * @name ProfGetOpVectorOps - * @brief get op vector fops of specific part of data - * @param data [IN] data read from pipe - * @param len [IN] data length - * @param index [IN] index of part(op) - * @return op vector fops - */ -MSVP_PROF_API uint64_t ProfGetOpVectorOps(const void *data, uint32_t len, uint32_t index); - -} // namespace Api -} // namespace Msprofiler +uint64_t ProfGetOpExecutionTime(const void *data, uint32_t len, uint32_t index); +} +} #endif // MSPROFILER_API_PROF_ACL_API_H_ diff --git a/third_party/fwkacllib/inc/toolchain/prof_callback.h b/third_party/fwkacllib/inc/toolchain/prof_callback.h new file mode 100644 index 00000000..1299ae59 --- /dev/null +++ b/third_party/fwkacllib/inc/toolchain/prof_callback.h @@ -0,0 +1,132 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MSPROFILER_PROF_CALLBACK_H_ +#define MSPROFILER_PROF_CALLBACK_H_ + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + + +#include "stddef.h" +#include "stdint.h" + +/** + * @name MsprofErrorCode + * @brief error code + */ +enum MsprofErrorCode { + MSPROF_ERROR_NONE = 0, + MSPROF_ERROR_MEM_NOT_ENOUGH, + MSPROF_ERROR_GET_ENV, + MSPROF_ERROR_CONFIG_INVALID, + MSPROF_ERROR_ACL_JSON_OFF, + MSPROF_ERROR, +}; + +#define MSPROF_ENGINE_MAX_TAG_LEN (31) + +/** + * @name ReporterData + * @brief struct of data to report + */ +struct ReporterData { + char tag[MSPROF_ENGINE_MAX_TAG_LEN + 1]; // the sub-type of the module, data with different tag will be writen + int deviceId; // the index of device + size_t dataLen; // the length of send data + unsigned char *data; // the data content +}; + +/** + * @name MsprofReporterModuleId + * @brief module id of data to report + */ +enum MsprofReporterModuleId { + MSPROF_MODULE_DATA_PREPROCESS = 0, // DATA_PREPROCESS + MSPROF_MODULE_HCCL, // HCCL + MSPROF_MODULE_ACL, // AclModule + MSPROF_MODULE_FRAMEWORK, // Framework + MSPROF_MODULE_RUNTIME // runtime +}; + +/** + * @name MsprofReporterCallbackType + * @brief reporter callback request type + */ +enum MsprofReporterCallbackType { + MSPROF_REPORTER_REPORT = 0, // report data + MSPROF_REPORTER_INIT, // init reporter + MSPROF_REPORTER_UNINIT, // uninit reporter +}; + +/** + * @name MsprofReporterCallback + * @brief callback to start reporter/stop reporter/report date + * @param moduleId [IN] enum MsprofReporterModuleId + * @param type [IN] enum MsprofReporterCallbackType + * @param data [IN] callback data (nullptr on INTI/UNINIT) + * @param len [IN] callback data size (0 on INIT/UNINIT) + * @return enum MsprofErrorCode + */ +typedef int32_t (*MsprofReporterCallback)(uint32_t moduleId, uint32_t type, void *data, uint32_t len); + + +#define MSPROF_OPTIONS_DEF_LEN_MAX (2048) + +/** + * @name MsprofGeOptions + * @brief struct of MSPROF_CTRL_INIT_GE_OPTIONS + */ +struct 
MsprofGeOptions { + char jobId[MSPROF_OPTIONS_DEF_LEN_MAX]; + char options[MSPROF_OPTIONS_DEF_LEN_MAX]; +}; + +/** + * @name MsprofCtrlCallbackType + * @brief ctrl callback request type + */ +enum MsprofCtrlCallbackType { + MSPROF_CTRL_INIT_ACL_ENV = 0, // start profiling with acl env + MSPROF_CTRL_INIT_ACL_JSON, // start profiling with acl.json + MSPROF_CTRL_INIT_GE_OPTIONS, // start profiling with ge env and options + MSPROF_CTRL_FINALIZE // stop profiling +}; + +/** + * @name MsprofCtrlCallback + * @brief callback to start/stop profiling + * @param type [IN] enum MsprofCtrlCallbackType + * @param data [IN] callback data + * @param len [IN] callback data size + * @return enum MsprofErrorCode + */ +typedef int32_t (*MsprofCtrlCallback)(uint32_t type, void *data, uint32_t len); + +/** + * @name MsprofSetDeviceCallback + * @brief callback to notify set/reset device + * @param devId [IN] device id + * @param isOpenDevice [IN] true: set device, false: reset device + */ +typedef void (*MsprofSetDeviceCallback)(uint32_t devId, bool isOpenDevice); + +#ifdef __cplusplus +} +#endif + +#endif // MSPROFILER_PROF_CALLBACK_H_ diff --git a/third_party/fwkacllib/inc/toolchain/prof_reporter.h b/third_party/fwkacllib/inc/toolchain/prof_reporter.h index 949011d3..ff91351b 100644 --- a/third_party/fwkacllib/inc/toolchain/prof_reporter.h +++ b/third_party/fwkacllib/inc/toolchain/prof_reporter.h @@ -26,6 +26,8 @@ #define MSVP_PROF_API __attribute__((visibility("default"))) #endif +#include "prof_callback.h" + /** * @file prof_reporter.h * @defgroup reporter the reporter group @@ -33,20 +35,6 @@ */ namespace Msprof { namespace Engine { -/// the max tag length -#define MSPROF_ENGINE_MAX_TAG_LEN (31) -/** - * @ingroup reporter - * @brief struct ReporterData - * the sturct of the data send to libmsprof - */ -struct ReporterData { - char tag[MSPROF_ENGINE_MAX_TAG_LEN + 1]; ///< the sub-type of the module, data with different tag will be writen - int deviceId; ///< the physical id of device 
- size_t dataLen; ///< the length of send data - unsigned char *data; ///< the data content -}; - /** * @ingroup reporter * @brief class Reporter